diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py index 0412c5f37952d..a378bc6baa5a5 100644 --- a/.buildkite/check-wheel-size.py +++ b/.buildkite/check-wheel-size.py @@ -1,9 +1,14 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import sys import zipfile -# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB -VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250)) +# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB +# Note that we have 400 MiB quota, please use it wisely. +# See https://github.com/pypi/support/issues/3792 . +# Please also sync the value with the one in Dockerfile. +VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400)) def print_top_10_largest_files(zip_file): diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py index 8350e2705141e..36e1b6c01326a 100644 --- a/.buildkite/generate_index.py +++ b/.buildkite/generate_index.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import os diff --git a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml index 3ea0b7bb5cd66..4ef8b5c3709b3 100644 --- a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml +++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml @@ -4,8 +4,8 @@ tasks: - name: "gsm8k" metrics: - name: "exact_match,strict-match" - value: 0.233 + value: 0.231 - name: "exact_match,flexible-extract" - value: 0.236 + value: 0.22 limit: 1000 num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml b/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml new file mode 100644 index 0000000000000..2928d75ce4469 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml @@ -0,0 +1,11 @@ +# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2 +model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.6353 + - name: "exact_match,flexible-extract" + value: 0.637 +limit: null +num_fewshot: null diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index afc935c1a9318..4ae23eff62f37 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ LM eval harness on model to compare vs HF baseline computed offline. Configs are found in configs/$MODEL.yaml @@ -12,6 +13,7 @@ from pathlib import Path import lm_eval import numpy +import pytest import yaml RTOL = 0.05 @@ -45,6 +47,10 @@ def test_lm_eval_correctness(): eval_config = yaml.safe_load( Path(TEST_DATA_FILE).read_text(encoding="utf-8")) + if eval_config[ + "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform": #noqa: E501 + pytest.skip("FBGEMM is currently failing on main.") + # Launch eval requests. 
results = launch_lm_eval(eval_config) diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index fbf41eb10a392..d3f5fc5cd4cee 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -1,15 +1,13 @@ # vLLM benchmark suite - ## Introduction This directory contains two sets of benchmark for vllm. + - Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance - Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm. - -See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results. - +See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results. ## Performance benchmark quick overview @@ -19,17 +17,14 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performan **For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run. - ## Nightly benchmark quick overview -**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B. +**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B. **Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy. **Benchmarking Duration**: about 3.5hrs. - - ## Trigger the benchmark Performance benchmark will be triggered when: @@ -39,16 +34,11 @@ Performance benchmark will be triggered when: Nightly benchmark will be triggered when: - Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label. - - - ## Performance benchmark details - See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. - -#### Latency test +### Latency test Here is an example of one test inside `latency-tests.json`: @@ -68,23 +58,25 @@ Here is an example of one test inside `latency-tests.json`: ``` In this example: -- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`. -- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15` + +- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`. +- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. 
Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15` Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly. WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file. +### Throughput test -#### Throughput test The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`. The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot. -#### Serving test +### Serving test + We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example: -``` +```json [ { "test_name": "serving_llama8B_tp1_sharegpt", @@ -109,6 +101,7 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t ``` Inside this example: + - The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`. - The `server-parameters` includes the command line arguments for vLLM server. - The `client-parameters` includes the command line arguments for `benchmark_serving.py`. @@ -118,36 +111,33 @@ The number of this test is less stable compared to the delay and latency benchma WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`. -#### Visualizing the results +### Visualizing the results + The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results. You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. If you do not see the table, please wait till the benchmark finish running. The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file. The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking. - - ## Nightly test details See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines. +### Workflow -#### Workflow - -- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines. +- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines. - Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container. - The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark. 
- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite. -#### Nightly tests +### Nightly tests In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark. -#### Docker containers +### Docker containers The docker containers for benchmarking are specified in `nightly-pipeline.yaml`. WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`. WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git). - diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml index 679abf1814aa5..4259514940d3f 100644 --- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -10,12 +10,18 @@ steps: - image: badouralix/curl-jq command: - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh - + - label: "Cleanup H100" + agents: + queue: H100 + depends_on: ~ + command: docker system prune -a --volumes --force + - label: "A100" # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" agents: queue: A100 depends_on: wait-for-container-image + if: build.branch == "main" plugins: - kubernetes: podSpec: @@ -50,6 +56,7 @@ steps: agents: queue: H200 depends_on: wait-for-container-image + if: build.branch == "main" plugins: - docker#v5.12.0: image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT @@ -75,6 +82,7 @@ steps: agents: queue: H100 depends_on: wait-for-container-image + if: build.branch == "main" plugins: - docker#v5.12.0: image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT @@ -90,3 +98,87 @@ steps: environment: - VLLM_USAGE_SOURCE - HF_TOKEN + + # Premerge benchmark + - label: "A100" + # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" + agents: + queue: A100 + depends_on: wait-for-container-image + if: build.branch != "main" + plugins: + - kubernetes: + podSpec: + priorityClassName: perf-benchmark + containers: + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + command: + - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + + - label: "H200" + # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" + agents: + queue: H200 + depends_on: wait-for-container-image + if: build.branch != "main" + plugins: + - docker#v5.12.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + command: + - bash + - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh + mount-buildkite-agent: true + propagate-environment: true + ipc: host + gpus: 4,5,6,7 + volumes: + - 
/data/benchmark-hf-cache:/root/.cache/huggingface + environment: + - VLLM_USAGE_SOURCE + - HF_TOKEN + + #- block: "Run H100 Benchmark" + #key: block-h100 + #depends_on: ~ + + - label: "H100" + # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" + agents: + queue: H100 + depends_on: wait-for-container-image + if: build.branch != "main" + plugins: + - docker#v5.12.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + command: + - bash + - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh + mount-buildkite-agent: true + propagate-environment: true + ipc: host + gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used + volumes: + - /data/benchmark-hf-cache:/root/.cache/huggingface + environment: + - VLLM_USAGE_SOURCE + - HF_TOKEN diff --git a/.buildkite/nightly-benchmarks/nightly-annotation.md b/.buildkite/nightly-benchmarks/nightly-annotation.md index 1e33793842bf8..e43ea765f1556 100644 --- a/.buildkite/nightly-benchmarks/nightly-annotation.md +++ b/.buildkite/nightly-benchmarks/nightly-annotation.md @@ -9,20 +9,19 @@ This file contains the downloading link for benchmarking results. Please download the visualization scripts in the post - ## Results reproduction - Find the docker we use in `benchmarking pipeline` - Deploy the docker, and inside the docker: - - Download `nightly-benchmarks.zip`. - - In the same folder, run the following code -``` -export HF_TOKEN= -apt update -apt install -y git -unzip nightly-benchmarks.zip -VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh -``` + - Download `nightly-benchmarks.zip`. + - In the same folder, run the following code: + + ```console + export HF_TOKEN= + apt update + apt install -y git + unzip nightly-benchmarks.zip + VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh + ``` And the results will be inside `./benchmarks/results`. - diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index 7dec7a0fe0b4e..5f003f42f07c0 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -2,6 +2,7 @@ # Nightly benchmark This benchmark aims to: + - Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload. - Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions. @@ -9,7 +10,6 @@ Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html) Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176) - ## Setup - Docker images: @@ -33,7 +33,7 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/ - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed. - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better). -# Known issues +## Known issues - TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105). -- TGI does not support `ignore-eos` flag. \ No newline at end of file +- TGI does not support `ignore-eos` flag. 
diff --git a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md index da32d1f073cea..cacaef986c658 100644 --- a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md +++ b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md @@ -7,10 +7,8 @@ - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. - Evaluation metrics: end-to-end latency (mean, median, p99). - {latency_tests_markdown_table} - ## Throughput tests - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). @@ -19,10 +17,8 @@ - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. - Evaluation metrics: throughput. - {throughput_tests_markdown_table} - ## Serving tests - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). @@ -33,13 +29,11 @@ - We also added a speculative decoding test for llama-3 70B, under QPS 2 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99). - {serving_tests_markdown_table} - ## json version of the benchmarking tables -This section contains the data of the markdown tables above in JSON format. +This section contains the data of the markdown tables above in JSON format. You can load the benchmarking tables into pandas dataframes as follows: ```python @@ -54,9 +48,9 @@ serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"]) ``` The json string for all benchmarking tables: + ```json {benchmarking_results_in_json_string} ``` You can also check the raw experiment data in the Artifact tab of the Buildkite page. - diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 9d3646e2f6a15..1030ec24e8d7f 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import os from pathlib import Path @@ -82,8 +84,13 @@ if __name__ == "__main__": # this result is generated via `benchmark_serving.py` # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands")) as f: - command = json.loads(f.read()) + try: + with open(test_file.with_suffix(".commands")) as f: + command = json.loads(f.read()) + except OSError as e: + print(e) + continue + raw_result.update(command) # update the test name of this result @@ -97,8 +104,13 @@ if __name__ == "__main__": # this result is generated via `benchmark_latency.py` # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands")) as f: - command = json.loads(f.read()) + try: + with open(test_file.with_suffix(".commands")) as f: + command = json.loads(f.read()) + except OSError as e: + print(e) + continue + raw_result.update(command) # update the test name of this result @@ -119,8 +131,13 @@ if __name__ == "__main__": # this result is generated via `benchmark_throughput.py` # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands")) as f: - command = json.loads(f.read()) + try: + with open(test_file.with_suffix(".commands")) as f: + command = json.loads(f.read()) + except OSError as e: + print(e) + continue + raw_result.update(command) # update the test name of this result diff --git 
a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py index 68ac5909e5951..5e17b79d26a1b 100644 --- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py +++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse from transformers import AutoTokenizer diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py index 052060c576300..0ff95a0911b16 100644 --- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import json from pathlib import Path diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py index 18bcc3a8714c4..e5f179a0f5b68 100644 --- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py +++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from lmdeploy.serve.openai.api_client import APIClient api_client = APIClient("http://localhost:8000") diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh index 32bd34c431c89..4d01a314adc47 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -426,7 +426,7 @@ main() { pip install -U transformers - pip install -r requirements-dev.txt + pip install -r requirements/dev.txt which genai-perf # check storage diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index 0d16a83781ab2..4cd449b141ece 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -309,11 +309,14 @@ run_serving_tests() { new_test_name=$test_name"_qps_"$qps + # pass the tensor parallel size to the client so that it can be displayed + # on the benchmark dashboard client_command="python3 benchmark_serving.py \ --save-result \ --result-dir $RESULTS_FOLDER \ --result-filename ${new_test_name}.json \ --request-rate $qps \ + --metadata "tensor_parallel_size=$tp" \ $client_args" echo "Running test case $test_name with qps $qps" @@ -345,6 +348,11 @@ main() { check_gpus check_hf_token + # Set to v1 to run v1 benchmark + if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then + export VLLM_USE_V1=1 + fi + # dependencies (which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which jq) || (apt-get update && apt-get -y install jq) @@ -353,7 +361,7 @@ main() { # get the current IP address, required by benchmark_serving.py export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') # turn of the reporting of the status of each request, to clean up the terminal output - export VLLM_LOG_LEVEL="WARNING" + export VLLM_LOGGING_LEVEL="WARNING" # prepare for benchmarking cd benchmarks || exit 1 diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index 92d6fad73a94c..62ee5e10b5095 100644 --- 
a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import datetime import json import os diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh index aa0f7ade808e0..50e1ab0242202 100644 --- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh +++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh @@ -1,6 +1,10 @@ #!/bin/sh TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token) -URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT" +if [[ "$BUILDKITE_BRANCH" == "main" ]]; then + URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT" +else + URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT" +fi TIMEOUT_SECONDS=10 diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests.json b/.buildkite/nightly-benchmarks/tests/latency-tests.json index 1841186da158f..7762a239f96ab 100644 --- a/.buildkite/nightly-benchmarks/tests/latency-tests.json +++ b/.buildkite/nightly-benchmarks/tests/latency-tests.json @@ -29,4 +29,4 @@ "num-iters": 15 } } -] \ No newline at end of file +] diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests.json b/.buildkite/nightly-benchmarks/tests/serving-tests.json index facb0eac749ca..415171e268b08 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json @@ -66,8 +66,7 @@ "swap_space": 16, "speculative_model": "turboderp/Qwama-0.5B-Instruct", "num_speculative_tokens": 4, - "speculative_draft_tensor_parallel_size": 1, - "use_v2_block_manager": "" + "speculative_draft_tensor_parallel_size": 1 }, "client_parameters": { "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests.json b/.buildkite/nightly-benchmarks/tests/throughput-tests.json index 91ef6d16be638..9bc87cbcd2bc5 100644 --- a/.buildkite/nightly-benchmarks/tests/throughput-tests.json +++ b/.buildkite/nightly-benchmarks/tests/throughput-tests.json @@ -32,4 +32,4 @@ "backend": "vllm" } } -] \ No newline at end of file +] diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 51618a2955fb1..18f582b6e4c94 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,4 +1,15 @@ steps: + - label: "Build wheel - CUDA 12.4" + agents: + queue: cpu_queue_postmerge + commands: + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ." 
+ - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + - "bash .buildkite/upload-wheels.sh" + env: + DOCKER_BUILDKIT: "1" + - label: "Build wheel - CUDA 12.1" agents: queue: cpu_queue_postmerge @@ -37,7 +48,7 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ." - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - label: "Build and publish TPU release image" @@ -56,6 +67,11 @@ steps: env: DOCKER_BUILDKIT: "1" + - input: "Provide Release version here" + fields: + - text: "What is the release version?" + key: "release-version" + - block: "Build CPU release image" key: block-cpu-release-image-build depends_on: ~ @@ -66,7 +82,7 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION --progress plain -f Dockerfile.cpu ." - - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION" + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ." 
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" env: DOCKER_BUILDKIT: "1" diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 3515ccd65667e..0680bae13ddbf 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -77,7 +77,6 @@ echo "Commands:$commands" #ignore certain kernels tests if [[ $commands == *" kernels "* ]]; then commands="${commands} \ - --ignore=kernels/test_attention.py \ --ignore=kernels/test_attention_selector.py \ --ignore=kernels/test_blocksparse_attention.py \ --ignore=kernels/test_causal_conv1d.py \ @@ -92,19 +91,40 @@ if [[ $commands == *" kernels "* ]]; then --ignore=kernels/test_moe.py \ --ignore=kernels/test_prefix_prefill.py \ --ignore=kernels/test_rand.py \ - --ignore=kernels/test_sampler.py" + --ignore=kernels/test_sampler.py \ + --ignore=kernels/test_cascade_flash_attn.py \ + --ignore=kernels/test_mamba_mixer2.py \ + --ignore=kernels/test_aqlm.py \ + --ignore=kernels/test_machete_mm.py \ + --ignore=kernels/test_mha_attn.py \ + --ignore=kernels/test_block_fp8.py \ + --ignore=kernels/test_permute_cols.py" fi -#ignore certain Entrypoints tests +#ignore certain Entrypoints/openai tests if [[ $commands == *" entrypoints/openai "* ]]; then commands=${commands//" entrypoints/openai "/" entrypoints/openai \ - --ignore=entrypoints/openai/test_accuracy.py \ --ignore=entrypoints/openai/test_audio.py \ - --ignore=entrypoints/openai/test_encoder_decoder.py \ - --ignore=entrypoints/openai/test_embedding.py \ - --ignore=entrypoints/openai/test_oot_registration.py "} + --ignore=entrypoints/openai/test_chat.py \ + --ignore=entrypoints/openai/test_shutdown.py \ + --ignore=entrypoints/openai/test_completion.py \ + --ignore=entrypoints/openai/test_sleep.py \ + --ignore=entrypoints/openai/test_models.py \ + --ignore=entrypoints/openai/test_prompt_validation.py "} fi +#ignore certain Entrypoints/llm tests +if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then + commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "} +fi + +# --ignore=entrypoints/openai/test_encoder_decoder.py \ +# --ignore=entrypoints/openai/test_embedding.py \ +# --ignore=entrypoints/openai/test_oot_registration.py +# --ignore=entrypoints/openai/test_accuracy.py \ +# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13 + + PARALLEL_JOB_COUNT=8 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. if [[ $commands == *"--shard-id="* ]]; then @@ -121,6 +141,8 @@ if [[ $commands == *"--shard-id="* ]]; then --rm \ -e HIP_VISIBLE_DEVICES="${GPU}" \ -e HF_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ -v "${HF_CACHE}:${HF_MOUNT}" \ -e "HF_HOME=${HF_MOUNT}" \ --name "${container_name}_${GPU}" \ @@ -148,6 +170,8 @@ else --rm \ -e HIP_VISIBLE_DEVICES=0 \ -e HF_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ -v "${HF_CACHE}:${HF_MOUNT}" \ -e "HF_HOME=${HF_MOUNT}" \ --name "${container_name}" \ diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index e19ace782feb5..05744bb5225b8 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -19,23 +19,27 @@ remove_docker_container # Run the image, setting --shm-size=4g for tensor parallel. 
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER" + --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER" docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 + --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 function cpu_tests() { set -e export NUMA_NODE=$2 + export BUILDKITE_BUILD_NUMBER=$3 # offline inference docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c " set -e - python3 examples/offline_inference/basic.py" + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" # Run basic model test docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e - pip install -r vllm/requirements-test.txt + pip install -r vllm/requirements/test.txt + pip install -r vllm/requirements/cpu.txt + pytest -v -s tests/kernels/test_cache.py -m cpu_model + pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model pytest -v -s tests/models/decoder_only/language -m cpu_model pytest -v -s tests/models/embedding/language -m cpu_model pytest -v -s tests/models/encoder_decoder/language -m cpu_model @@ -85,4 +89,4 @@ function cpu_tests() { # All of CPU tests are expected to be finished less than 40 mins. export -f cpu_tests -timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" +timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER" diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh index 3e4e409466b8a..5c004b47778fb 100644 --- a/.buildkite/run-gh200-test.sh +++ b/.buildkite/run-gh200-test.sh @@ -14,6 +14,7 @@ DOCKER_BUILDKIT=1 docker build . 
\ -t gh200-test \ --build-arg max_jobs=66 \ --build-arg nvcc_threads=2 \ + --build-arg RUN_WHEEL_CHECK=false \ --build-arg torch_cuda_arch_list="9.0+PTX" \ --build-arg vllm_fa_cmake_gpu_arches="90-real" @@ -23,6 +24,6 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and test offline inference -docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' - python3 examples/offline_inference/basic.py +docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' + python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B ' diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh index 1edcb1d2669e9..f83eb927aae4e 100644 --- a/.buildkite/run-hpu-test.sh +++ b/.buildkite/run-hpu-test.sh @@ -20,5 +20,5 @@ trap remove_docker_container_and_exit EXIT remove_docker_container # Run the image and launch offline inference -docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py +docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m EXITCODE=$? diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index 0590dad4f311f..ad5ae6f415748 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/run-neuron-test.sh @@ -29,9 +29,6 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then docker image prune -f # Remove unused volumes / force the system prune for old images as well. docker volume prune -f && docker system prune -f - # Remove huggingface model artifacts and compiler cache - rm -rf "${HF_MOUNT:?}/*" - rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*" echo "$current_time" > /tmp/neuron-docker-build-timestamp fi else @@ -47,11 +44,11 @@ remove_docker_container() { trap remove_docker_container EXIT # Run the image -docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \ +docker run --rm -it --device=/dev/neuron0 --network bridge \ -v "${HF_CACHE}:${HF_MOUNT}" \ -e "HF_HOME=${HF_MOUNT}" \ -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ --name "${container_name}" \ ${image_name} \ - /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py" + /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys" diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh deleted file mode 100755 index 6159b21ff8206..0000000000000 --- a/.buildkite/run-openvino-test.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -# This script build the OpenVINO docker image and run the offline inference inside the container. -# It serves a sanity check for compilation and basic model usage. -set -ex - -# Try building the docker image -docker build -t openvino-test -f Dockerfile.openvino . 
- -# Setup cleanup -remove_docker_container() { docker rm -f openvino-test || true; } -trap remove_docker_container EXIT -remove_docker_container - -# Run the image and launch offline inference -docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh deleted file mode 100644 index 650af0fac4c61..0000000000000 --- a/.buildkite/run-tpu-test.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash - -set -e - -# Build the docker image. -docker build -f Dockerfile.tpu -t vllm-tpu . - -# Set up cleanup. -remove_docker_container() { docker rm -f tpu-test || true; } -trap remove_docker_container EXIT -# Remove the container that might not be cleaned up in the previous run. -remove_docker_container - -# For HF_TOKEN. -source /etc/environment -# Run a simple end-to-end example. -docker run --privileged --net host --shm-size=16G -it \ - -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ - vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \ - && python3 -m pip install pytest \ - && python3 -m pip install lm_eval[api]==0.4.4 \ - && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \ - && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ - && python3 /workspace/vllm/tests/tpu/test_compilation.py \ - && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \ - && python3 /workspace/vllm/examples/offline_inference/tpu.py" diff --git a/.buildkite/run-tpu-v1-test.sh b/.buildkite/run-tpu-v1-test.sh new file mode 100755 index 0000000000000..d557feefba7aa --- /dev/null +++ b/.buildkite/run-tpu-v1-test.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +set -e + +# Build the docker image. +docker build -f Dockerfile.tpu -t vllm-tpu . + +# Set up cleanup. +remove_docker_container() { docker rm -f tpu-test || true; } +trap remove_docker_container EXIT +# Remove the container that might not be cleaned up in the previous run. +remove_docker_container + +# For HF_TOKEN. +source /etc/environment +# Run a simple end-to-end example. +docker run --privileged --net host --shm-size=16G -it \ + -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ + vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \ + && python3 -m pip install pytest \ + && python3 -m pip install lm_eval[api]==0.4.4 \ + && export VLLM_USE_V1=1 \ + && export VLLM_XLA_CHECK_RECOMPILATION=1 \ + && echo TEST_1 \ + && pytest /workspace/vllm/tests/tpu/test_compilation.py \ + && echo TEST_2 \ + && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \ + && echo TEST_3 \ + && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \ + && echo TEST_4 \ + && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \ + && echo TEST_5 \ + && python3 /workspace/vllm/examples/offline_inference/tpu.py \ + && echo TEST_6 \ + && pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py" \ + + +# TODO: This test fails because it uses RANDOM_SEED sampling +# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ + diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh index 4d344e58db8ac..3a0e6bdb2caaf 100644 --- a/.buildkite/run-xpu-test.sh +++ b/.buildkite/run-xpu-test.sh @@ -4,16 +4,28 @@ # It serves a sanity check for compilation and basic model usage. 
set -ex +image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}" +container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" + # Try building the docker image -docker build -t xpu-test -f Dockerfile.xpu . +docker build -t ${image_name} -f Dockerfile.xpu . # Setup cleanup -remove_docker_container() { docker rm -f xpu-test || true; } +remove_docker_container() { + docker rm -f "${container_name}" || true; + docker image rm -f "${image_name}" || true; + docker system prune -f || true; +} trap remove_docker_container EXIT -remove_docker_container # Run the image and test offline inference/tensor parallel -docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c ' - python3 examples/offline_inference/basic.py - python3 examples/offline_inference/cli.py -tp 2 +docker run \ + --device /dev/dri \ + -v /dev/dri/by-path:/dev/dri/by-path \ + --entrypoint="" \ + --name "${container_name}" \ + "${image_name}" \ + sh -c ' + VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m + VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2 ' diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index daec46760117d..217f869f1f3c5 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -2,7 +2,7 @@ # adding a new command to an existing step. See different options here for examples. # This script will be feed into Jinja template in `test-template-aws.j2` at -# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 +# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 # to generate the final pipeline yaml file. # Documentation @@ -15,7 +15,7 @@ # mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd] # gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100 # num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4. -# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host, +# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host, # in this case, commands must be specified. the first command runs on first host, the second # command runs on the second host. # working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests @@ -24,8 +24,8 @@ # When adding a test # - If the test belong to an existing group, add it there # - If the test is short, add to any existing step -# - If the test takes more than 10min, then it is okay to create a new step. -# Note that all steps execute in parallel. +# - If the test takes more than 10min, then it is okay to create a new step. +# Note that all steps execute in parallel. 
steps: ##### fast check tests ##### @@ -35,13 +35,12 @@ steps: fast_check: true no_gpu: True commands: - - pip install -r requirements-docs.txt + - pip install -r ../../requirements/docs.txt - SPHINXOPTS=\"-W\" make html # Check API reference (if it fails, you may have missing mock imports) - grep \"sig sig-object py\" build/html/api/inference_params.html - label: Async Engine, Inputs, Utils, Worker Test # 24min - fast_check: true source_file_dependencies: - vllm/ - tests/mq_llm_engine @@ -50,9 +49,9 @@ steps: - tests/multimodal - tests/test_utils - tests/worker - - tests/standalone_tests/lazy_torch_compile.py + - tests/standalone_tests/lazy_imports.py commands: - - python3 standalone_tests/lazy_torch_compile.py + - python3 standalone_tests/lazy_imports.py - pytest -v -s mq_llm_engine # MQLLMEngine - pytest -v -s async_engine # AsyncLLMEngine - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py @@ -78,6 +77,7 @@ steps: - tests/basic_correctness/test_preemption - tests/basic_correctness/test_cumem.py commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s basic_correctness/test_cumem.py - pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_cpu_offload.py @@ -107,45 +107,59 @@ steps: mirror_hardwares: [amd] source_file_dependencies: - vllm/ + - tests/entrypoints/llm + - tests/entrypoints/openai + - tests/entrypoints/test_chat_utils + - tests/entrypoints/offline_mode commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py + - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ - pytest -v -s entrypoints/test_chat_utils.py - - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - label: Distributed Tests (4 GPUs) # 10min working_dir: "/vllm-workspace/tests" num_gpus: 4 - fast_check: true source_file_dependencies: - vllm/distributed/ - vllm/core/ - - tests/distributed + - tests/distributed/test_utils + - tests/distributed/test_pynccl - tests/spec_decode/e2e/test_integration_dist_tp4 - - tests/compile + - tests/compile/test_basic_correctness - examples/offline_inference/rlhf.py + - examples/offline_inference/rlhf_colocate.py + - tests/examples/offline_inference/data_parallel.py commands: + # test with tp=2 and external_dp=2 + - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with internal dp + - python3 ../examples/offline_inference/data_parallel.py - pytest -v -s 
distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py # TODO: create a dedicated test section for multi-GPU example tests # when we have multiple distributed example tests - - python3 ../examples/offline_inference/rlhf.py + - pushd ../examples/offline_inference + - VLLM_ENABLE_V1_MULTIPROCESSING=0 python3 rlhf.py + - VLLM_ENABLE_V1_MULTIPROCESSING=0 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - popd - label: Metrics, Tracing Test # 10min - num_gpus: 2 - fast_check: true + num_gpus: 2 source_file_dependencies: - vllm/ - tests/metrics - tests/tracing commands: - - pytest -v -s metrics + - pytest -v -s metrics - "pip install \ 'opentelemetry-sdk>=1.26.0,<1.27.0' \ 'opentelemetry-api>=1.26.0,<1.27.0' \ @@ -172,6 +186,9 @@ steps: - vllm/ - tests/engine - tests/tokenization + - tests/test_sequence + - tests/test_config + - tests/test_logger commands: - pytest -v -s engine test_sequence.py test_config.py test_logger.py # OOM in the CI unless we run this separately @@ -183,7 +200,23 @@ steps: - vllm/ - tests/v1 commands: - - VLLM_USE_V1=1 pytest -v -s v1 + # split the test to avoid interference + - pytest -v -s v1/core + - pytest -v -s v1/entrypoints + - pytest -v -s v1/engine + - pytest -v -s v1/entrypoints + - pytest -v -s v1/sample + - pytest -v -s v1/worker + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_stats.py + - pytest -v -s v1/test_utils.py + - pytest -v -s v1/test_oracle.py + # TODO: accuracy does not match, whether setting + # VLLM_USE_FLASHINFER_SAMPLER or not on H100. + - pytest -v -s v1/e2e + # Integration test for streaming correctness (requires special branch). + - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - label: Examples Test # 25min working_dir: "/vllm-workspace/examples" @@ -193,19 +226,22 @@ steps: - examples/ commands: - pip install tensorizer # for tensorizer test - - python3 offline_inference/basic.py - - python3 offline_inference/cpu_offload.py - - python3 offline_inference/chat.py + - python3 offline_inference/basic/generate.py --model facebook/opt-125m + - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 offline_inference/basic/chat.py - python3 offline_inference/prefix_caching.py - python3 offline_inference/llm_engine_example.py - - python3 offline_inference/vision_language.py - - python3 offline_inference/vision_language_multi_image.py - - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_embedding.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/encoder_decoder.py - - python3 offline_inference/classification.py - - 
python3 offline_inference/embedding.py - - python3 offline_inference/scoring.py - - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + - python3 offline_inference/basic/classify.py + - python3 offline_inference/basic/embed.py + - python3 offline_inference/basic/score.py + - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - label: Prefix Caching Test # 9min mirror_hardwares: [amd] @@ -232,7 +268,7 @@ steps: - vllm/model_executor/guided_decoding - tests/test_logits_processor - tests/model_executor/test_guided_processors - commands: + commands: - pytest -v -s test_logits_processor.py - pytest -v -s model_executor/test_guided_processors.py @@ -243,7 +279,7 @@ steps: - vllm/model_executor/models/eagle.py commands: - pytest -v -s spec_decode/e2e/test_multistep_correctness.py - - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py + - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py - pytest -v -s spec_decode/e2e/test_eagle_correctness.py - label: LoRA Test %N # 15min each @@ -251,11 +287,10 @@ steps: source_file_dependencies: - vllm/lora - tests/lora - command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py + command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py parallelism: 4 -- label: "PyTorch Fullgraph Smoke Test" # 9min - fast_check: true +- label: PyTorch Fullgraph Smoke Test # 9min source_file_dependencies: - vllm/ - tests/compile @@ -264,8 +299,9 @@ steps: # these tests need to be separated, cannot combine - pytest -v -s compile/piecewise/test_simple.py - pytest -v -s compile/piecewise/test_toy_llama.py + - pytest -v -s compile/test_pass_manager.py -- label: "PyTorch Fullgraph Test" # 18min +- label: PyTorch Fullgraph Test # 18min source_file_dependencies: - vllm/ - tests/compile @@ -317,6 +353,14 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - bash ./run-tests.sh -c configs/models-small.txt -t 1 +- label: OpenAI API correctness + source_file_dependencies: + - csrc/ + - vllm/entrypoints/openai/ + - vllm/model_executor/models/whisper.py + commands: # LMEval+Transcription WER check + - pytest -s entrypoints/openai/correctness/ + - label: Encoder Decoder tests # 5min source_file_dependencies: - vllm/ @@ -340,8 +384,10 @@ steps: - vllm/ - tests/models commands: + - pytest -v -s models/test_transformers.py - pytest -v -s models/test_registry.py - - pytest -v -s models/test_initialization.py + # V1 Test: https://github.com/vllm-project/vllm/issues/14531 + - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py - label: Language Models Test (Standard) # 32min #mirror_hardwares: [amd] @@ -469,26 +515,25 @@ steps: - vllm/worker/model_runner.py - entrypoints/llm/test_collective_rpc.py commands: - - pytest -v -s entrypoints/llm/test_collective_rpc.py - - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py + - 
VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' # Avoid importing model tests that cause CUDA reinitialization error + - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)' - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' # this test fails consistently. # TODO: investigate and fix # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py + - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py + - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py - label: Plugin Tests (2 GPUs) # 40min working_dir: "/vllm-workspace/tests" num_gpus: 2 - fast_check: true source_file_dependencies: - vllm/plugins/ - tests/plugins/ @@ -499,6 +544,7 @@ steps: - pip uninstall vllm_add_dummy_platform -y # end platform plugin tests # other tests continue here: + - pytest -v -s plugins_tests/test_scheduler_plugins.py - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s distributed/test_distributed_oot.py - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process @@ -548,11 +594,12 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn # This test runs llama 13B, so it is required to run on 4 GPUs. - pytest -v -s -x lora/test_long_context.py - # There is some Tensor Parallelism related processing logic in LoRA that + # There is some Tensor Parallelism related processing logic in LoRA that # requires multi-GPU testing for validation. 
- pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_llama_tp.py - pytest -v -s -x lora/test_minicpmv_tp.py + - pytest -v -s -x lora/test_transfomers_model.py - label: Weight Loading Multiple GPU Test # 33min @@ -573,7 +620,7 @@ steps: - vllm/ - tests/weight_loading commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt ##### multi gpus test ##### @@ -585,7 +632,7 @@ steps: num_gpus: 4 source_file_dependencies: - vllm/ - commands: + commands: # NOTE: don't test llama model here, it seems hf implementation is buggy # see https://github.com/vllm-project/vllm/pull/5689 for details - pytest -v -s distributed/test_custom_all_reduce.py diff --git a/.buildkite/upload-wheels.sh b/.buildkite/upload-wheels.sh index 3c756659a715a..a681f89270600 100644 --- a/.buildkite/upload-wheels.sh +++ b/.buildkite/upload-wheels.sh @@ -50,8 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" if [[ $normal_wheel == *"cu118"* ]]; then # if $normal_wheel matches cu118, do not upload the index.html echo "Skipping index files for cu118 wheels" +elif [[ $normal_wheel == *"cu121"* ]]; then + # if $normal_wheel matches cu121, do not upload the index.html + echo "Skipping index files for cu121 wheels" else - # only upload index.html for cu12 wheels (default wheels) + # only upload index.html for cu124 wheels (default wheels) aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" fi @@ -63,8 +66,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/" if [[ $normal_wheel == *"cu118"* ]]; then # if $normal_wheel matches cu118, do not upload the index.html echo "Skipping index files for cu118 wheels" +elif [[ $normal_wheel == *"cu121"* ]]; then + # if $normal_wheel matches cu121, do not upload the index.html + echo "Skipping index files for cu121 wheels" else - # only upload index.html for cu12 wheels (default wheels) + # only upload index.html for cu124 wheels (default wheels) aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" fi diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index bc324d8b988b1..860c5c6cd5374 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -10,27 +10,32 @@ /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth -/vllm/model_executor/guided_decoding @mgoin +/vllm/model_executor/guided_decoding @mgoin @russellb /vllm/multimodal @DarkLight1337 @ywang96 CMakeLists.txt @tlrmchlsmth # vLLM V1 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat +/vllm/v1/structured_output @mgoin @russellb # Test ownership -/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo -/tests/test_inputs.py @DarkLight1337 @ywang96 -/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo -/tests/models @DarkLight1337 @ywang96 -/tests/multimodal @DarkLight1337 @ywang96 -/tests/prefix_caching @comaniac @KuntaiDu -/tests/spec_decode @njhill @LiuXiaoxuanPKU -/tests/kernels @tlrmchlsmth @WoosukKwon -/tests/quantization @mgoin @robertgshaw2-redhat /.buildkite/lm-eval-harness @mgoin @simon-mo +/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo 
+/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac /tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao +/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo +/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb +/tests/kernels @tlrmchlsmth @WoosukKwon +/tests/model_executor/test_guided_processors.py @mgoin @russellb +/tests/models @DarkLight1337 @ywang96 /tests/multi_step @alexm-redhat @comaniac +/tests/multimodal @DarkLight1337 @ywang96 +/tests/prefix_caching @comaniac @KuntaiDu +/tests/quantization @mgoin @robertgshaw2-redhat +/tests/spec_decode @njhill @LiuXiaoxuanPKU +/tests/test_inputs.py @DarkLight1337 @ywang96 +/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb +/tests/v1/structured_output @mgoin @russellb /tests/weight_loading @mgoin @youkaichao -/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac diff --git a/.github/ISSUE_TEMPLATE/400-bug-report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml index 30db1721a9df7..d4113da8b5b81 100644 --- a/.github/ISSUE_TEMPLATE/400-bug-report.yml +++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml @@ -30,15 +30,6 @@ body: validations: required: true -- type: textarea - attributes: - label: Model Input Dumps - description: | - If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process. - placeholder: | - Upload the dumped input file. - validations: - required: false - type: textarea attributes: label: 🐛 Describe the bug diff --git a/.github/ISSUE_TEMPLATE/800-misc-discussion.yml b/.github/ISSUE_TEMPLATE/800-misc-discussion.yml deleted file mode 100644 index 79e6e9080d51c..0000000000000 --- a/.github/ISSUE_TEMPLATE/800-misc-discussion.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: 🎲 Misc/random discussions that do not fit into the above categories. -description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues. -title: "[Misc]: " -labels: ["misc"] - -body: -- type: markdown - attributes: - value: > - #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). -- type: textarea - attributes: - label: Anything you want to discuss about vllm. - description: > - Anything you want to discuss about vllm. - validations: - required: true -- type: markdown - attributes: - value: > - Thanks for contributing 🎉! -- type: checkboxes - id: askllm - attributes: - label: Before submitting a new issue... - options: - - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. 
- required: true diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 3ba13e0cec6cb..fa40268d67727 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1 +1,5 @@ blank_issues_enabled: false +contact_links: + - name: Questions + url: https://discuss.vllm.ai + about: Ask questions and discuss with other vLLM community members diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 51a73c857ccb2..a20c5baf895c1 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -2,4 +2,5 @@ FILL IN THE PR DESCRIPTION HERE FIX #xxxx (*link existing issues this PR will resolve*) -**BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html ** + +**BEFORE SUBMITTING, PLEASE READ ** diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 683b70cd89989..a017d69be9910 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -23,7 +23,7 @@ updates: - dependency-name: "lm-format-enforcer" - dependency-name: "gguf" - dependency-name: "compressed-tensors" - - dependency-name: "ray[adag]" + - dependency-name: "ray[cgraph]" # Ray Compiled Graph - dependency-name: "lm-eval" groups: minor-update: diff --git a/.github/mergify.yml b/.github/mergify.yml index ca4bd7ee2b87f..54f56210b286a 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -5,6 +5,7 @@ pull_request_rules: - or: - files~=^[^/]+\.md$ - files~=^docs/ + - files~=^examples/ actions: label: add: @@ -35,6 +36,58 @@ pull_request_rules: add: - frontend +- name: label-multi-modality + description: Automatically apply multi-modality label + conditions: + - or: + - files~=^vllm/multimodal/ + - files~=^tests/multimodal/ + - files~=^tests/models/multimodal/ + - files~=^tests/models/*/audio_language/ + - files~=^tests/models/*/vision_language/ + - files=tests/models/test_vision.py + actions: + label: + add: + - multi-modality + +- name: label-structured-output + description: Automatically apply structured-output label + conditions: + - or: + - files~=^vllm/model_executor/guided_decoding/ + - files=tests/model_executor/test_guided_processors.py + - files=tests/entrypoints/llm/test_guided_generate.py + - files=benchmarks/benchmark_serving_guided.py + - files=benchmarks/benchmark_guided.py + actions: + label: + add: + - structured-output + +- name: label-speculative-decoding + description: Automatically apply speculative-decoding label + conditions: + - or: + - files~=^vllm/spec_decode/ + - files=vllm/model_executor/layers/spec_decode_base_sampler.py + - files~=^tests/spec_decode/ + actions: + label: + add: + - speculative-decoding + +- name: label-v1 + description: Automatically apply v1 label + conditions: + - or: + - files~=^vllm/v1/ + - files~=^tests/v1/ + actions: + label: + add: + - v1 + - name: ping author on conflicts and add 'needs-rebase' label conditions: - conflict diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml index 0085a1cc22373..50fea0c43cb8c 100644 --- a/.github/workflows/cleanup_pr_body.yml +++ b/.github/workflows/cleanup_pr_body.yml @@ -16,7 +16,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Set up Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: '3.12' diff --git a/.github/workflows/lint-and-deploy.yaml 
b/.github/workflows/lint-and-deploy.yaml index 556b60d2fca12..b199d0867a648 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -12,17 +12,17 @@ jobs: fetch-depth: 0 - name: Set up Helm - uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0 + uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0 with: version: v3.14.4 #Python is required because ct lint runs Yamale and yamllint which require Python. - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: '3.13' - name: Set up chart-testing - uses: helm/chart-testing-action@e6669bcd63d7cb57cb4380c33043eebe5d111992 # v2.6.1 + uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0 with: version: v3.10.1 @@ -47,7 +47,7 @@ jobs: aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive - name: Create kind cluster - uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0 + uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0 - name: Build the Docker image vllm cpu run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env . diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 06564969dc778..6ab63a4027704 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -10,10 +10,11 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: "3.12" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" + - run: echo "::add-matcher::.github/workflows/matchers/mypy.json" - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 with: extra_args: --all-files --hook-stage manual diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index e40ceaaa8b037..bfd02879965ee 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -39,7 +39,7 @@ jobs: const script = require('.github/workflows/scripts/create_release.js') await script(github, context, core) - # NOTE(simon): No longer build wheel using Github Actions. See buildkite's release workflow. + # NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow. # wheel: # name: Build Wheel # runs-on: ${{ matrix.os }} @@ -50,7 +50,7 @@ jobs: # matrix: # os: ['ubuntu-20.04'] # python-version: ['3.9', '3.10', '3.11', '3.12'] - # pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt. + # pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements/cuda.txt. # cuda-version: ['11.8', '12.1'] # steps: diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml index df62539c0b3d9..27318c2fdd93f 100644 --- a/.github/workflows/reminder_comment.yml +++ b/.github/workflows/reminder_comment.yml @@ -2,7 +2,6 @@ name: PR Reminder Comment Bot on: pull_request_target: types: [opened] - jobs: pr_reminder: runs-on: ubuntu-latest @@ -15,7 +14,12 @@ jobs: owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, - body: '👋 Hi! 
Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀' + body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' + + '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' + + 'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' + + 'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' + + 'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' + + '🚀' }) env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh index 122e4e101e201..0f010832b465d 100644 --- a/.github/workflows/scripts/build.sh +++ b/.github/workflows/scripts/build.sh @@ -9,7 +9,7 @@ PATH=${cuda_home}/bin:$PATH LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH # Install requirements -$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt +$python_executable -m pip install -r requirements/build.txt -r requirements/cuda.txt # Limit the number of parallel jobs to avoid OOM export MAX_JOBS=1 diff --git a/.github/workflows/scripts/create_release.js b/.github/workflows/scripts/create_release.js index 475742118afeb..0feb5dc2cf84b 100644 --- a/.github/workflows/scripts/create_release.js +++ b/.github/workflows/scripts/create_release.js @@ -1,4 +1,4 @@ -// Uses Github's API to create the release and wait for result. +// Uses GitHub's API to create the release and wait for result. // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately. 
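For illustration only: the comment above explains why a JS script is used rather than the GitHub CLI, which returns before the release exists. A minimal sketch of that create-and-wait pattern (not the actual contents of create_release.js), assuming the `github`/`context`/`core` objects that actions/github-script passes in and a hypothetical `RELEASE_TAG` environment variable:

module.exports = async (github, context, core) => {
  const { owner, repo } = context.repo;
  const tag = process.env.RELEASE_TAG; // hypothetical input
  await github.rest.repos.createRelease({ owner, repo, tag_name: tag });
  // Poll until the release is visible via the API, in case creation has not propagated yet.
  for (let attempt = 0; attempt < 30; attempt++) {
    try {
      const { data } = await github.rest.repos.getReleaseByTag({ owner, repo, tag });
      core.info(`Release ready: ${data.html_url}`);
      return data;
    } catch (err) {
      await new Promise((resolve) => setTimeout(resolve, 10000));
    }
  }
  core.setFailed(`Release for ${tag} did not appear in time`);
};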
module.exports = async (github, context, core) => { diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 81e7c9b050760..656f3d3fa7bc4 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -13,7 +13,7 @@ jobs: actions: write runs-on: ubuntu-latest steps: - - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0 + - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0 with: # Increasing this value ensures that changes to this workflow # propagate to all issues and PRs in days rather than months diff --git a/.gitignore b/.gitignore index 89dab8f13bab1..6f5cbd0733da0 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,8 @@ /vllm/_version.py # vllm-flash-attn built from source -vllm/vllm_flash_attn/ +vllm/vllm_flash_attn/* +!vllm/vllm_flash_attn/fa_utils.py # Byte-compiled / optimized / DLL files __pycache__/ @@ -197,7 +198,7 @@ _build/ hip_compat.h # Benchmark dataset -benchmarks/*.json +benchmarks/**/*.json # Linting actionlint diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 432bf5ed18dbc..484cd171f5f52 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,43 +1,51 @@ default_stages: - pre-commit # Run locally - manual # Run in CI +exclude: 'vllm/third_party/.*' repos: - repo: https://github.com/google/yapf - rev: v0.32.0 + rev: v0.43.0 hooks: - id: yapf args: [--in-place, --verbose] additional_dependencies: [toml] # TODO: Remove when yapf is upgraded - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.5 + rev: v0.9.3 hooks: - id: ruff - args: [--output-format, github] + args: [--output-format, github, --fix] - repo: https://github.com/codespell-project/codespell - rev: v2.3.0 + rev: v2.4.0 hooks: - id: codespell - exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*' + additional_dependencies: ['tomli'] + args: ['--toml', 'pyproject.toml'] - repo: https://github.com/PyCQA/isort - rev: 5.13.2 + rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0 hooks: - id: isort - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v18.1.5 + rev: v19.1.7 hooks: - id: clang-format - exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))' + exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*' types_or: [c++, cuda] args: [--style=file, --verbose] - repo: https://github.com/jackdewinter/pymarkdown rev: v0.9.27 hooks: - id: pymarkdown - files: docs/.* + args: [fix] - repo: https://github.com/rhysd/actionlint - rev: v1.7.6 + rev: v1.7.7 hooks: - id: actionlint +- repo: https://github.com/astral-sh/uv-pre-commit + rev: 0.6.2 + hooks: + - id: pip-compile + args: [requirements/test.in, -o, requirements/test.txt] + files: ^requirements/test\.(in|txt)$ - repo: local hooks: - id: mypy-local @@ -45,7 +53,7 @@ repos: entry: tools/mypy.sh 0 "local" language: python types: [python] - additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests] + additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests] stages: [pre-commit] # Don't run in CI - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.9 @@ -85,8 +93,37 @@ repos: entry: tools/png-lint.sh language: script types: [png] + - id: signoff-commit + name: Sign-off Commit 
+ entry: bash + args: + - -c + - | + if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then + printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG + fi + language: system + verbose: true + stages: [commit-msg] + - id: check-spdx-header + name: Check SPDX headers + entry: python tools/check_spdx_header.py + language: python + types: [python] + - id: check-filenames + name: Check for spaces in all filenames + entry: bash + args: + - -c + - 'git ls-files | grep " " && echo "Filenames should not contain spaces!" && exit 1 || exit 0' + language: system + always_run: true + pass_filenames: false + # Keep `suggestion` last - id: suggestion name: Suggestion entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."' language: system verbose: true + pass_filenames: false + # Insert new entries above the `suggestion` entry diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 284196bc2d279..2781ec223b665 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -18,4 +18,4 @@ formats: [] # Optionally declare the Python requirements required to build your docs python: install: - - requirements: docs/requirements-docs.txt + - requirements: requirements/docs.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b2cdff7aa343..ae69268ece8e5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,10 +31,10 @@ set(ignoreMe "${VLLM_PYTHON_PATH}") set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") # Supported NVIDIA architectures. -set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0") +set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0") # Supported AMD GPU architectures. -set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101") +set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101") # # Supported/expected torch versions for CUDA/ROCm. @@ -46,8 +46,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11 # requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1") -set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1") +set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0") +set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0") # # Try to find python package with an executable that exactly matches @@ -174,6 +174,25 @@ include(FetchContent) file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}") +# +# Set rocm version dev int. +# +if(VLLM_GPU_LANG STREQUAL "HIP") + # + # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info + # + set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3") + + + # + # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates + # a lot of warnings that always mask real issues. Suppressing until this is properly addressed. 
+ # + set(CMAKE_${VLLM_GPU_LANG}_FLAGS "${CMAKE_${VLLM_GPU_LANG}_FLAGS} -Wno-unused-result") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result") +endif() + # # Define other extension targets # @@ -192,7 +211,7 @@ set_gencode_flags_for_srcs( if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Enabling cumem allocator extension.") # link against cuda driver library - list(APPEND CUMEM_LIBS cuda) + list(APPEND CUMEM_LIBS CUDA::cuda_driver) define_gpu_extension_target( cumem_allocator DESTINATION vllm @@ -229,7 +248,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case. - set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use") + # Please keep this in sync with FetchContent_Declare line below. + set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use") # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) @@ -246,7 +266,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") FetchContent_Declare( cutlass GIT_REPOSITORY https://github.com/nvidia/cutlass.git - GIT_TAG v3.6.0 + # Please keep this in sync with CUTLASS_REVISION line above. + GIT_TAG v3.8.0 GIT_PROGRESS TRUE # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history. @@ -265,8 +286,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/custom_all_reduce.cu" "csrc/permute_cols.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" + "csrc/quantization/fp4/nvfp4_quant_entry.cu" + "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu" "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" - "csrc/sparse/cutlass/sparse_compressor_entry.cu" "csrc/cutlass_extensions/common.cpp") set_gencode_flags_for_srcs( @@ -276,7 +298,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Only build Marlin kernels if we are building for at least some compatible archs. # Keep building Marlin for 9.0 as there are some group sizes and shapes that # are not supported by Machete yet. - cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS}) + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}") if (MARLIN_ARCHS) set(MARLIN_SRCS "csrc/quantization/fp8/fp8_marlin.cu" @@ -296,38 +318,87 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") " in CUDA target architectures") endif() + # Only build AllSpark kernels if we are building for at least some compatible archs. + cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}") + if (ALLSPARK_ARCHS) + set(ALLSPARK_SRCS + "csrc/quantization/gptq_allspark/allspark_repack.cu" + "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu") + set_gencode_flags_for_srcs( + SRCS "${ALLSPARK_SRCS}" + CUDA_ARCHS "${ALLSPARK_ARCHS}") + list(APPEND VLLM_EXT_SRC "${ALLSPARK_SRCS}") + message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}") + else() + message(STATUS "Not building AllSpark kernels as no compatible archs found" + " in CUDA target architectures") + endif() + + + set(SCALED_MM_3X_ARCHS) # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require - # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now). 
- cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) - set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu") + # CUDA 12.0 or later + cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS) + set(SRCS + "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" - CUDA_ARCHS "${SCALED_MM_3X_ARCHS}") + CUDA_ARCHS "${SCALED_MM_ARCHS}") list(APPEND VLLM_EXT_SRC "${SRCS}") - list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1") - message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1") + # Let scaled_mm_c2x know it doesn't need to build these arches + list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}") + message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}") else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) - message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is " + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS) + message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is " "not >= 12.0, we recommend upgrading to CUDA 12.0 or " "later if you intend on running FP8 quantized models on " "Hopper.") else() - message(STATUS "Not building scaled_mm_c3x as no compatible archs found " + message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found " "in CUDA target architectures") endif() + endif() - # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't - # build any 3x kernels - set(SCALED_MM_3X_ARCHS) + # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require + # CUDA 12.8 or later + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS) + set(SRCS + "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu" + ) + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1") + # Let scaled_mm_c2x know it doesn't need to build these arches + list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}") + message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS) + message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is " + "not >= 12.8, we recommend upgrading to CUDA 12.8 or " + "later if you intend on running FP8 quantized models on " + "Blackwell.") + else() + message(STATUS "Not building scaled_mm_c3x_100 as no compatible archs found " + "in CUDA target architectures") + endif() endif() # # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) # kernels for the remaining archs that are not already built for 3x. 
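For illustration only: the kernel hunks above all follow the same gating pattern -- intersect a kernel's supported arch list with the requested `CUDA_ARCHS`, check the CUDA compiler version, and only then add the sources, gencode flags, and an `ENABLE_*` define. A sketch of that pattern using the helper functions this CMakeLists.txt already relies on, with a hypothetical kernel name and source file:

cuda_archs_loose_intersection(EXAMPLE_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND EXAMPLE_ARCHS)
  set(SRCS "csrc/example/example_kernel.cu")   # hypothetical source file
  set_gencode_flags_for_srcs(
    SRCS "${SRCS}"
    CUDA_ARCHS "${EXAMPLE_ARCHS}")
  list(APPEND VLLM_EXT_SRC "${SRCS}")
  list(APPEND VLLM_GPU_FLAGS "-DENABLE_EXAMPLE_KERNEL=1")
  message(STATUS "Building example_kernel for archs: ${EXAMPLE_ARCHS}")
else()
  message(STATUS "Not building example_kernel as no compatible archs were found.")
endif()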
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS - "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") + "7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}") # subtract out the archs that are already built for 3x list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) if (SCALED_MM_2X_ARCHS) @@ -352,18 +423,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # 2:4 Sparse Kernels # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor - # require CUDA 12.2 or later (and only work on Hopper, 9.0/9.0a for now). - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS) - set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu" - "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu") + # require CUDA 12.2 or later (and only work on Hopper). + cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS) + set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" - CUDA_ARCHS "${SCALED_MM_3X_ARCHS}") + CUDA_ARCHS "${SCALED_MM_ARCHS}") list(APPEND VLLM_EXT_SRC "${SRCS}") list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1") - message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}") + message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}") else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS) + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS) message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is " "not >= 12.2, we recommend upgrading to CUDA 12.2 or later " "if you intend on running FP8 sparse quantized models on Hopper.") @@ -373,6 +444,23 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() endif() + # FP4 Archs and flags + cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS) + set(SRCS + "csrc/quantization/fp4/nvfp4_quant_kernels.cu" + "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${FP4_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1") + message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}") + else() + message(STATUS "Not building NVFP4 as no compatible archs were found.") + # clear FP4_ARCHS + set(FP4_ARCHS) + endif() # # Machete kernels @@ -454,7 +542,8 @@ define_gpu_extension_target( SOURCES ${VLLM_EXT_SRC} COMPILE_FLAGS ${VLLM_GPU_FLAGS} ARCHITECTURES ${VLLM_GPU_ARCHES} - INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} + INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR} + INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} USE_SABI 3 WITH_SOABI) @@ -473,12 +562,24 @@ set(VLLM_MOE_EXT_SRC "csrc/moe/moe_align_sum_kernels.cu" "csrc/moe/topk_softmax_kernels.cu") +if(VLLM_GPU_LANG STREQUAL "CUDA") + list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu") +endif() + set_gencode_flags_for_srcs( SRCS "${VLLM_MOE_EXT_SRC}" CUDA_ARCHS "${CUDA_ARCHS}") if(VLLM_GPU_LANG STREQUAL "CUDA") - cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") + set(VLLM_MOE_WNA16_SRC + "csrc/moe/moe_wna16.cu") + + set_gencode_flags_for_srcs( + SRCS "${VLLM_MOE_WNA16_SRC}" + CUDA_ARCHS "${CUDA_ARCHS}") + + list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}") + cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}") if (MARLIN_MOE_ARCHS) 
set(MARLIN_MOE_SRC "csrc/moe/marlin_kernels/marlin_moe_kernel.h" @@ -532,77 +633,8 @@ if(VLLM_GPU_LANG STREQUAL "HIP") WITH_SOABI) endif() -# vllm-flash-attn currently only supported on CUDA -if (NOT VLLM_GPU_LANG STREQUAL "CUDA") - return() +# For CUDA we also build and ship some external projects. +if (VLLM_GPU_LANG STREQUAL "CUDA") + include(cmake/external_projects/flashmla.cmake) + include(cmake/external_projects/vllm_flash_attn.cmake) endif () - -# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target -# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the -# arches in the CUDA case (and instead set the gencodes on a per file basis) -# we need to manually set VLLM_GPU_ARCHES here. -if(VLLM_GPU_LANG STREQUAL "CUDA") - foreach(_ARCH ${CUDA_ARCHS}) - string(REPLACE "." "" _ARCH "${_ARCH}") - list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real") - endforeach() -endif() - -# -# Build vLLM flash attention from source -# -# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM. -# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs. -# They should be identical but if they aren't, this is a massive footgun. -# -# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place. -# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3). -# If no component is specified, vllm-flash-attn is still installed. - -# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading. -# This is to enable local development of vllm-flash-attn within vLLM. -# It can be set as an environment variable or passed as a cmake argument. -# The environment variable takes precedence. -if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR}) - set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR}) -endif() - -if(VLLM_FLASH_ATTN_SRC_DIR) - FetchContent_Declare( - vllm-flash-attn SOURCE_DIR - ${VLLM_FLASH_ATTN_SRC_DIR} - BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn - ) -else() - FetchContent_Declare( - vllm-flash-attn - GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 90eacc1af2a7c3de62ea249e929ed5faccf38954 - GIT_PROGRESS TRUE - # Don't share the vllm-flash-attn build between build types - BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn - ) -endif() - - -# Fetch the vllm-flash-attn library -FetchContent_MakeAvailable(vllm-flash-attn) -message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}") - -# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in -# case only one is built, in the case both are built redundant work is done) -install( - DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ - DESTINATION vllm_flash_attn - COMPONENT _vllm_fa2_C - FILES_MATCHING PATTERN "*.py" -) - -install( - DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ - DESTINATION vllm_flash_attn - COMPONENT _vllm_fa3_C - FILES_MATCHING PATTERN "*.py" -) - -# Nothing after vllm-flash-attn, see comment about macros above diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index f801b5f8f5513..5268ff135c9d0 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -61,7 +61,7 @@ representative at an online or offline/IRL event. 
Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement in the #code-of-conduct -channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g). +channel in the [vLLM Slack](https://slack.vllm.ai). All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the @@ -125,4 +125,3 @@ Community Impact Guidelines were inspired by For answers to common questions about this code of conduct, see the [Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at [Contributor Covenant translations](https://www.contributor-covenant.org/translations). - diff --git a/Dockerfile b/Dockerfile index 261f5440aee47..d1ecef586d50b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,6 +27,13 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ && python3 --version && python3 -m pip --version +# Install uv for faster pip installs +RUN --mount=type=cache,target=/root/.cache/uv \ + python3 -m pip install uv + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 # as it was causing spam when compiling the CUTLASS kernels @@ -50,15 +57,16 @@ WORKDIR /workspace # we need to install torch and torchvision from the nightly builds first, # pytorch will not appear as a vLLM dependency in all of the following steps # after this step -RUN --mount=type=cache,target=/root/.cache/pip \ +RUN --mount=type=cache,target=/root/.cache/uv \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \ + uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \ + uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \ fi -COPY requirements-common.txt requirements-common.txt -COPY requirements-cuda.txt requirements-cuda.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install -r requirements-cuda.txt +COPY requirements/common.txt requirements/common.txt +COPY requirements/cuda.txt requirements/cuda.txt +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/cuda.txt # cuda arch list used by torch # can be useful for both `dev` and `test` @@ -76,15 +84,19 @@ FROM base AS build ARG TARGETPLATFORM # install build dependencies -COPY requirements-build.txt requirements-build.txt +COPY requirements/build.txt requirements/build.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install -r requirements-build.txt +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/build.txt COPY . . 
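For illustration only: the recurring pattern in this Dockerfile is `uv` installed once per stage, a raised `UV_HTTP_TIMEOUT` for large downloads, and `uv pip install --system` behind a cache mount. A self-contained sketch of that pattern -- the base image and requirements file below are hypothetical, not taken from this Dockerfile:

FROM python:3.12-slim AS uv-example
RUN python3 -m pip install uv
# Large wheel downloads can exceed uv's default timeout; see https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
COPY requirements/example.txt requirements/example.txt
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/example.txt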
ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ - if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi + if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi # max jobs used by Ninja to build extensions ARG max_jobs=2 @@ -98,7 +110,7 @@ ARG SCCACHE_BUCKET_NAME=vllm-build-sccache ARG SCCACHE_REGION_NAME=us-west-2 ARG SCCACHE_S3_NO_CREDENTIALS=0 # if USE_SCCACHE is set, use sccache to speed up compilation -RUN --mount=type=cache,target=/root/.cache/pip \ +RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=.git,target=.git \ if [ "$USE_SCCACHE" = "1" ]; then \ echo "Installing sccache..." \ @@ -118,16 +130,19 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ENV CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ - --mount=type=cache,target=/root/.cache/pip \ + --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=.git,target=.git \ if [ "$USE_SCCACHE" != "1" ]; then \ + # Clean any existing CMake artifacts + rm -rf .deps && \ + mkdir -p .deps && \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ fi # Check the size of the wheel if RUN_WHEEL_CHECK is true COPY .buildkite/check-wheel-size.py check-wheel-size.py -# Default max size of the wheel is 250MB -ARG VLLM_MAX_SIZE_MB=250 +# sync the default value with .buildkite/check-wheel-size.py +ARG VLLM_MAX_SIZE_MB=400 ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB ARG RUN_WHEEL_CHECK=true RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ @@ -140,16 +155,21 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ #################### DEV IMAGE #################### FROM base as dev -COPY requirements-lint.txt requirements-lint.txt -COPY requirements-test.txt requirements-test.txt -COPY requirements-dev.txt requirements-dev.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install -r requirements-dev.txt +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + +COPY requirements/lint.txt requirements/lint.txt +COPY requirements/test.txt requirements/test.txt +COPY requirements/dev.txt requirements/dev.txt +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/dev.txt #################### DEV IMAGE #################### #################### vLLM installation IMAGE #################### # image with vLLM installed -FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base +# TODO: Restore to base image after FlashInfer AOT wheel fixed +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base ARG CUDA_VERSION=12.4.1 ARG PYTHON_VERSION=3.12 WORKDIR /vllm-workspace @@ -173,6 +193,13 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ && python3 --version && python3 -m pip --version +# Install uv for faster pip installs +RUN --mount=type=cache,target=/root/.cache/uv \ + python3 -m pip install uv + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 # Workaround for https://github.com/openai/triton/issues/2507 and # https://github.com/pytorch/pytorch/issues/107960 -- hopefully @@ -184,22 +211,44 @@ RUN ldconfig 
/usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ # we need to install torch and torchvision from the nightly builds first, # pytorch will not appear as a vLLM dependency in all of the following steps # after this step -RUN --mount=type=cache,target=/root/.cache/pip \ +RUN --mount=type=cache,target=/root/.cache/uv \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \ + uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \ + uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \ fi # Install vllm wheel first, so that torch etc will be installed. RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ - --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install dist/*.whl --verbose + --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system dist/*.whl --verbose -RUN --mount=type=cache,target=/root/.cache/pip \ +# If we need to build FlashInfer wheel before its release: +# $ export FLASHINFER_ENABLE_AOT=1 +# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+ +# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX' +# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive +# $ cd flashinfer +# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4 +# $ rm -rf build +# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose +# $ ls dist +# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl + +RUN --mount=type=cache,target=/root/.cache/uv \ . /etc/environment && \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ - python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \ + uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \ fi COPY examples examples + +# Although we build Flashinfer with AOT mode, there's still +# some issues w.r.t. JIT compilation. Therefore we need to +# install build dependencies for JIT compilation. +# TODO: Remove this once FlashInfer AOT wheel is fixed +COPY requirements/build.txt requirements/build.txt +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/build.txt + #################### vLLM installation IMAGE #################### #################### TEST IMAGE #################### @@ -209,17 +258,21 @@ FROM vllm-base AS test ADD . 
/vllm-workspace/ -# install development dependencies (for testing) -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install -r requirements-dev.txt +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 # install development dependencies (for testing) -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install -e tests/vllm_test_utils +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/dev.txt + +# install development dependencies (for testing) +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -e tests/vllm_test_utils # enable fast downloads from hf (for testing) -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install hf_transfer +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system hf_transfer ENV HF_HUB_ENABLE_HF_TRANSFER 1 # Copy in the v1 package for testing (it isn't distributed yet) @@ -237,12 +290,16 @@ RUN mv vllm test_docs/ # base openai image with additional requirements, for any subsequent openai-style images FROM vllm-base AS vllm-openai-base +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + # install additional dependencies for openai api server -RUN --mount=type=cache,target=/root/.cache/pip \ +RUN --mount=type=cache,target=/root/.cache/uv \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ + uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ else \ - pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ + uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ fi ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/Dockerfile.arm b/Dockerfile.arm index 093ee2209222f..bad093684239c 100644 --- a/Dockerfile.arm +++ b/Dockerfile.arm @@ -26,18 +26,18 @@ WORKDIR /workspace ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ + --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \ pip install --upgrade pip && \ - pip install -r requirements-build.txt + pip install -r requirements/build.txt FROM cpu-test-arm AS build WORKDIR /workspace/vllm RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \ - --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \ - pip install -v -r requirements-cpu.txt + --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \ + --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \ + pip install -v -r requirements/cpu.txt COPY . . 
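For illustration only: Dockerfile.arm and Dockerfile.cpu both rely on `PIP_EXTRA_INDEX_URL` pointing at the PyTorch CPU wheel index so that `torch` resolves to a CPU-only build, and they bind-mount the requirements files instead of copying them into the layer. A hypothetical stage sketching both conventions (the base image tag is an example, not from these Dockerfiles):

FROM ubuntu:22.04 AS cpu-wheel-example
RUN apt-get update -y && apt-get install -y python3-pip
ENV PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
    pip install -v -r requirements/cpu.txt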
ARG GIT_REPO_CHECK=0 diff --git a/Dockerfile.cpu b/Dockerfile.cpu index ebe226cf6d148..a10090529d8a9 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -22,25 +22,25 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li RUN echo 'ulimit -c 0' >> ~/.bashrc -RUN pip install intel_extension_for_pytorch==2.5.0 +RUN pip install intel_extension_for_pytorch==2.6.0 WORKDIR /workspace ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ + --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \ pip install --upgrade pip && \ - pip install -r requirements-build.txt + pip install -r requirements/build.txt FROM cpu-test-1 AS build WORKDIR /workspace/vllm RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \ - --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \ - pip install -v -r requirements-cpu.txt + --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \ + --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \ + pip install -v -r requirements/cpu.txt COPY . . ARG GIT_REPO_CHECK=0 diff --git a/Dockerfile.hpu b/Dockerfile.hpu index 66cf68c32f2ca..48211c88f872b 100644 --- a/Dockerfile.hpu +++ b/Dockerfile.hpu @@ -4,7 +4,7 @@ COPY ./ /workspace/vllm WORKDIR /workspace/vllm -RUN pip install -v -r requirements-hpu.txt +RUN pip install -v -r requirements/hpu.txt ENV no_proxy=localhost,127.0.0.1 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true diff --git a/Dockerfile.neuron b/Dockerfile.neuron index e9cb82889decd..067645906366e 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -23,10 +23,12 @@ WORKDIR ${APP_MOUNT}/vllm RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas RUN python3 -m pip install sentencepiece transformers==4.45.2 -U -RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install pytest +# uninstall transformers-neuronx package explicitly to avoid version conflict +RUN python3 -m pip uninstall -y transformers-neuronx + COPY . . 
ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ @@ -34,7 +36,7 @@ RUN --mount=type=bind,source=.git,target=.git \ RUN python3 -m pip install -U \ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ - -r requirements-neuron.txt + -r requirements/neuron.txt ENV VLLM_TARGET_DEVICE neuron RUN --mount=type=bind,source=.git,target=.git \ @@ -43,6 +45,10 @@ RUN --mount=type=bind,source=.git,target=.git \ # install development dependencies (for testing) RUN python3 -m pip install -e tests/vllm_test_utils +# install transformers-neuronx package as an optional dependencies (for V0) +# FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict +RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps + # overwrite entrypoint to run bash script RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py diff --git a/Dockerfile.openvino b/Dockerfile.openvino deleted file mode 100644 index 32bcbfa9cc168..0000000000000 --- a/Dockerfile.openvino +++ /dev/null @@ -1,29 +0,0 @@ -# The vLLM Dockerfile is used to construct vLLM image that can be directly used -# to run the OpenAI compatible server. - -FROM ubuntu:22.04 AS dev - -RUN apt-get update -y && \ - apt-get install -y \ - git python3-pip \ - ffmpeg libsm6 libxext6 libgl1 -WORKDIR /workspace - -COPY . . -ARG GIT_REPO_CHECK=0 -RUN --mount=type=bind,source=.git,target=.git \ - if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi - -RUN python3 -m pip install -U pip -# install build requirements -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt -# build vLLM with OpenVINO backend -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace - -COPY examples/ /workspace/examples -COPY benchmarks/ /workspace/benchmarks - -# install development dependencies (for testing) -RUN python3 -m pip install -e tests/vllm_test_utils - -CMD ["/bin/bash"] diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index d3cd1c7b313bc..913c289adc01e 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -1,38 +1,267 @@ -FROM mambaorg/micromamba -ARG MAMBA_DOCKERFILE_ACTIVATE=1 -USER root +ARG BASE_UBI_IMAGE_TAG=9.5-1741850109 -ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/" +############################################################### +# base stage with basic dependencies +############################################################### -RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev +FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base-builder -# Some packages in requirements-cpu are installed here -# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba -# Currently these may not be available for venv or pip directly -RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes +ARG PYTHON_VERSION=3.12 +ARG OPENBLAS_VERSION=0.3.29 + +# Set Environment Variables for venv, cargo & openblas +ENV VIRTUAL_ENV=/opt/vllm +ENV PATH=${VIRTUAL_ENV}/bin:/root/.cargo/bin:$PATH +ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig/ +ENV 
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64:/usr/local/lib:/usr/lib64:/usr/lib +ENV UV_LINK_MODE=copy + +# install gcc-13, python, rust, openblas +# Note: A symlink for libatomic.so is created for gcc-13 (linker fails to find libatomic otherwise - reqd. for sentencepiece) +# Note: A dummy file 'control' is created in /tmp/ to artificially create dependencies between stages when building stages in parallel +# when `--jobs=` is passed with podman build command +RUN microdnf install -y openssl-devel dnf \ + && dnf install -y https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-gpg-keys-9.0-24.el9.noarch.rpm \ + https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-stream-repos-9.0-24.el9.noarch.rpm \ + https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \ + && dnf config-manager --add-repo https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os \ + && dnf config-manager --add-repo https://mirror.stream.centos.org/9-stream/AppStream/`arch`/os \ + && dnf config-manager --set-enabled crb \ + && dnf install -y \ + git tar gcc-toolset-13 automake libtool numactl-devel lapack-devel \ + pkgconfig xsimd zeromq-devel kmod findutils protobuf* \ + libtiff-devel libjpeg-devel openjpeg2-devel zlib-devel \ + freetype-devel lcms2-devel libwebp-devel tcl-devel tk-devel \ + harfbuzz-devel fribidi-devel libraqm-devel libimagequant-devel libxcb-devel \ + python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \ + && dnf clean all \ + && ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \ + && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \ + && python -m pip install -U pip uv \ + && uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python cmake ninja cython scikit_build_core scikit_build \ + && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \ + && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \ + && cd /tmp && touch control + +############################################################### +# Stage to build torch family +############################################################### + +FROM base-builder AS torch-builder + +ARG MAX_JOBS +ARG TORCH_VERSION=2.6.0 +ARG _GLIBCXX_USE_CXX11_ABI=1 +RUN --mount=type=cache,target=/root/.cache/uv \ + source /opt/rh/gcc-toolset-13/enable && \ + git clone --recursive https://github.com/pytorch/pytorch.git -b v${TORCH_VERSION} && \ + cd pytorch && \ + uv pip install -r requirements.txt && \ + python setup.py develop && \ + rm -f dist/torch*+git*whl && \ + MAX_JOBS=${MAX_JOBS:-$(nproc)} \ + PYTORCH_BUILD_VERSION=${TORCH_VERSION} PYTORCH_BUILD_NUMBER=1 uv build --wheel --out-dir /torchwheels/ + +ARG TORCHVISION_VERSION=0.21.0 +ARG TORCHVISION_USE_NVJPEG=0 +ARG TORCHVISION_USE_FFMPEG=0 +RUN --mount=type=cache,target=/root/.cache/uv \ + source /opt/rh/gcc-toolset-13/enable && \ + git clone --recursive https://github.com/pytorch/vision.git -b v${TORCHVISION_VERSION} && \ + cd vision && \ + MAX_JOBS=${MAX_JOBS:-$(nproc)} \ + BUILD_VERSION=${TORCHVISION_VERSION} \ + uv build --wheel --out-dir /torchwheels/ --no-build-isolation + +ARG TORCHAUDIO_VERSION=2.6.0 +ARG BUILD_SOX=1 +ARG BUILD_KALDI=1 +ARG BUILD_RNNT=1 +ARG USE_FFMPEG=0 +ARG USE_ROCM=0 +ARG USE_CUDA=0 +ARG TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_FFMPEG=1 +RUN --mount=type=cache,target=/root/.cache/uv \ + source /opt/rh/gcc-toolset-13/enable && \ + git clone --recursive https://github.com/pytorch/audio.git -b 
v${TORCHAUDIO_VERSION} && \ + cd audio && \ + MAX_JOBS=${MAX_JOBS:-$(nproc)} \ + BUILD_VERSION=${TORCHAUDIO_VERSION} \ + uv build --wheel --out-dir /torchwheels/ --no-build-isolation + +############################################################### +# Stage to build pyarrow +############################################################### + +FROM base-builder AS arrow-builder + +ARG MAX_JOBS +ARG PYARROW_PARALLEL +ARG PYARROW_VERSION=19.0.1 +RUN --mount=type=cache,target=/root/.cache/uv \ + source /opt/rh/gcc-toolset-13/enable && \ + git clone --recursive https://github.com/apache/arrow.git -b apache-arrow-${PYARROW_VERSION} && \ + cd arrow/cpp && \ + mkdir build && cd build && \ + cmake -DCMAKE_BUILD_TYPE=release \ + -DCMAKE_INSTALL_PREFIX=/usr/local \ + -DARROW_PYTHON=ON \ + -DARROW_BUILD_TESTS=OFF \ + -DARROW_JEMALLOC=ON \ + -DARROW_BUILD_STATIC="OFF" \ + -DARROW_PARQUET=ON \ + .. && \ + make install -j ${MAX_JOBS:-$(nproc)} && \ + cd ../../python/ && \ + uv pip install -v -r requirements-wheel-build.txt && \ + PYARROW_PARALLEL=${PYARROW_PARALLEL:-$(nproc)} \ + python setup.py build_ext \ + --build-type=release --bundle-arrow-cpp \ + bdist_wheel --dist-dir /arrowwheels/ + +############################################################### +# Stage to build opencv +############################################################### + +FROM base-builder AS cv-builder + +ARG MAX_JOBS +ARG OPENCV_VERSION=84 +ARG ENABLE_HEADLESS=1 +RUN --mount=type=cache,target=/root/.cache/uv \ + source /opt/rh/gcc-toolset-13/enable && \ + git clone --recursive https://github.com/opencv/opencv-python.git -b ${OPENCV_VERSION} && \ + cd opencv-python && \ + sed -i 's/"setuptools==59.2.0",/"setuptools<70.0",/g' pyproject.toml && \ + python -m build --wheel --installer=uv --outdir /opencvwheels/ + +############################################################### +# Stage to build vllm - this stage builds and installs +# vllm, tensorizer and vllm-tgis-adapter and builds uv cache +# for transitive dependencies - eg. 
grpcio +############################################################### + +FROM base-builder AS vllmcache-builder + +COPY --from=torch-builder /tmp/control /dev/null +COPY --from=arrow-builder /tmp/control /dev/null +COPY --from=cv-builder /tmp/control /dev/null + +ARG VLLM_TARGET_DEVICE=cpu + +# this step installs vllm and populates uv cache +# with all the transitive dependencies +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \ + --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \ + --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \ + --mount=type=bind,src=.,dst=/src/,rw \ + source /opt/rh/gcc-toolset-13/enable && \ + uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \ + sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \ + uv pip install pandas pythran pybind11 && \ + # sentencepiece.pc is in some pkgconfig inside uv cache + export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \ + uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \ + cd /src/ && \ + uv build --wheel --out-dir /vllmwheel/ --no-build-isolation && \ + uv pip install /vllmwheel/*.whl + + +############################################################### +# Stage to build numactl +############################################################### + +FROM base-builder AS numa-builder + +# Note: Building numactl with gcc-11. Compiling with gcc-13 in this builder stage will +# trigger recompilation with gcc-11 (and require libtool) in the final stage where we do not have gcc-13 +ARG MAX_JOBS +ARG NUMACTL_VERSION=2.0.19 +RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_VERSION} \ + && cd numactl \ + && autoreconf -i && ./configure \ + && make -j ${MAX_JOBS:-$(nproc)} + +############################################################### +# Stage to build lapack +############################################################### + +FROM base-builder AS lapack-builder + +ARG MAX_JOBS +ARG LAPACK_VERSION=3.12.1 +RUN git clone --recursive https://github.com/Reference-LAPACK/lapack.git -b v${LAPACK_VERSION} \ + && cd lapack && source /opt/rh/gcc-toolset-13/enable \ + && cmake -B build -S . 
\ + && cmake --build build -j ${MAX_JOBS:-$(nproc)} + + +############################################################### +# FINAL VLLM IMAGE STAGE # +############################################################### + +FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS vllm-openai + +ARG PYTHON_VERSION=3.12 +ARG OPENBLAS_VERSION=0.3.29 + +# Set Environment Variables for venv & openblas +ENV VIRTUAL_ENV=/opt/vllm +ENV PATH=${VIRTUAL_ENV}/bin:$PATH +ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig/ +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64:/usr/local/lib:/usr/lib64:/usr/lib +ENV UV_LINK_MODE=copy + +# create artificial dependencies between stages for independent stages to build in parallel +COPY --from=torch-builder /tmp/control /dev/null +COPY --from=arrow-builder /tmp/control /dev/null +COPY --from=cv-builder /tmp/control /dev/null +COPY --from=vllmcache-builder /tmp/control /dev/null +COPY --from=numa-builder /tmp/control /dev/null +COPY --from=lapack-builder /tmp/control /dev/null + +# install gcc-11, python, openblas, numactl, lapack +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \ + --mount=type=bind,from=lapack-builder,source=/lapack/,target=/lapack/,rw \ + rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ + microdnf install --nodocs -y \ + tar findutils openssl \ + pkgconfig xsimd g++ gcc-fortran libsndfile \ + libtiff libjpeg openjpeg2 zlib zeromq \ + freetype lcms2 libwebp tcl tk utf8proc \ + harfbuzz fribidi libraqm libimagequant libxcb \ + python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \ + && microdnf clean all \ + && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \ + && python -m pip install -U pip uv --no-cache \ + && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \ + && make -C /numactl install \ + && uv pip install cmake \ + && cmake --install /lapack/build \ + && uv pip uninstall cmake + +# consume previously built wheels (including vllm) +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \ + --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \ + --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \ + --mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \ + HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /vllmwheel/*.whl COPY ./ /workspace/vllm - WORKDIR /workspace/vllm ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi -RUN --mount=type=cache,target=/root/.cache/pip \ - RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ - 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ - torch==2.3.1 \ - -r requirements-cpu.txt \ - xformers uvloop==0.20.0 - -RUN --mount=type=bind,source=.git,target=.git \ - VLLM_TARGET_DEVICE=cpu python3 setup.py install - # install development dependencies (for testing) -RUN python3 -m pip install -e tests/vllm_test_utils +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install -e tests/vllm_test_utils WORKDIR /workspace/ RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks -ENTRYPOINT 
["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"] +ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 14c522afd7f9e..841e7978a424f 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -38,14 +38,14 @@ FROM fetch_vllm AS build_vllm ARG USE_CYTHON # Build vLLM RUN cd vllm \ - && python3 -m pip install -r requirements-rocm.txt \ + && python3 -m pip install -r requirements/rocm.txt \ && python3 setup.py clean --all \ - && if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \ + && if [ ${USE_CYTHON} -eq "1" ]; then python3 tests/build_cython.py build_ext --inplace; fi \ && python3 setup.py bdist_wheel --dist-dir=dist FROM scratch AS export_vllm ARG COMMON_WORKDIR COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl / -COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt / +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements /requirements COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples @@ -60,7 +60,8 @@ RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* # Install vLLM RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ cd /install \ - && pip install -U -r requirements-rocm.txt \ + && pip install -U -r requirements/rocm.txt \ + && pip install -U -r requirements/rocm-test.txt \ && pip uninstall -y vllm \ && pip install *.whl @@ -99,7 +100,7 @@ RUN if [ ${BUILD_RPD} -eq "1" ]; then \ # Install vLLM RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ cd /install \ - && pip install -U -r requirements-rocm.txt \ + && pip install -U -r requirements/rocm.txt \ && pip uninstall -y vllm \ && pip install *.whl diff --git a/Dockerfile.rocm_base b/Dockerfile.rocm_base index 5bbe98b0c2204..38d6a33636eba 100644 --- a/Dockerfile.rocm_base +++ b/Dockerfile.rocm_base @@ -6,12 +6,14 @@ ARG RCCL_BRANCH="648a58d" ARG RCCL_REPO="https://github.com/ROCm/rccl" ARG TRITON_BRANCH="e5be006" ARG TRITON_REPO="https://github.com/triton-lang/triton.git" -ARG PYTORCH_BRANCH="8d4926e" +ARG PYTORCH_BRANCH="3a585126" ARG PYTORCH_VISION_BRANCH="v0.19.1" ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG FA_BRANCH="b7d29fb" ARG FA_REPO="https://github.com/ROCm/flash-attention.git" +ARG AITER_BRANCH="21d47a9" +ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base @@ -129,8 +131,18 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ pip install /install/*.whl +ARG AITER_REPO +ARG AITER_BRANCH +RUN git clone --recursive ${AITER_REPO} +RUN cd aiter \ + && git checkout ${AITER_BRANCH} \ + && git submodule update --init --recursive \ + && pip install -r requirements.txt \ + && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter + ARG BASE_IMAGE ARG HIPBLASLT_BRANCH +ARG HIPBLAS_COMMON_BRANCH ARG LEGACY_HIPBLASLT_OPTION ARG RCCL_BRANCH ARG RCCL_REPO @@ -155,4 +167,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \ && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ - && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt + && echo 
"FA_REPO: ${FA_REPO}" >> /app/versions.txt \ + && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ + && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt diff --git a/Dockerfile.s390x b/Dockerfile.s390x new file mode 100644 index 0000000000000..5a84dc12d8f71 --- /dev/null +++ b/Dockerfile.s390x @@ -0,0 +1,152 @@ +# Base UBI image for s390x architecture +ARG BASE_UBI_IMAGE_TAG=9.5-1736404155 +ARG PYTHON_VERSION=3.12 +FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base + +# Install basic dependencies +ARG PYTHON_VERSION +ENV PYTHON_VERSION=${PYTHON_VERSION} + +WORKDIR /workspace + +ENV LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 + +# Install development utilities +RUN microdnf install -y \ + which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \ + libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \ + openssl-devel openblas openblas-devel autoconf automake libtool cmake && \ + microdnf clean all + +# Python Installation +FROM base AS python-install +ARG PYTHON_VERSION + +ENV VIRTUAL_ENV=/opt/vllm +ENV PATH="$VIRTUAL_ENV/bin:$PATH" +ENV PYTHON_VERSION=${PYTHON_VERSION} +RUN microdnf install -y \ + python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel && \ + python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel uv && microdnf clean all + +FROM python-install AS pyarrow + +# Build Apache Arrow +WORKDIR /tmp +RUN --mount=type=cache,target=/root/.cache/uv \ + git clone https://github.com/apache/arrow.git && \ + cd arrow/cpp && \ + mkdir release && cd release && \ + cmake -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=/usr/local \ + -DARROW_PYTHON=ON \ + -DARROW_PARQUET=ON \ + -DARROW_ORC=ON \ + -DARROW_FILESYSTEM=ON \ + -DARROW_WITH_LZ4=ON \ + -DARROW_WITH_ZSTD=ON \ + -DARROW_WITH_SNAPPY=ON \ + -DARROW_JSON=ON \ + -DARROW_CSV=ON \ + -DARROW_DATASET=ON \ + -DPROTOBUF_PROTOC_EXECUTABLE=/usr/bin/protoc \ + -DARROW_DEPENDENCY_SOURCE=BUNDLED \ + .. && \ + make -j$(nproc) && \ + make install && \ + cd ../../python && \ + export PYARROW_PARALLEL=4 && \ + export ARROW_BUILD_TYPE=release && \ + uv pip install -r requirements/build.txt && \ + python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --bundle-arrow-cpp bdist_wheel + +FROM python-install AS numa-build +# Install numactl (needed for numa.h dependency) +WORKDIR /tmp +RUN curl -LO https://github.com/numactl/numactl/archive/refs/tags/v2.0.16.tar.gz && \ + tar -xvzf v2.0.16.tar.gz && \ + cd numactl-2.0.16 && \ + ./autogen.sh && \ + ./configure && \ + make + +# Set include path +ENV C_INCLUDE_PATH="/usr/local/include:$C_INCLUDE_PATH" + +FROM python-install AS rust +ENV CARGO_HOME=/root/.cargo +ENV RUSTUP_HOME=/root/.rustup +ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" + +RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \ + . 
"$CARGO_HOME/env" && \ + rustup default stable && \ + rustup show + +FROM python-install AS torch-vision +# Install torchvision +ARG TORCH_VERSION=2.7.0.dev20250304 +ARG TORCH_VISION_VERSION=v0.20.1 +WORKDIR /tmp +RUN --mount=type=cache,target=/root/.cache/uv \ + git clone https://github.com/pytorch/vision.git && \ + cd vision && \ + git checkout $TORCH_VISION_VERSION && \ + uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \ + python setup.py bdist_wheel + +# Final build stage +FROM python-install AS vllm-cpu +ARG PYTHON_VERSION + +# Set correct library path for torch and numactl +ENV LD_LIBRARY_PATH="/opt/vllm/lib64/python${PYTHON_VERSION}/site-packages/torch/lib:/usr/local/lib:$LD_LIBRARY_PATH" +ENV C_INCLUDE_PATH="/usr/local/include:$C_INCLUDE_PATH" +ENV UV_LINK_MODE=copy +ENV CARGO_HOME=/root/.cargo +ENV RUSTUP_HOME=/root/.rustup +ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" + +COPY . /workspace/vllm +WORKDIR /workspace/vllm + +RUN --mount=type=bind,from=numa-build,src=/tmp/numactl-2.0.16,target=/numactl \ + make -C /numactl install + +# Install dependencies, including PyTorch and Apache Arrow +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \ + --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \ + --mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \ + --mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \ + sed -i '/^torch/d' requirements/build.txt && \ + ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \ + VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \ + uv pip install -v \ + $ARROW_WHL_FILE \ + $VISION_WHL_FILE \ + --extra-index-url https://download.pytorch.org/whl/nightly/cpu \ + --index-strategy unsafe-best-match \ + -r requirements/build.txt \ + -r requirements/cpu.txt + +# Build and install vllm +RUN --mount=type=cache,target=/root/.cache/uv \ + VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \ + uv pip install "$(echo dist/*.whl)[tensorizer]" + +# setup non-root user for vllm +RUN umask 002 && \ + useradd --uid 2000 --gid 0 vllm && \ + mkdir -p /home/vllm && \ + chmod g+rwx /home/vllm + +COPY LICENSE /licenses/vllm.md +COPY examples/*.jinja /app/data/template/ + +USER 2000 +WORKDIR /home/vllm + +# Set the default entrypoint +ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"] \ No newline at end of file diff --git a/Dockerfile.tpu b/Dockerfile.tpu index ee0d94d98e82b..50806d8820a30 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -1,4 +1,4 @@ -ARG NIGHTLY_DATE="20250122" +ARG NIGHTLY_DATE="20250124" ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" FROM $BASE_IMAGE @@ -15,11 +15,14 @@ ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi +# Remove existing versions of dependencies +RUN pip uninstall -y torch torch_xla torchvision + ENV VLLM_TARGET_DEVICE="tpu" RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,source=.git,target=.git \ python3 -m pip install \ - -r requirements-tpu.txt + -r requirements/tpu.txt RUN python3 setup.py develop # install development dependencies (for testing) diff --git a/Dockerfile.xpu b/Dockerfile.xpu index a374f20d7d949..ad4abf16b43b6 100644 --- a/Dockerfile.xpu +++ b/Dockerfile.xpu @@ -1,11 +1,7 @@ -FROM 
intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base +# oneapi 2025.0.2 docker base image use rolling 2448 package. https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04, and we don't need install driver manually. +FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base -RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \ - echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \ - chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \ - wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \ - echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \ - chmod 644 /usr/share/keyrings/intel-graphics.gpg +RUN rm /etc/apt/sources.list.d/intel-graphics.list RUN apt-get update -y && \ apt-get install -y --no-install-recommends --fix-missing \ @@ -21,30 +17,20 @@ RUN apt-get update -y && \ python3 \ python3-dev \ python3-pip \ - # vim \ wget WORKDIR /workspace/vllm -COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt -COPY requirements-common.txt /workspace/vllm/requirements-common.txt +COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt +COPY requirements/common.txt /workspace/vllm/requirements/common.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install --no-cache-dir \ - -r requirements-xpu.txt - -RUN git clone https://github.com/intel/pti-gpu && \ - cd pti-gpu/sdk && \ - git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \ - mkdir build && \ - cd build && \ - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \ - make -j && \ - cmake --install . --config Release --prefix "/usr/local" + -r requirements/xpu.txt ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/" COPY . . -ARG GIT_REPO_CHECK +ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi @@ -54,6 +40,12 @@ RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,source=.git,target=.git \ python3 setup.py install +# Please refer xpu doc, we need manually install intel-extension-for-pytorch 2.6.10+xpu due to there are some conflict dependencies with torch 2.6.0+xpu +# FIXME: This will be fix in ipex 2.7. just leave this here for awareness. 
+RUN --mount=type=cache,target=/root/.cache/pip \ + pip install intel-extension-for-pytorch==2.6.10+xpu \ + --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + CMD ["/bin/bash"] FROM vllm-base AS vllm-openai diff --git a/MANIFEST.in b/MANIFEST.in index 82be639ef4d73..82fd22b845f09 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,9 +1,9 @@ include LICENSE -include requirements-common.txt -include requirements-cuda.txt -include requirements-rocm.txt -include requirements-neuron.txt -include requirements-cpu.txt +include requirements/common.txt +include requirements/cuda.txt +include requirements/rocm.txt +include requirements/neuron.txt +include requirements/cpu.txt include CMakeLists.txt recursive-include cmake * diff --git a/README.md b/README.md index 4ed905bf7aa9d..f2da0467e5c34 100644 --- a/README.md +++ b/README.md @@ -10,14 +10,29 @@ Easy, fast, and cheap LLM serving for everyone

-| Documentation | Blog | Paper | Discord | Twitter/X | Developer Slack | +| Documentation | Blog | Paper | Twitter/X | User Forum | Developer Slack |

--- +[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center! + +[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day) + +--- + *Latest News* 🔥 -- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing). + +- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing). +- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0). +- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted. +- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html). +- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing). - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone! + +
+Previous News + - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing). - [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! - [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users! @@ -31,11 +46,14 @@ Easy, fast, and cheap LLM serving for everyone - [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM. - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai). +
+ --- ## About + vLLM is a fast and easy-to-use library for LLM inference and serving. -Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evloved into a community-driven project with contributions from both academia and industry. +Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry. vLLM is fast with: @@ -78,7 +96,7 @@ pip install vllm ``` Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more. -- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html) +- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html) - [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html) - [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html) @@ -126,6 +144,7 @@ We also have an official fundraising venue through [OpenCollective](https://open ## Citation If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180): + ```bibtex @inproceedings{kwon2023efficient, title={Efficient Memory Management for Large Language Model Serving with PagedAttention}, @@ -137,12 +156,12 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs ## Contact Us -* For technical questions and feature requests, please use Github issues or discussions. -* For discussing with fellow users, please use Discord. -* For coordinating contributions and development, please use Slack. -* For security disclosures, please use Github's security advisory feature. -* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu. +- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions) +- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai) +- coordinating contributions and development, please use [Slack](https://slack.vllm.ai) +- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature +- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu) ## Media Kit -* If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit). +- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit). diff --git a/RELEASE.md b/RELEASE.md new file mode 100644 index 0000000000000..7f52707152128 --- /dev/null +++ b/RELEASE.md @@ -0,0 +1,54 @@ +# Releasing vLLM + +vLLM releases offer a reliable version of the code base, packaged into a binary format that can be conveniently accessed via PyPI. These releases also serve as key milestones for the development team to communicate with the community about newly available features, improvements, and upcoming changes that could affect users, including potential breaking changes. + +## Release Versioning + +vLLM uses a “right-shifted” versioning scheme where a new patch release is out every 2 weeks. And patch releases contain features and bug fixes (as opposed to semver where patch release contains only backwards-compatible bug fixes). When critical fixes need to be made, special release post1 is released. 
+ +* _major_ major architectural milestone and when incompatible API changes are made, similar to PyTorch 2.0. +* _minor_ major features +* _patch_ features and backwards-compatible bug fixes +* _post1_ or _patch-1_ backwards-compatible bug fixes, either explicit or implicit post release + +## Release Cadence + +Patch release is released on bi-weekly basis. Post release 1-3 days after patch release and uses same branch as patch release. +Following is the release cadence for year 2025. All future release dates below are tentative. Please note: Post releases are optional. + +| Release Date | Patch release versions | Post Release versions | +| --- | --- | --- | +| Jan 2025 | 0.7.0 | --- | +| Feb 2025 | 0.7.1, 0.7.2, 0.7.3 | --- | +| Mar 2025 | 0.7.4, 0.7.5 | --- | +| Apr 2025 | 0.7.6, 0.7.7 | --- | +| May 2025 | 0.7.8, 0.7.9 | --- | +| Jun 2025 | 0.7.10, 0.7.11 | --- | +| Jul 2025 | 0.7.12, 0.7.13 | --- | +| Aug 2025 | 0.7.14, 0.7.15 | --- | +| Sep 2025 | 0.7.16, 0.7.17 | --- | +| Oct 2025 | 0.7.18, 0.7.19 | --- | +| Nov 2025 | 0.7.20, 0.7.21 | --- | +| Dec 2025 | 0.7.22, 0.7.23 | --- | + +## Release branch + +Each release is built from a dedicated release branch. + +* For _major_, _minor_, _patch_ releases, the release branch cut is performed 1-2 days before release is live. +* For post releases, previously cut release branch is reused +* Release builds are triggered via push to RC tag like vX.Y.Z-rc1 . This enables us to build and test multiple RCs for each release. +* Final tag : vX.Y.Z does not trigger the build but used for Release notes and assets. +* After branch cut is created we monitor the main branch for any reverts and apply these reverts to a release branch. + +## Release Cherry-Pick Criteria + +After branch cut, we approach finalizing the release branch with clear criteria on what cherry picks are allowed in. Note: a cherry pick is a process to land a PR in the release branch after branch cut. These are typically limited to ensure that the team has sufficient time to complete a thorough round of testing on a stable code base. + +* Regression fixes - that address functional/performance regression against the most recent release (e.g. 0.7.0 for 0.7.1 release) +* Critical fixes - critical fixes for severe issue such as silent incorrectness, backwards compatibility, crashes, deadlocks, (large) memory leaks +* Fixes to new features introduced in the most recent release (e.g. 0.7.0 for 0.7.1 release) +* Documentation improvements +* Release branch specific changes (e.g. change version identifiers or CI fixes) + +Please note: **No feature work allowed for cherry picks**. All PRs that are considered for cherry-picks need to be merged on trunk, the only exception are Release branch specific changes. diff --git a/benchmarks/README.md b/benchmarks/README.md index 2aa4a285021f1..d41de1caa04c0 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,19 +1,268 @@ # Benchmarking vLLM -## Downloading the ShareGPT dataset +This README guides you through running benchmark tests with the extensive +datasets supported on vLLM. It’s a living document, updated as new features and datasets +become available. + +## Dataset Overview + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Dataset | Online | Offline | Data Path |
+|---------|--------|---------|-----------|
+| ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
+| BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
+| Sonnet | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
+| Random | ✅ | ✅ | `synthetic` |
+| HuggingFace | 🟡 | 🟡 | Specify your dataset path on HuggingFace |
+| VisionArena | ✅ | ✅ | `lmarena-ai/vision-arena-bench-v0.1` (a HuggingFace dataset) |
+ +✅: supported + +🚧: to be supported + +🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats +similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. +If you need support for other dataset formats, please consider contributing. + +**Note**: VisionArena’s `dataset-name` should be set to `hf` + +--- +## Example - Online Benchmark + +First start serving your model -You can download the dataset by running: ```bash -wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json +MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B" +vllm serve ${MODEL_NAME} --disable-log-requests ``` -## Downloading the ShareGPT4V dataset +Then run the benchmarking script -The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts -will ignore a datapoint if the referred image is missing. ```bash -wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json -mkdir coco -p -wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip -unzip coco/train2017.zip -d coco/ +# download dataset +# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json +MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B" +NUM_PROMPTS=10 +BACKEND="vllm" +DATASET_NAME="sharegpt" +DATASET_PATH="/ShareGPT_V3_unfiltered_cleaned_split.json" +python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS} ``` + +If successful, you will see the following output + +``` +============ Serving Benchmark Result ============ +Successful requests: 10 +Benchmark duration (s): 5.78 +Total input tokens: 1369 +Total generated tokens: 2212 +Request throughput (req/s): 1.73 +Output token throughput (tok/s): 382.89 +Total Token throughput (tok/s): 619.85 +---------------Time to First Token---------------- +Mean TTFT (ms): 71.54 +Median TTFT (ms): 73.88 +P99 TTFT (ms): 79.49 +-----Time per Output Token (excl. 1st token)------ +Mean TPOT (ms): 7.91 +Median TPOT (ms): 7.96 +P99 TPOT (ms): 8.03 +---------------Inter-token Latency---------------- +Mean ITL (ms): 7.74 +Median ITL (ms): 7.70 +P99 ITL (ms): 8.39 +================================================== +``` + +### VisionArena Benchmark for Vision Language Models + +```bash +# need a model with vision capability here +vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests +``` + +```bash +MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct" +NUM_PROMPTS=10 +BACKEND="openai-chat" +DATASET_NAME="hf" +DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1" +DATASET_SPLIT='train' + +python3 vllm/benchmarks/benchmark_serving.py \ + --backend "${BACKEND}" \ + --model "${MODEL_NAME}" \ + --endpoint "/v1/chat/completions" \ + --dataset-name "${DATASET_NAME}" \ + --dataset-path "${DATASET_PATH}" \ + --hf-split "${DATASET_SPLIT}" \ + --num-prompts "${NUM_PROMPTS}" +``` + +### HuggingFaceDataset Examples + +Currently, HuggingFaceDataset only supports dataset formats +similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. If you need support for other dataset +formats, please consider contributing. 
+ +```bash +# need a model with vision capability here +vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests +``` + +**`lmms-lab/LLaVA-OneVision-Data`** + +```bash +MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct" +NUM_PROMPTS=10 +BACKEND="openai-chat" +DATASET_NAME="hf" +DATASET_PATH="lmms-lab/LLaVA-OneVision-Data" +DATASET_SPLIT='train' +DATASET_SUBSET='chart2text(cauldron)' +python3 vllm/benchmarks/benchmark_serving.py \ + --backend "${BACKEND}" \ + --model "${MODEL_NAME}" \ + --endpoint "/v1/chat/completions" \ + --dataset-name "${DATASET_NAME}" \ + --dataset-path "${DATASET_PATH}" \ + --hf-split "${DATASET_SPLIT}" \ + --num-prompts "${NUM_PROMPTS}" \ + --hf-subset "${DATASET_SUBSET}" +``` + +**`Aeala/ShareGPT_Vicuna_unfiltered`** + +```bash +MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct" +NUM_PROMPTS=10 +BACKEND="openai-chat" +DATASET_NAME="hf" +DATASET_PATH="Aeala/ShareGPT_Vicuna_unfiltered" +DATASET_SPLIT='train' +python3 vllm/benchmarks/benchmark_serving.py \ + --backend "${BACKEND}" \ + --model "${MODEL_NAME}" \ + --endpoint "/v1/chat/completions" \ + --dataset-name "${DATASET_NAME}" \ + --dataset-path "${DATASET_PATH}" \ + --hf-split "${DATASET_SPLIT}" \ + --num-prompts "${NUM_PROMPTS}" \ +``` + +--- +## Example - Offline Throughput Benchmark + +```bash +MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B" +NUM_PROMPTS=10 +DATASET_NAME="sonnet" +DATASET_PATH="vllm/benchmarks/sonnet.txt" + +python3 vllm/benchmarks/benchmark_throughput.py \ + --model "${MODEL_NAME}" \ + --dataset-name "${DATASET_NAME}" \ + --dataset-path "${DATASET_PATH}" \ + --num-prompts "${NUM_PROMPTS}" +``` + +If successful, you will see the following output + +``` +Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s +Total num prompt tokens: 5014 +Total num output tokens: 1500 +``` + +### VisionArena Benchmark for Vision Language Models + +``` bash +MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct" +NUM_PROMPTS=10 +DATASET_NAME="hf" +DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1" +DATASET_SPLIT="train" + +python3 vllm/benchmarks/benchmark_throughput.py \ + --model "${MODEL_NAME}" \ + --backend "vllm-chat" \ + --dataset-name "${DATASET_NAME}" \ + --dataset-path "${DATASET_PATH}" \ + --num-prompts "${NUM_PROMPTS}" \ + --hf-split "${DATASET_SPLIT}" +``` + +The `num prompt tokens` now includes image token counts + +``` +Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s +Total num prompt tokens: 14527 +Total num output tokens: 1280 +``` + +### Benchmark with LoRA Adapters + +``` bash +# download dataset +# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json +MODEL_NAME="meta-llama/Llama-2-7b-hf" +BACKEND="vllm" +DATASET_NAME="sharegpt" +DATASET_PATH="/ShareGPT_V3_unfiltered_cleaned_split.json" +NUM_PROMPTS=10 +MAX_LORAS=2 +MAX_LORA_RANK=8 +ENABLE_LORA="--enable-lora" +LORA_PATH="yard1/llama-2-7b-sql-lora-test" + +python3 vllm/benchmarks/benchmark_throughput.py \ + --model "${MODEL_NAME}" \ + --backend "${BACKEND}" \ + --dataset_path "${DATASET_PATH}" \ + --dataset_name "${DATASET_NAME}" \ + --num-prompts "${NUM_PROMPTS}" \ + --max-loras "${MAX_LORAS}" \ + --max-lora-rank "${MAX_LORA_RANK}" \ + ${ENABLE_LORA} \ + --lora-path "${LORA_PATH}" + ``` diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index d098c110cd921..0f13c79ae234b 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -1,10 +1,12 @@ +# SPDX-License-Identifier: Apache-2.0 + 
import json import os import sys import time import traceback from dataclasses import dataclass, field -from typing import List, Optional, Union +from typing import Optional, Union import aiohttp import huggingface_hub.constants @@ -12,6 +14,9 @@ from tqdm.asyncio import tqdm from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast) +# NOTE(simon): do not import vLLM here so the benchmark script +# can run without vLLM installed. + AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) @@ -23,7 +28,6 @@ class RequestFuncInput: output_len: int model: str model_name: Optional[str] = None - best_of: int = 1 logprobs: Optional[int] = None extra_body: Optional[dict] = None multi_modal_content: Optional[dict] = None @@ -37,8 +41,8 @@ class RequestFuncOutput: latency: float = 0.0 output_tokens: int = 0 ttft: float = 0.0 # Time to first token - itl: List[float] = field( - default_factory=list) # List of inter-token latencies + itl: list[float] = field( + default_factory=list) # list of inter-token latencies tpot: float = 0.0 # avg next-token latencies prompt_len: int = 0 error: str = "" @@ -51,15 +55,15 @@ async def async_request_tgi( api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: params = { - "best_of": request_func_input.best_of, "max_new_tokens": request_func_input.output_len, "do_sample": True, "temperature": 0.01, # TGI does not accept 0.0 temperature. "top_p": 0.99, # TGI does not accept 1.0 top_p. "truncate": request_func_input.prompt_len, - # TGI does not accept ignore_eos flag. + "ignore_eos_token": request_func_input.ignore_eos, } payload = { "inputs": request_func_input.prompt, @@ -67,6 +71,10 @@ async def async_request_tgi( } output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len + if request_func_input.ignore_eos: + output.output_tokens = request_func_input.output_len + else: + output.output_tokens = None ttft = 0.0 st = time.perf_counter() @@ -123,8 +131,8 @@ async def async_request_trt_llm( api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: - assert request_func_input.best_of == 1 + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: payload = { "accumulate_tokens": True, "text_input": request_func_input.prompt, @@ -187,8 +195,8 @@ async def async_request_deepspeed_mii( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, ) -> RequestFuncOutput: - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: - assert request_func_input.best_of == 1 + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: payload = { "prompt": request_func_input.prompt, @@ -235,13 +243,13 @@ async def async_request_openai_completions( ("completions", "profile") ), "OpenAI Completions API URL must end with 'completions' or 'profile'." 
- async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: payload = { "model": request_func_input.model_name \ if request_func_input.model_name else request_func_input.model, "prompt": request_func_input.prompt, "temperature": 0.0, - "best_of": request_func_input.best_of, "max_tokens": request_func_input.output_len, "logprobs": request_func_input.logprobs, "stream": True, @@ -330,10 +338,11 @@ async def async_request_openai_chat_completions( ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith( - "chat/completions" + ("chat/completions", "profile") ), "OpenAI Chat Completions API URL must end with 'chat/completions'." - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: content = [{"type": "text", "text": request_func_input.prompt}] if request_func_input.multi_modal_content: content.append(request_func_input.multi_modal_content) @@ -423,12 +432,17 @@ def get_model(pretrained_model_name_or_path: str) -> str: if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': from modelscope import snapshot_download - model_path = snapshot_download( - model_id=pretrained_model_name_or_path, - local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, - ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"]) + from vllm.model_executor.model_loader.weight_utils import get_lock - return model_path + # Use file lock to prevent multiple processes from + # downloading the same model weights at the same time. + with get_lock(pretrained_model_name_or_path): + model_path = snapshot_download( + model_id=pretrained_model_name_or_path, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"]) + + return model_path return pretrained_model_name_or_path diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py new file mode 100644 index 0000000000000..0567875f9862f --- /dev/null +++ b/benchmarks/benchmark_dataset.py @@ -0,0 +1,717 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +This module defines a framework for sampling benchmark requests from various +datasets. Each dataset subclass of BenchmarkDataset must implement sample +generation. Supported dataset types include: + - ShareGPT + - Random (synthetic) + - Sonnet + - BurstGPT + - HuggingFace + - VisionArena + +TODO: Implement CustomDataset to parse a JSON file and convert its contents into +SampleRequest instances, similar to the approach used in ShareGPT. 
+""" + +import base64 +import io +import json +import logging +import random +from abc import ABC, abstractmethod +from collections.abc import Mapping +from dataclasses import dataclass +from functools import cache +from typing import Any, Optional, Union + +import numpy as np +import pandas as pd +from datasets import load_dataset +from PIL import Image +from transformers import PreTrainedTokenizerBase + +from vllm.lora.request import LoRARequest +from vllm.lora.utils import get_adapter_absolute_path +from vllm.multimodal import MultiModalDataDict +from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer + +logger = logging.getLogger(__name__) + +# ----------------------------------------------------------------------------- +# Data Classes +# ----------------------------------------------------------------------------- + + +@dataclass +class SampleRequest: + """ + Represents a single inference request for benchmarking. + """ + + prompt: Union[str, Any] + prompt_len: int + expected_output_len: int + multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None + lora_request: Optional[LoRARequest] = None + + +# ----------------------------------------------------------------------------- +# Benchmark Dataset Base Class +# ----------------------------------------------------------------------------- + + +class BenchmarkDataset(ABC): + DEFAULT_SEED = 0 + + def __init__( + self, + dataset_path: Optional[str] = None, + random_seed: int = DEFAULT_SEED, + ) -> None: + """ + Initialize the BenchmarkDataset with an optional dataset path and random + seed. Args: + dataset_path (Optional[str]): Path to the dataset. If None, it + indicates that a default or random dataset might be used. + random_seed (int): Seed value for reproducible shuffling or + sampling. Defaults to DEFAULT_SEED. + """ + self.dataset_path = dataset_path + # Set the random seed, ensuring that a None value is replaced with the + # default seed. + self.random_seed = (random_seed + if random_seed is not None else self.DEFAULT_SEED) + self.data = None + + def apply_multimodal_chat_transformation( + self, + prompt: str, + mm_content: Optional[MultiModalDataDict] = None) -> list[dict]: + """ + Transform a prompt and optional multimodal content into a chat format. + This method is used for chat models that expect a specific conversation + format. + """ + content = [{"text": prompt, "type": "text"}] + if mm_content is not None: + content.append(mm_content) + return [{"role": "user", "content": content}] + + def load_data(self) -> None: + """ + Load data from the dataset path into self.data. + + This method must be overridden by subclasses since the method to load + data will vary depending on the dataset format and source. + + Raises: + NotImplementedError: If a subclass does not implement this method. + """ + # TODO (jenniferzhao): add support for downloading data + raise NotImplementedError( + "load_data must be implemented in subclasses.") + + def get_random_lora_request( + self, + tokenizer: PreTrainedTokenizerBase, + max_loras: Optional[int] = None, + lora_path: Optional[str] = None, + ) -> tuple[Optional[LoRARequest], AnyTokenizer]: + """ + Optionally select a random LoRA request and return its associated + tokenizer. + + This method is used when LoRA parameters are provided. It randomly + selects a LoRA based on max_loras and retrieves a cached tokenizer for + that LoRA if available. Otherwise, it returns the base tokenizer. 
+ + Args: + tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no + LoRA is selected. max_loras (Optional[int]): The maximum number of + LoRAs available. If None, LoRA is not used. lora_path + (Optional[str]): Path to the LoRA parameters on disk. If None, LoRA + is not used. + + Returns: + tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first + element is a LoRARequest (or None if not applicable) and the second + element is the tokenizer associated with the LoRA request (or the + base tokenizer). + """ + if max_loras is None or lora_path is None: + return None, tokenizer + + # Generate a random LoRA ID in the range [1, max_loras]. + lora_id = random.randint(1, max_loras) + lora_request = LoRARequest( + lora_name=str(lora_id), + lora_int_id=lora_id, + lora_path=lora_path_on_disk(lora_path), + ) + if lora_id not in lora_tokenizer_cache: + lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request) + # Return lora_request and the cached tokenizer if available; otherwise, + # return the base tokenizer + return lora_request, lora_tokenizer_cache[lora_id] or tokenizer + + @abstractmethod + def sample(self, tokenizer: PreTrainedTokenizerBase, + num_requests: int) -> list[SampleRequest]: + """ + Abstract method to generate sample requests from the dataset. + + Subclasses must override this method to implement dataset-specific logic + for generating a list of SampleRequest objects. + + Args: + tokenizer (PreTrainedTokenizerBase): The tokenizer to be used + for processing the dataset's text. + num_requests (int): The number of sample requests to generate. + + Returns: + list[SampleRequest]: A list of sample requests generated from the + dataset. + """ + raise NotImplementedError("sample must be implemented in subclasses.") + + def maybe_oversample_requests(self, requests: list[SampleRequest], + num_requests: int) -> None: + """ + Oversamples the list of requests if its size is less than the desired + number. + + Args: + requests (List[SampleRequest]): The current list of sampled + requests. num_requests (int): The target number of requests. + """ + if len(requests) < num_requests: + random.seed(self.random_seed) + additional = random.choices(requests, + k=num_requests - len(requests)) + requests.extend(additional) + logger.info("Oversampled requests to reach %d total samples.", + num_requests) + + +# ----------------------------------------------------------------------------- +# Utility Functions and Global Caches +# ----------------------------------------------------------------------------- + + +def is_valid_sequence( + prompt_len: int, + output_len: int, + min_len: int = 4, + max_prompt_len: int = 1024, + max_total_len: int = 2048, + skip_min_output_len_check: bool = False, +) -> bool: + """ + Validate a sequence based on prompt and output lengths. + + Default pruning criteria are copied from the original `sample_hf_requests` + and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as + from `sample_requests` in benchmark_throughput.py. 
+ """ + # Check for invalid conditions + prompt_too_short = prompt_len < min_len + output_too_short = (not skip_min_output_len_check) and (output_len + < min_len) + prompt_too_long = prompt_len > max_prompt_len + combined_too_long = (prompt_len + output_len) > max_total_len + + # Return True if none of the invalid conditions are met + return not (prompt_too_short or output_too_short or prompt_too_long + or combined_too_long) + + +@cache +def lora_path_on_disk(lora_path: str) -> str: + return get_adapter_absolute_path(lora_path) + + +# Global cache for LoRA tokenizers. +lora_tokenizer_cache: dict[int, AnyTokenizer] = {} + + +def process_image(image: Any) -> Mapping[str, Any]: + """ + Process a single image input and return a multimedia content dictionary. + + For a PIL.Image.Image input: + - Converts the image to RGB. + - Saves the image as a JPEG in-memory. + - Encodes the JPEG data as a base64 string. + - Returns a dictionary with the image as a base64 data URL. + + For a string input: + - Treats the string as a URL or file path. + - Prepends "file://" if the string doesn't start with "http://" or + "file://". + - Returns a dictionary with the image URL. + + Raises: + ValueError: If the input is neither a PIL.Image.Image nor a string. + """ + if isinstance(image, Image.Image): + image = image.convert("RGB") + with io.BytesIO() as image_data: + image.save(image_data, format="JPEG") + image_base64 = base64.b64encode( + image_data.getvalue()).decode("utf-8") + return { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_base64}" + }, + } + + if isinstance(image, str): + image_url = (image if image.startswith( + ("http://", "file://")) else f"file://{image}") + return {"type": "image_url", "image_url": {"url": image_url}} + + raise ValueError( + f"Invalid image input {image}. Must be a PIL.Image.Image or str.") + + +# ----------------------------------------------------------------------------- +# Random Dataset Implementation (Synthetic Data) +# ----------------------------------------------------------------------------- + + +class RandomDataset(BenchmarkDataset): + # Default values copied from benchmark_serving.py for the random dataset. 
+ DEFAULT_PREFIX_LEN = 0 + DEFAULT_RANGE_RATIO = 1.0 + DEFAULT_INPUT_LEN = 1024 + DEFAULT_OUTPUT_LEN = 128 + + def __init__( + self, + **kwargs, + ) -> None: + super().__init__(**kwargs) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + prefix_len: int = DEFAULT_PREFIX_LEN, + range_ratio: float = DEFAULT_RANGE_RATIO, + input_len: int = DEFAULT_INPUT_LEN, + output_len: int = DEFAULT_OUTPUT_LEN, + **kwargs, + ) -> list[SampleRequest]: + vocab_size = tokenizer.vocab_size + + prefix_token_ids = (np.random.randint( + 0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else []) + + input_low = int(input_len * range_ratio) + output_low = int(output_len * range_ratio) + + input_lens = np.random.randint(input_low, + input_len + 1, + size=num_requests) + output_lens = np.random.randint(output_low, + output_len + 1, + size=num_requests) + offsets = np.random.randint(0, vocab_size, size=num_requests) + + requests = [] + for i in range(num_requests): + inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) % + vocab_size).tolist() + token_sequence = prefix_token_ids + inner_seq + prompt = tokenizer.decode(token_sequence) + total_input_len = prefix_len + int(input_lens[i]) + requests.append( + SampleRequest( + prompt=prompt, + prompt_len=total_input_len, + expected_output_len=int(output_lens[i]), + )) + return requests + + +# ----------------------------------------------------------------------------- +# ShareGPT Dataset Implementation +# ----------------------------------------------------------------------------- + + +class ShareGPTDataset(BenchmarkDataset): + """ + Implements the ShareGPT dataset. Loads data from a JSON file and generates + sample requests based on conversation turns. + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + with open(self.dataset_path, encoding="utf-8") as f: + self.data = json.load(f) + # Filter entries with at least two conversation turns. 
+ self.data = [ + entry for entry in self.data + if "conversations" in entry and len(entry["conversations"]) >= 2 + ] + random.seed(self.random_seed) + random.shuffle(self.data) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + lora_path: Optional[str] = None, + max_loras: Optional[int] = None, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + **kwargs, + ) -> list: + samples: list = [] + for entry in self.data: + if len(samples) >= num_requests: + break + prompt, completion = ( + entry["conversations"][0]["value"], + entry["conversations"][1]["value"], + ) + + lora_request, tokenizer = self.get_random_lora_request( + tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path) + prompt_ids = tokenizer(prompt).input_ids + completion_ids = tokenizer(completion).input_ids + prompt_len = len(prompt_ids) + new_output_len = (len(completion_ids) + if output_len is None else output_len) + if not is_valid_sequence(prompt_len, + new_output_len, + skip_min_output_len_check=output_len + is not None): + continue + if enable_multimodal_chat: + prompt = self.apply_multimodal_chat_transformation( + prompt, None) + samples.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=new_output_len, + lora_request=lora_request, + )) + self.maybe_oversample_requests(samples, num_requests) + return samples + + +# ----------------------------------------------------------------------------- +# Sonnet Dataset Implementation +# ----------------------------------------------------------------------------- + + +class SonnetDataset(BenchmarkDataset): + """ + Simplified implementation of the Sonnet dataset. Loads poem lines from a + text file and generates sample requests. Default values here copied from + `benchmark_serving.py` for the sonnet dataset. + """ + + DEFAULT_PREFIX_LEN = 200 + DEFAULT_INPUT_LEN = 550 + DEFAULT_OUTPUT_LEN = 150 + + def __init__( + self, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if not self.dataset_path: + raise ValueError("dataset_path must be provided.") + with open(self.dataset_path, encoding="utf-8") as f: + self.data = f.readlines() + + def sample( + self, + tokenizer, + num_requests: int, + prefix_len: int = DEFAULT_PREFIX_LEN, + input_len: int = DEFAULT_INPUT_LEN, + output_len: int = DEFAULT_OUTPUT_LEN, + return_prompt_formatted: bool = False, + **kwargs, + ) -> list: + # Calculate average token length for a poem line. + tokenized_lines = [tokenizer(line).input_ids for line in self.data] + avg_len = sum(len(tokens) + for tokens in tokenized_lines) / len(tokenized_lines) + + # Build the base prompt. + base_prompt = "Pick as many lines as you can from these poem lines:\n" + base_msg = [{"role": "user", "content": base_prompt}] + base_fmt = tokenizer.apply_chat_template(base_msg, + add_generation_prompt=True, + tokenize=False) + base_offset = len(tokenizer(base_fmt).input_ids) + if input_len <= base_offset: + raise ValueError( + f"'input_len' must be higher than the base prompt length " + f"({base_offset}).") + + # Determine how many poem lines to use. 
+ num_input_lines = round((input_len - base_offset) / avg_len) + num_prefix_lines = round((prefix_len - base_offset) / avg_len) + prefix_lines = self.data[:num_prefix_lines] + + samples = [] + for _ in range(num_requests): + extra_lines = random.choices(self.data, + k=num_input_lines - num_prefix_lines) + prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}" + msg = [{"role": "user", "content": prompt}] + prompt_formatted = tokenizer.apply_chat_template( + msg, add_generation_prompt=True, tokenize=False) + prompt_len = len(tokenizer(prompt_formatted).input_ids) + samples.append( + SampleRequest( + prompt=prompt_formatted + if return_prompt_formatted else prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + )) + return samples + + +# ----------------------------------------------------------------------------- +# BurstGPT Dataset Implementation +# ----------------------------------------------------------------------------- + + +class BurstGPTDataset(BenchmarkDataset): + """ + Implements the BurstGPT dataset. Loads data from a CSV file and generates + sample requests based on synthetic prompt generation. Only rows with Model + "GPT-4" and positive response tokens are used. + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self, ): + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + df = pd.read_csv(self.dataset_path) + # Filter to keep only GPT-4 rows. + gpt4_df = df[df["Model"] == "GPT-4"] + # Remove failed requests (where Response tokens is 0 or less). + gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0] + # Sample the desired number of rows. + self.data = gpt4_df + + def _sample_loaded_data(self, num_requests: int) -> list: + if num_requests <= len(self.data): + data = self.data.sample(n=num_requests, + random_state=self.random_seed) + else: + data = self.data.sample( + n=num_requests, + random_state=self.random_seed, + replace=True, + ) + # Convert the dataframe to a list of lists. + return data.values.tolist() + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + max_loras: Optional[int] = None, + lora_path: Optional[str] = None, + **kwargs, + ) -> list[SampleRequest]: + samples = [] + data = self._sample_loaded_data(num_requests=num_requests) + for i in range(num_requests): + input_len = int(data[i][2]) + output_len = int(data[i][3]) + lora_req, tokenizer = self.get_random_lora_request( + tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path) + vocab_size = tokenizer.vocab_size + # Generate a synthetic prompt: a list of token IDs computed as (i + + # j) modulo vocab_size. + token_ids = [(i + j) % vocab_size for j in range(input_len)] + prompt = tokenizer.decode(token_ids) + samples.append( + SampleRequest( + prompt=prompt, + prompt_len=input_len, + expected_output_len=output_len, + lora_request=lora_req, + )) + return samples + + +# ----------------------------------------------------------------------------- +# HuggingFace Dataset Implementation +# ----------------------------------------------------------------------------- + + +class HuggingFaceDataset(BenchmarkDataset): + """ + Dataset class for processing a HuggingFace dataset with conversation data + and optional images. 
+ """ + + def __init__( + self, + dataset_split: str, + dataset_subset: Optional[str] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.dataset_split = dataset_split + self.dataset_subset = dataset_subset + + self.load_data() + + def load_data(self) -> None: + if not self.dataset_path: + raise ValueError("dataset_path must be provided for loading data.") + + self.data = load_dataset( + self.dataset_path, + name=self.dataset_subset, + split=self.dataset_split, + streaming=True, + ) + if self.data.features is None or "conversations" \ + not in self.data.features: + raise ValueError( + "HuggingFaceDataset currently only supports datasets with " + "a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. " + "Please consider contributing if you would like to add " + "support for additional dataset formats.") + # Shuffle and filter examples with at least 2 conversations. + self.data = self.data.shuffle(seed=self.random_seed).filter( + lambda x: len(x["conversations"]) >= 2) + + def sample(self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + **kwargs) -> list: + sampled_requests = [] + dynamic_output = output_len is None + + for item in self.data: + if len(sampled_requests) >= num_requests: + break + conv = item["conversations"] + prompt, completion = conv[0]["value"], conv[1]["value"] + + prompt_ids = tokenizer(prompt).input_ids + completion_ids = tokenizer(completion).input_ids + prompt_len = len(prompt_ids) + completion_len = len(completion_ids) + output_len = completion_len if dynamic_output else output_len + assert isinstance(output_len, int) and output_len > 0 + if dynamic_output and not is_valid_sequence( + prompt_len, completion_len): + continue + mm_content = process_image( + item["image"]) if "image" in item else None + if enable_multimodal_chat: + # Note: when chat is enabled the request prompt_len is no longer + # accurate and we will be using request output to count the + # actual prompt len and output len + prompt = self.apply_multimodal_chat_transformation( + prompt, mm_content) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + )) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# Vision Arena Dataset Implementation +# ----------------------------------------------------------------------------- + + +class VisionArenaDataset(HuggingFaceDataset): + """ + Vision Arena Dataset. 
+ """ + + DEFAULT_OUTPUT_LEN = 128 + VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1" + + def __init__( + self, + **kwargs, + ) -> None: + super().__init__(**kwargs) + if self.dataset_path != self.VISION_ARENA_DATASET_PATH: + raise ValueError(f"Only support Vision Arena dataset.\ + This data path {self.dataset_path} is not valid.") + if self.dataset_subset is None and self.dataset_split != "train": + raise ValueError("Dataset split must be 'train'.") + + self.load_data() + + def load_data(self) -> None: + dataset = load_dataset( + self.dataset_path, + name=self.dataset_subset, + split=self.dataset_split, + streaming=True, + ) + self.data = dataset.shuffle(seed=self.random_seed) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + **kwargs, + ) -> list: + output_len = (output_len + if output_len is not None else self.DEFAULT_OUTPUT_LEN) + sampled_requests = [] + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt = item["turns"][0][0]["content"] + mm_content = process_image(item["images"][0]) + prompt_len = len(tokenizer(prompt).input_ids) + if enable_multimodal_chat: + # Note: when chat is enabled the request prompt_len is no longer + # accurate and we will be using request output to count the + # actual prompt len + prompt = self.apply_multimodal_chat_transformation( + prompt, mm_content) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + )) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests diff --git a/benchmarks/benchmark_guided.py b/benchmarks/benchmark_guided.py deleted file mode 100644 index 1a0e62598bfcb..0000000000000 --- a/benchmarks/benchmark_guided.py +++ /dev/null @@ -1,494 +0,0 @@ -"""Benchmark guided decoding throughput.""" -import argparse -import dataclasses -import json -import os -import random -import time -from typing import List - -import datasets -import pandas as pd -import uvloop -from transformers import AutoTokenizer, PreTrainedTokenizerBase - -from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs -from vllm.entrypoints.openai.api_server import ( - build_async_engine_client_from_engine_args) -from vllm.sampling_params import GuidedDecodingParams -from vllm.utils import FlexibleArgumentParser, merge_async_iterators - - -@dataclasses.dataclass -class SampleRequest: - """A class representing a single inference request for benchmarking. - - Attributes: - prompt: The input text prompt for the model. - multi_modal_data: Optional dictionary containing multi-modal data (e.g. - images). - prompt_len: The length of the prompt in tokens. - expected_output_len: The expected length of the output in tokens. - """ - prompt: str - prompt_len: int - expected_output_len: int - schema: dict - structure_type: str = 'json' - completion: str = None - - -def run_vllm(requests: List[SampleRequest], - engine_args: EngineArgs, - n: int, - guided_decoding_rate: float = 1.0, - warmup: bool = False) -> float: - from vllm import LLM, SamplingParams - llm = LLM(**vars(engine_args)) - - # Add the requests to the engine. 
- prompts: List[str] = [] - sampling_params: List[SamplingParams] = [] - # create a list containing random selected true or false - guided_decoding_req_idx = random.sample( - range(len(requests)), int(len(requests) * guided_decoding_rate)) - - if warmup: - print(">>>>> Running warmup prompt, for the first 5") - # We setup the first 5 requests to warmup FSM - # if using xgrammar dataset, we will skip warmup - warmup_requests = requests[:5] - for i, request in enumerate(warmup_requests): - prompts.append(request.prompt) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - guided_decoding=GuidedDecodingParams(json=request.schema) - if guided_decoding_rate > 0 else None, - )) - llm.generate(prompts, sampling_params, use_tqdm=False) - - print(">>>>> Benchmark started...") - prompts = [] - sampling_params = [] - for i, request in enumerate(requests): - prompts.append(request.prompt) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - guided_decoding=GuidedDecodingParams( - **{request.structure_type: request.schema}) - if i in guided_decoding_req_idx else None, - )) - - start = time.perf_counter() - outputs = llm.generate(prompts, sampling_params, use_tqdm=False) - ret = [] - for output, request in zip(outputs, requests): - generated_text = output.outputs[0].text - ret.append({ - "generated": generated_text, - "expected": request.completion - }) - end = time.perf_counter() - return end - start, ret - - -async def run_vllm_async( - requests: List[SampleRequest], - engine_args: AsyncEngineArgs, - n: int, - guided_decoding_rate: float = 1.0, - warmup: bool = False, - disable_frontend_multiprocessing: bool = False) -> float: - from vllm import SamplingParams - - async with build_async_engine_client_from_engine_args( - engine_args, disable_frontend_multiprocessing) as llm: - - # Add the requests to the engine. 
- prompts: List[str] = [] - sampling_params: List[SamplingParams] = [] - guided_decoding_req_idx = random.sample( - range(len(requests)), int(len(requests) * guided_decoding_rate)) - - if warmup: - print(">>>>>> Running warmup prompt, for the first 5") - # We setup the first 5 requests to warmup FSM - # if using xgrammar dataset, we will skip warmup - warmup_requests = requests[:5] - for i, request in enumerate(warmup_requests): - prompts.append(request.prompt) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - guided_decoding=GuidedDecodingParams( - json=request.schema) - if guided_decoding_rate > 0 else None, - )) - generators = [] - for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): - generator = llm.generate(prompt, sp, request_id=f"test{i}") - generators.append(generator) - all_gens = merge_async_iterators(*generators) - async for i, res in all_gens: - pass - - print(">>>>> Benchmark started...") - prompts = [] - sampling_params = [] - for i, request in enumerate(requests): - prompts.append(request.prompt) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - guided_decoding=GuidedDecodingParams(json=request.schema) - if i in guided_decoding_req_idx else None, - )) - - generators = [] - start_time = [] - latencies = [] - start = time.perf_counter() - for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): - generator = llm.generate(prompt, sp, request_id=f"test{i}") - generators.append(generator) - start_time.append(time.perf_counter()) - latencies.append([]) - all_gens = merge_async_iterators(*generators) - generated_texts = [''] * len(requests) - async for i, res in all_gens: - generated_texts[i] = res.outputs[0].text - lat = time.perf_counter() - start_time[i] - latencies[i].append(lat) - ret = [{ - 'generated': gt, - 'expected': req.completion - } for gt, req in zip(generated_texts, requests)] - end = time.perf_counter() - first_latency = pd.Series([lat[0] * 1000 for lat in latencies]) - next_latency = pd.Series([(lat[-1] - lat[0]) / len(lat[1:]) * 1000 - for lat in latencies]) - return end - start, ret, (first_latency, next_latency) - - -def sample_requests(tokenizer: PreTrainedTokenizerBase, - args: argparse.Namespace) -> List[SampleRequest]: - if args.dataset == 'json': - if args.json_schema_path is None: - dir_path = os.path.dirname(os.path.realpath(__file__)) - args.json_schema_path = os.path.join(dir_path, - "structured_schemas", - "structured_schema_1.json") - with open(args.json_schema_path) as f: - schema = json.load(f) - prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501 - input_len = len(tokenizer(prompt).input_ids) - print(f"Input length of the prompt: {input_len} tokens") - requests = [ - SampleRequest(prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=schema, - structure_type=args.structure_type) - for _ in range(args.num_prompts) - ] - - elif args.dataset == "grammar": - schema = """ - ?start: select_statement - - ?select_statement: "SELECT " column_list " FROM " table_name - - ?column_list: column_name ("," column_name)* - - ?table_name: identifier - - ?column_name: identifier - - ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ - """ - prompt = "Generate an SQL query to show the 'username' \ - and 'email' from the 'users' table." 
- - input_len = len(tokenizer(prompt).input_ids) - print(f"Input length of the prompt: {input_len} tokens") - requests = [ - SampleRequest(prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=schema, - structure_type=args.structure_type) - for _ in range(args.num_prompts) - ] - - elif args.dataset == "regex": - regex = r"\w+@\w+\.com\n" - args.regex = regex - prompt = "Generate an email address for Alan Turing, \ - who works in Enigma. End in .com and new line. \ - Example result: alan.turing@enigma.com\n" - - input_len = len(tokenizer(prompt).input_ids) - print(f"Input length of the prompt: {input_len} tokens") - requests = [ - SampleRequest(prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=regex, - structure_type=args.structure_type) - for _ in range(args.num_prompts) - ] - - elif args.dataset == "choice": - choice = ["Positive", "Negative"] - args.choice = choice - prompt = "Classify this sentiment: vLLM is wonderful!" - input_len = len(tokenizer(prompt).input_ids) - print(f"Input length of the prompt: {input_len} tokens") - requests = [ - SampleRequest(prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=choice, - structure_type=args.structure_type) - for _ in range(args.num_prompts) - ] - - elif args.dataset == "xgrammar_bench": - args.warmup = False - requests: List[SampleRequest] = [] - dataset = datasets.load_dataset("NousResearch/json-mode-eval", - split="train") - print(f"dataset has {len(dataset)} entries") - len_dataset = len(dataset) - for data_point_idx in range(args.num_prompts): - idx = data_point_idx - while idx >= len_dataset: - idx -= len_dataset - schema = dataset["schema"][idx] - prompt = tokenizer.apply_chat_template(dataset["prompt"][idx], - tokenize=False) - input_len = len(tokenizer(prompt).input_ids) - completion = dataset["completion"][idx] - - requests.append( - SampleRequest(prompt=prompt, - prompt_len=input_len, - expected_output_len=args.output_len, - schema=schema, - completion=completion)) - - return requests - - -def evaluate(ret, args): - - def _eval_correctness_json(expected, actual): - # extract json string from string using regex - import re - actual = actual.replace('\n', '').replace(' ', '').strip() - try: - actual = re.search(r'\{.*\}', actual).group() - actual = json.loads(actual) - except Exception: - return False - - return True - - def _eval_correctness_choice(expected, actual): - return actual in args.choice - - def _eval_correctness_regex(expected, actual): - import re - return re.match(args.regex, actual) is not None - - def _eval_correctness(expected, actual): - if args.structure_type == 'json': - return _eval_correctness_json(expected, actual) - elif args.structure_type == 'regex': - return _eval_correctness_regex(expected, actual) - elif args.structure_type == 'choice': - return _eval_correctness_choice(expected, actual) - else: - return None - - scores = [] - for res in ret: - score = _eval_correctness(res['expected'], res['generated']) - res['correctness'] = score - scores.append(score) - - not_none_scores = [score for score in scores if score is not None] - - return (sum(not_none_scores) / len(not_none_scores) * - 100) if len(not_none_scores) > 0 else None - - -def main(args: argparse.Namespace): - print(args) - random.seed(args.seed) - - # async engine is working for 'regex', 'choice' and 'grammar' - if args.dataset == 'grammar': - args.structure_type = 'grammar' - args.async_engine = False - elif args.dataset == 'regex': - 
args.structure_type = 'regex' - args.async_engine = False - elif args.dataset == 'choice': - args.structure_type = 'choice' - args.async_engine = False - else: - args.structure_type = 'json' - - if args.no_guided_decoding: - args.guided_decoding_ratio = 0 - if args.save_results: - result_file_name = f'{args.guided_decoding_ratio}guided' - result_file_name += f"_{args.model.split('/')[-1]}" - result_file_name += f"_{args.dataset}" - result_file_name += f"_{args.num_prompts}" - result_file_name += f"_out{args.output_len}" - result_file_name += f"_async{args.async_engine}" - result_file_name += f"_warmup{args.warmup}" - result_file_name += f"_chunkedprefill{args.enable_chunked_prefill}" - result_file_name += ".txt" - else: - result_file_name = None - - # Synthesize a prompt with the given input length. - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer, trust_remote_code=args.trust_remote_code) - requests = sample_requests(tokenizer, args) - - if args.async_engine: - engine_args = AsyncEngineArgs.from_cli_args(args) - elapsed_time, ret, (first_latency, next_latency) = uvloop.run( - run_vllm_async(requests, engine_args, args.n, - args.guided_decoding_ratio, args.warmup, - args.disable_frontend_multiprocessing)) - else: - engine_args = EngineArgs.from_cli_args(args) - elapsed_time, ret = run_vllm(requests, engine_args, args.n, - args.guided_decoding_ratio, args.warmup) - first_latency, next_latency = None, None - - score = evaluate(ret, args) - total_num_tokens = sum(request.prompt_len + request.expected_output_len - for request in requests) - total_output_tokens = sum(request.expected_output_len - for request in requests) - if first_latency is not None: - latency_breakdown = "\nFirst token latency(msecs):\n" - latency_breakdown += f"{first_latency.describe()}" - latency_breakdown += "\nNext token latency(msecs):\n" - latency_breakdown += f"{next_latency.describe()}" - print( - f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " - f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " - f"{total_output_tokens / elapsed_time:.2f} output tokens/s", - f"Correct rate is {score} %", - f"{latency_breakdown if first_latency is not None else ''}") - - # Output JSON results if specified - if args.output_json or result_file_name: - results = { - "elapsed_time": elapsed_time, - "num_requests": len(requests), - "total_num_tokens": total_num_tokens, - "total_output_tokens": total_output_tokens, - "requests_per_second": len(requests) / elapsed_time, - "tokens_per_second": f"{total_num_tokens / elapsed_time:.2f}", - "output_tokens_per_second": - f"{total_output_tokens / elapsed_time:.2f}", - "correct_rate(%)": score - } - results = {"outputs": ret, **results} - if first_latency is not None: - results["first_token_latency(msecs)"] = first_latency.describe( - ).to_dict() - results["next_token_latency(msecs)"] = next_latency.describe( - ).to_dict() - if args.output_json: - with open(args.output_json, "w") as f: - json.dump(results, f, indent=4) - elif result_file_name: - with open(result_file_name, "w") as f: - json.dump(results, f, indent=4) - - -if __name__ == "__main__": - parser = FlexibleArgumentParser(description="Benchmark guided decoding.") - parser = AsyncEngineArgs.add_cli_args(parser) - - parser.add_argument("--output-len", - type=int, - default=512, - help="Output length for each request. 
Overrides the " - "output length from the dataset.") - parser.add_argument( - "--dataset", - default='json', - choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench']) - parser.add_argument("--json_schema_path", - type=str, - default=None, - help="Path to json schema.") - parser.add_argument("--n", - type=int, - default=1, - help="Number of generated sequences per prompt.") - parser.add_argument("--num-prompts", - type=int, - default=10, - help="Number of prompts to process.") - parser.add_argument( - '--output-json', - type=str, - default=None, - help='Path to save the throughput results in JSON format.') - parser.add_argument("--async-engine", - action='store_true', - default=False, - help="Use vLLM async engine rather than LLM class.") - parser.add_argument("--no-guided-decoding", - action='store_true', - default=False, - help="Whether to disable JSON decoding or not.") - parser.add_argument("--guided-decoding-ratio", - type=float, - default=1.0, - help="Ratio of Guided Decoding requests") - parser.add_argument("--disable-frontend-multiprocessing", - action='store_true', - default=False, - help="Disable decoupled async engine frontend.") - parser.add_argument("--warmup", - action="store_true", - default=False, - help="Run warmup prompts before benchmark.") - parser.add_argument("--save-results", - action="store_true", - default=False, - help="save output results.") - args = parser.parse_args() - if args.tokenizer is None: - args.tokenizer = args.model - main(args) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 77c4f6aa927e4..dfd9bb1e6a4d0 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -1,13 +1,17 @@ +# SPDX-License-Identifier: Apache-2.0 """Benchmark the latency of processing a single batch of requests.""" + import argparse import dataclasses import json +import os import time from pathlib import Path -from typing import List, Optional +from typing import Any, Optional import numpy as np import torch +from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from tqdm import tqdm from vllm import LLM, SamplingParams @@ -17,6 +21,18 @@ from vllm.sampling_params import BeamSearchParams from vllm.utils import FlexibleArgumentParser +def save_to_pytorch_benchmark_format(args: argparse.Namespace, + results: dict[str, Any]) -> None: + pt_records = convert_to_pytorch_benchmark_format( + args=args, + metrics={"latency": results["latencies"]}, + extra_info={k: results[k] + for k in ["avg_latency", "percentiles"]}) + if pt_records: + pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" + write_to_json(pt_file, pt_records) + + def main(args: argparse.Namespace): print(args) @@ -25,6 +41,10 @@ def main(args: argparse.Namespace): # NOTE(woosuk): If the request cannot be processed in a single batch, # the engine will automatically process the request in multiple batches. 
llm = LLM(**dataclasses.asdict(engine_args)) + assert llm.llm_engine.model_config.max_model_len >= ( + args.input_len + + args.output_len), ("Please ensure that max_model_len is greater than" + " the sum of input_len and output_len.") sampling_params = SamplingParams( n=args.n, @@ -32,12 +52,13 @@ def main(args: argparse.Namespace): top_p=1.0, ignore_eos=True, max_tokens=args.output_len, + detokenize=not args.disable_detokenize, ) print(sampling_params) dummy_prompt_token_ids = np.random.randint(10000, size=(args.batch_size, args.input_len)) - dummy_prompts: List[PromptType] = [{ + dummy_prompts: list[PromptType] = [{ "prompt_token_ids": batch } for batch in dummy_prompt_token_ids.tolist()] @@ -53,7 +74,8 @@ def main(args: argparse.Namespace): beam_width=args.n, max_tokens=args.output_len, ignore_eos=True, - )) + ), + ) def run_to_completion(profile_dir: Optional[str] = None): if profile_dir: @@ -63,7 +85,8 @@ def main(args: argparse.Namespace): torch.profiler.ProfilerActivity.CUDA, ], on_trace_ready=torch.profiler.tensorboard_trace_handler( - str(profile_dir))) as p: + str(profile_dir)), + ) as p: llm_generate() print(p.key_averages().table(sort_by="self_cuda_time_total")) else: @@ -80,9 +103,8 @@ def main(args: argparse.Namespace): if args.profile: profile_dir = args.profile_result_dir if not profile_dir: - profile_dir = Path( - "." - ) / "vllm_benchmark_result" / f"latency_result_{time.time()}" + profile_dir = (Path(".") / "vllm_benchmark_result" / + f"latency_result_{time.time()}") print(f"Profiling (results will be saved to '{profile_dir}')...") run_to_completion(profile_dir=profile_dir) return @@ -94,9 +116,9 @@ def main(args: argparse.Namespace): latencies = np.array(latencies) percentages = [10, 25, 50, 75, 90, 99] percentiles = np.percentile(latencies, percentages) - print(f'Avg latency: {np.mean(latencies)} seconds') + print(f"Avg latency: {np.mean(latencies)} seconds") for percentage, percentile in zip(percentages, percentiles): - print(f'{percentage}% percentile latency: {percentile} seconds') + print(f"{percentage}% percentile latency: {percentile} seconds") # Output JSON results if specified if args.output_json: @@ -107,43 +129,57 @@ def main(args: argparse.Namespace): } with open(args.output_json, "w") as f: json.dump(results, f, indent=4) + save_to_pytorch_benchmark_format(args, results) -if __name__ == '__main__': +if __name__ == "__main__": parser = FlexibleArgumentParser( - description='Benchmark the latency of processing a single batch of ' - 'requests till completion.') - parser.add_argument('--input-len', type=int, default=32) - parser.add_argument('--output-len', type=int, default=128) - parser.add_argument('--batch-size', type=int, default=8) - parser.add_argument('--n', - type=int, - default=1, - help='Number of generated sequences per prompt.') - parser.add_argument('--use-beam-search', action='store_true') - parser.add_argument('--num-iters-warmup', - type=int, - default=10, - help='Number of iterations to run for warmup.') - parser.add_argument('--num-iters', + description="Benchmark the latency of processing a single batch of " + "requests till completion.") + parser.add_argument("--input-len", type=int, default=32) + parser.add_argument("--output-len", type=int, default=128) + parser.add_argument("--batch-size", type=int, default=8) + parser.add_argument( + "--n", + type=int, + default=1, + help="Number of generated sequences per prompt.", + ) + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument( + "--num-iters-warmup", + 
type=int, + default=10, + help="Number of iterations to run for warmup.", + ) + parser.add_argument("--num-iters", type=int, default=30, - help='Number of iterations to run.') + help="Number of iterations to run.") parser.add_argument( - '--profile', - action='store_true', - help='profile the generation process of a single batch') + "--profile", + action="store_true", + help="profile the generation process of a single batch", + ) parser.add_argument( - '--profile-result-dir', + "--profile-result-dir", type=str, default=None, - help=('path to save the pytorch profiler output. Can be visualized ' - 'with ui.perfetto.dev or Tensorboard.')) + help=("path to save the pytorch profiler output. Can be visualized " + "with ui.perfetto.dev or Tensorboard."), + ) parser.add_argument( - '--output-json', + "--output-json", type=str, default=None, - help='Path to save the latency results in JSON format.') + help="Path to save the latency results in JSON format.", + ) + parser.add_argument( + "--disable-detokenize", + action="store_true", + help=("Do not detokenize responses (i.e. do not include " + "detokenization time in the latency measurement)"), + ) parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py index 0b8fba38156f1..21480578edbd5 100644 --- a/benchmarks/benchmark_long_document_qa_throughput.py +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Offline benchmark to test the long document QA throughput. diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 3ab421a89c935..4fff7a8fc8eda 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Benchmark the efficiency of prefix caching. 
@@ -30,7 +31,7 @@ import dataclasses import json import random import time -from typing import List, Optional, Tuple +from typing import Optional from transformers import PreTrainedTokenizerBase @@ -76,9 +77,9 @@ def sample_requests_from_dataset( dataset_path: str, num_requests: int, tokenizer: PreTrainedTokenizerBase, - input_length_range: Tuple[int, int], + input_length_range: tuple[int, int], fixed_output_len: Optional[int], -) -> List[Request]: +) -> list[Request]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -98,7 +99,7 @@ def sample_requests_from_dataset( assert min_len >= 0 and max_len >= min_len, "input_length_range too small" # Filter out sequences that are too long or too short - filtered_requests: List[Request] = [] + filtered_requests: list[Request] = [] for i in range(len(dataset)): if len(filtered_requests) == num_requests: @@ -121,10 +122,10 @@ def sample_requests_from_dataset( def sample_requests_from_random( num_requests: int, tokenizer: PreTrainedTokenizerBase, - input_length_range: Tuple[int, int], + input_length_range: tuple[int, int], fixed_output_len: Optional[int], prefix_len: int, -) -> List[Request]: +) -> list[Request]: requests = [] prefix_token_ids = sample_tokens(tokenizer, prefix_len) @@ -143,9 +144,9 @@ def sample_requests_from_random( return requests -def repeat_and_sort_requests(requests: List[Request], +def repeat_and_sort_requests(requests: list[Request], repeat_count: int, - sort: bool = False) -> List[str]: + sort: bool = False) -> list[str]: repeated_requests = requests * repeat_count if sort: repeated_requests.sort(key=lambda x: x[1]) @@ -193,7 +194,9 @@ def main(args): llm = LLM(**dataclasses.asdict(engine_args)) - sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) + sampling_params = SamplingParams(temperature=0, + max_tokens=args.output_len, + detokenize=not args.disable_detokenize) print("Testing filtered requests") prompts = repeat_and_sort_requests(filtered_requests, @@ -242,6 +245,12 @@ if __name__ == "__main__": "subtract this length when filtering prompts. Only used " "when dataset-path is not provided.", ) + parser.add_argument( + '--disable-detokenize', + action='store_true', + help=("Do not detokenize responses (i.e. 
do not include " + "detokenization time in the latency measurement)"), + ) parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index e0c9e6a6db502..76fe00ede249b 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -1,10 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 """Benchmark offline prioritization.""" import argparse import dataclasses import json import random import time -from typing import List, Optional, Tuple +from typing import Optional from transformers import AutoTokenizer, PreTrainedTokenizerBase @@ -12,12 +13,17 @@ from vllm.engine.arg_utils import EngineArgs from vllm.utils import FlexibleArgumentParser +#Select a equi-probable random priority +def get_random_flag(): + return 0 if random.random() < 0.5 else 1 + + def sample_requests( dataset_path: str, num_requests: int, tokenizer: PreTrainedTokenizerBase, fixed_output_len: Optional[int], -) -> List[Tuple[str, int, int]]: +) -> list[tuple[str, int, int, int]]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -34,7 +40,7 @@ def sample_requests( random.shuffle(dataset) # Filter out sequences that are too long or too short - filtered_dataset: List[Tuple[str, int, int]] = [] + filtered_dataset: list[tuple[str, int, int]] = [] for i in range(len(dataset)): if len(filtered_dataset) == num_requests: break @@ -54,8 +60,7 @@ def sample_requests( # Prune too long sequences. continue - #Select a equi-probable random priority - priority = 0 if random.random() < 0.5 else 1 + priority = get_random_flag() filtered_dataset.append((prompt, prompt_len, output_len, priority)) @@ -63,13 +68,20 @@ def sample_requests( def run_vllm( - requests: List[Tuple[str, int, int]], + requests: list[tuple[str, int, int]], n: int, engine_args: EngineArgs, + disable_detokenize: bool = False, ) -> float: from vllm import LLM, SamplingParams llm = LLM(**dataclasses.asdict(engine_args)) + assert all( + llm.llm_engine.model_config.max_model_len >= (request[1] + request[2]) + for request in requests), ( + "Please ensure that max_model_len is greater than the sum of" + " input_len and output_len for all requests.") + # Add the requests to the engine. prompts = [] sampling_params = [] @@ -84,6 +96,7 @@ def run_vllm( top_p=1.0, ignore_eos=True, max_tokens=output_len, + detokenize=not disable_detokenize, )) start = time.perf_counter() @@ -102,15 +115,16 @@ def main(args: argparse.Namespace): if args.dataset is None: # Synthesize a prompt with the given input length. prompt = "hi" * (args.input_len - 1) - requests = [(prompt, args.input_len, args.output_len) - for _ in range(args.num_prompts)] + requests = [(prompt, args.input_len, args.output_len, + get_random_flag()) for _ in range(args.num_prompts)] else: requests = sample_requests(args.dataset, args.num_prompts, tokenizer, args.output_len) if args.backend == "vllm": elapsed_time = run_vllm(requests, args.n, - EngineArgs.from_cli_args(args)) + EngineArgs.from_cli_args(args), + args.disable_detokenize) else: raise ValueError(f"Unknown backend: {args.backend}") total_num_tokens = sum(prompt_len + output_len @@ -163,6 +177,12 @@ if __name__ == "__main__": type=str, default=None, help='Path to save the throughput results in JSON format.') + parser.add_argument( + '--disable-detokenize', + action='store_true', + help=("Do not detokenize responses (i.e. 
do not include " + "detokenization time in the latency measurement)"), + ) parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index bc026b0ec1ca6..47627126b6688 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 r"""Benchmark online serving throughput. On the server side, run one of the following commands: @@ -24,23 +25,20 @@ On the client side, run: """ import argparse import asyncio -import base64 import gc -import io import json import os import random import time import warnings +from collections.abc import AsyncGenerator, Iterable from dataclasses import dataclass from datetime import datetime -from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple +from typing import Any, Optional import numpy as np from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, RequestFuncOutput) -from datasets import load_dataset -from PIL.Image import Image from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase @@ -54,6 +52,11 @@ try: except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser +from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset, + RandomDataset, SampleRequest, ShareGPTDataset, + SonnetDataset, VisionArenaDataset) +from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json + MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -69,334 +72,48 @@ class BenchmarkMetrics: mean_ttft_ms: float median_ttft_ms: float std_ttft_ms: float - percentiles_ttft_ms: List[Tuple[float, float]] + percentiles_ttft_ms: list[tuple[float, float]] mean_tpot_ms: float median_tpot_ms: float std_tpot_ms: float - percentiles_tpot_ms: List[Tuple[float, float]] + percentiles_tpot_ms: list[tuple[float, float]] mean_itl_ms: float median_itl_ms: float std_itl_ms: float - percentiles_itl_ms: List[Tuple[float, float]] + percentiles_itl_ms: list[tuple[float, float]] # E2EL stands for end-to-end latency per request. # It is the time taken on the client side from sending # a request to receiving a complete response. mean_e2el_ms: float median_e2el_ms: float std_e2el_ms: float - percentiles_e2el_ms: List[Tuple[float, float]] - - -def sample_sharegpt_requests( - dataset_path: str, - num_requests: int, - tokenizer: PreTrainedTokenizerBase, - fixed_output_len: Optional[int] = None, -) -> List[Tuple[str, int, int, None]]: - # Load the dataset. - with open(dataset_path, encoding='utf-8') as f: - dataset = json.load(f) - # Filter out the conversations with less than 2 turns. - dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Only keep the first two turns of each conversation. - dataset = [(data["conversations"][0]["value"], - data["conversations"][1]["value"]) for data in dataset] - - # Shuffle the dataset. - random.shuffle(dataset) - - # Filter out sequences that are too long or too short - filtered_dataset: List[Tuple[str, int, int]] = [] - for i in range(len(dataset)): - if len(filtered_dataset) == num_requests: - break - - # Tokenize the prompts and completions. 
- prompt = dataset[i][0] - prompt_token_ids = tokenizer(prompt).input_ids - completion = dataset[i][1] - completion_token_ids = tokenizer(completion).input_ids - prompt_len = len(prompt_token_ids) - output_len = len(completion_token_ids - ) if fixed_output_len is None else fixed_output_len - if prompt_len < 4 or (fixed_output_len is None and output_len < 4): - # Prune too short sequences. - continue - if prompt_len > 1024 or prompt_len + output_len > 2048: - # Prune too long sequences. - continue - filtered_dataset.append((prompt, prompt_len, output_len, None)) - - return filtered_dataset - - -def sample_sonnet_requests( - dataset_path: str, - num_requests: int, - input_len: int, - output_len: int, - prefix_len: int, - tokenizer: PreTrainedTokenizerBase, -) -> List[Tuple[str, str, int, int, None]]: - assert ( - input_len > prefix_len - ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'." - - # Load the dataset. - with open(dataset_path, encoding='utf-8') as f: - poem_lines = f.readlines() - - # Tokenize the poem lines. - poem_token_ids = tokenizer(poem_lines).input_ids - average_poem_len = sum( - len(token_ids) for token_ids in poem_token_ids) / len(poem_token_ids) - - # Base prefix for all requests. - base_prompt = "Pick as many lines as you can from these poem lines:\n" - base_message = [{ - "role": "user", - "content": base_prompt, - }] - base_prompt_formatted = tokenizer.apply_chat_template( - base_message, add_generation_prompt=True, tokenize=False) - base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids) - - assert ( - input_len > base_prompt_offset - ), f"Please set 'args.sonnet-input-len' higher than {base_prompt_offset}." - num_input_lines = round( - (input_len - base_prompt_offset) / average_poem_len) - - # First approximately `prefix_len` number of tokens in the - # prompt are fixed poem lines. - assert ( - prefix_len > base_prompt_offset - ), f"Please set 'args.sonnet-prefix-len' higher than {base_prompt_offset}." - - num_prefix_lines = round( - (prefix_len - base_prompt_offset) / average_poem_len) - prefix_lines = poem_lines[:num_prefix_lines] - - # Sample the rest of lines per request. - sampled_requests: List[Tuple[str, int, int]] = [] - for _ in range(num_requests): - num_lines_needed = num_input_lines - num_prefix_lines - sampled_lines = "".join(prefix_lines + - random.choices(poem_lines, k=num_lines_needed)) - - prompt = f"{base_prompt}{sampled_lines}" - message = [ - { - "role": "user", - "content": prompt, - }, - ] - prompt_formatted = tokenizer.apply_chat_template( - message, add_generation_prompt=True, tokenize=False) - prompt_len = len(tokenizer(prompt_formatted).input_ids) - sampled_requests.append( - (prompt, prompt_formatted, prompt_len, output_len, None)) - - return sampled_requests - - -def sample_mmmu_pro_vision_requests( - dataset, - num_requests: int, - tokenizer: PreTrainedTokenizerBase, - fixed_output_len: Optional[int] = None, -) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: - sampled_requests: List[Tuple[str, int, int, Dict[str, - Collection[str]]]] = [] - for data in dataset: - if len(sampled_requests) == num_requests: - break - - # MMMU-Pro vision direct prompt - # Ref: https://github.com/MMMU-Benchmark/MMMU/blob/6ce42f4d8f70c1841c67867152648974415b5cac/mmmu-pro/prompts.yaml#L5 - prompt = ( - "Answer with the option letter from the given choices directly. 
" - "The last line of your response should be of the following " - "format: 'Answer: $LETTER' (without quotes) where LETTER is one of " - "options.") - - prompt_token_ids = tokenizer(prompt).input_ids - if fixed_output_len is None: - # Default max output len is set to 128 - print("--hf-output-len is not provided. Using default value 128.") - fixed_output_len = 128 - - prompt_len = len(prompt_token_ids) - output_len = fixed_output_len - - assert isinstance( - data["image"], - Image), ("Input image format must be `PIL.Image.Image`, " - f"given {type(data['image'])}.") - image: Image = data["image"] - image = image.convert("RGB") - image_data = io.BytesIO() - image.save(image_data, format='JPEG') - image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") - mm_content = { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{image_base64}" - }, - } - - sampled_requests.append((prompt, prompt_len, output_len, mm_content)) - - return sampled_requests - - -def sample_hf_requests( - dataset_path: str, - dataset_subset: str, - dataset_split: str, - num_requests: int, - tokenizer: PreTrainedTokenizerBase, - random_seed: int, - fixed_output_len: Optional[int] = None, -) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: - - # Special case for MMMU-Pro vision dataset - if dataset_path == 'MMMU/MMMU_Pro' and dataset_subset == 'vision': - assert dataset_split == "test" - dataset = load_dataset(dataset_path, - name=dataset_subset, - split=dataset_split, - streaming=True) - assert "image" in dataset.features, ( - "MMMU/MMMU_Pro vision dataset must have 'image' column.") - filter_func = lambda x: isinstance(x["image"], Image) - dataset = dataset.shuffle(seed=random_seed).filter(filter_func) - return sample_mmmu_pro_vision_requests(dataset, num_requests, - tokenizer, fixed_output_len) - - dataset = load_dataset(dataset_path, - name=dataset_subset, - split=dataset_split, - streaming=True) - assert "conversations" in dataset.features, ( - "HF Dataset must have 'conversations' column.") - filter_func = lambda x: len(x["conversations"]) >= 2 - filtered_dataset = dataset.shuffle(seed=random_seed).filter(filter_func) - sampled_requests: List[Tuple[str, int, int, Dict[str, - Collection[str]]]] = [] - for data in filtered_dataset: - if len(sampled_requests) == num_requests: - break - - # Tokenize the prompts and completions. - prompt = data["conversations"][0]["value"] - prompt_token_ids = tokenizer(prompt).input_ids - completion = data["conversations"][1]["value"] - completion_token_ids = tokenizer(completion).input_ids - prompt_len = len(prompt_token_ids) - output_len = len(completion_token_ids - ) if fixed_output_len is None else fixed_output_len - if fixed_output_len is None and (prompt_len < 4 or output_len < 4): - # Prune too short sequences. - continue - if fixed_output_len is None and \ - (prompt_len > 1024 or prompt_len + output_len > 2048): - # Prune too long sequences. 
- continue - - if "image" in data and isinstance(data["image"], Image): - image: Image = data["image"] - image = image.convert("RGB") - image_data = io.BytesIO() - image.save(image_data, format='JPEG') - image_base64 = base64.b64encode( - image_data.getvalue()).decode("utf-8") - mm_content = { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{image_base64}" - }, - } - elif "image" in data and isinstance(data["image"], str): - if (data["image"].startswith("http://") or \ - data["image"].startswith("file://")): - image_url = data["image"] - else: - image_url = f"file://{data['image']}" - - mm_content = { - "type": "image_url", - "image_url": { - "url": image_url - }, - } - else: - mm_content = None - - sampled_requests.append((prompt, prompt_len, output_len, mm_content)) - - return sampled_requests - - -def sample_random_requests( - prefix_len: int, - input_len: int, - output_len: int, - num_prompts: int, - range_ratio: float, - tokenizer: PreTrainedTokenizerBase, -) -> List[Tuple[str, int, int]]: - prefix_token_ids = np.random.randint(0, - tokenizer.vocab_size, - size=prefix_len).tolist() - - input_lens = np.random.randint( - int(input_len * range_ratio), - input_len + 1, - size=num_prompts, - ) - output_lens = np.random.randint( - int(output_len * range_ratio), - output_len + 1, - size=num_prompts, - ) - offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts) - input_requests = [] - for i in range(num_prompts): - prompt = tokenizer.decode(prefix_token_ids + - [(offsets[i] + i + j) % tokenizer.vocab_size - for j in range(input_lens[i])]) - - input_requests.append((prompt, int(prefix_len + input_lens[i]), - int(output_lens[i]), None)) - - return input_requests + percentiles_e2el_ms: list[tuple[float, float]] async def get_request( - input_requests: List[Tuple[str, int, int]], + input_requests: list[SampleRequest], request_rate: float, burstiness: float = 1.0, -) -> AsyncGenerator[Tuple[str, int, int], None]: +) -> AsyncGenerator[SampleRequest, None]: """ - Asynchronously generates requests at a specified rate + Asynchronously generates requests at a specified rate with OPTIONAL burstiness. - + Args: - input_requests: - A list of input requests, each represented as a tuple. - request_rate: + input_requests: + A list of input requests, each represented as a SampleRequest. + request_rate: The rate at which requests are generated (requests/s). - burstiness (optional): - The burstiness factor of the request generation. + burstiness (optional): + The burstiness factor of the request generation. Only takes effect when request_rate is not inf. Default value is 1, which follows a Poisson process. Otherwise, the request intervals follow a gamma distribution. - A lower burstiness value (0 < burstiness < 1) results - in more bursty requests, while a higher burstiness value + A lower burstiness value (0 < burstiness < 1) results + in more bursty requests, while a higher burstiness value (burstiness > 1) results in a more uniform arrival of requests. """ - input_requests = iter(input_requests) + input_requests: Iterable[SampleRequest] = iter(input_requests) # Calculate scale parameter theta to maintain the desired request_rate. 
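    # Sketch of the underlying math (a minimal example, assuming numpy's
    # shape/scale parameterization): with shape k = burstiness and
    # scale theta = 1 / (request_rate * burstiness), the mean interval is
    # k * theta = 1 / request_rate, so the target rate is preserved, while
    # the coefficient of variation 1 / sqrt(k) grows as burstiness shrinks
    # (burstiness == 1 reduces to an exponential / Poisson process), e.g.
    #   interval = np.random.gamma(shape=burstiness,
    #                              scale=1.0 / (request_rate * burstiness))
    #   await asyncio.sleep(interval)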
assert burstiness > 0, ( @@ -418,23 +135,23 @@ async def get_request( def calculate_metrics( - input_requests: List[Tuple[str, int, int]], - outputs: List[RequestFuncOutput], + input_requests: list[SampleRequest], + outputs: list[RequestFuncOutput], dur_s: float, tokenizer: PreTrainedTokenizerBase, - selected_percentile_metrics: List[str], - selected_percentiles: List[float], - goodput_config_dict: Dict[str, float], -) -> Tuple[BenchmarkMetrics, List[int]]: - actual_output_lens: List[int] = [] + selected_percentile_metrics: list[str], + selected_percentiles: list[float], + goodput_config_dict: dict[str, float], +) -> tuple[BenchmarkMetrics, list[int]]: + actual_output_lens: list[int] = [] total_input = 0 completed = 0 good_completed = 0 - itls: List[float] = [] - tpots: List[float] = [] - all_tpots: List[float] = [] - ttfts: List[float] = [] - e2els: List[float] = [] + itls: list[float] = [] + tpots: list[float] = [] + all_tpots: list[float] = [] + ttfts: list[float] = [] + e2els: list[float] = [] for i in range(len(outputs)): if outputs[i].success: output_len = outputs[i].output_tokens @@ -449,7 +166,7 @@ def calculate_metrics( tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids) actual_output_lens.append(output_len) - total_input += input_requests[i][1] + total_input += input_requests[i].prompt_len tpot = 0 if output_len > 1: latency_minus_ttft = outputs[i].latency - outputs[i].ttft @@ -532,18 +249,18 @@ async def benchmark( model_id: str, model_name: str, tokenizer: PreTrainedTokenizerBase, - input_requests: List[Tuple[str, int, int]], + input_requests: list[SampleRequest], logprobs: Optional[int], - best_of: int, request_rate: float, burstiness: float, disable_tqdm: bool, profile: bool, - selected_percentile_metrics: List[str], - selected_percentiles: List[str], + selected_percentile_metrics: list[str], + selected_percentiles: list[float], ignore_eos: bool, - goodput_config_dict: Dict[str, float], + goodput_config_dict: dict[str, float], max_concurrency: Optional[int], + lora_modules: Optional[Iterable[str]], ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -551,12 +268,16 @@ async def benchmark( raise ValueError(f"Unknown backend: {backend}") print("Starting initial single prompt test run...") - test_prompt, test_prompt_len, test_output_len, test_mm_content = ( - input_requests[0]) + test_prompt, test_prompt_len, test_output_len, test_mm_content = \ + input_requests[0].prompt, input_requests[0].prompt_len, \ + input_requests[0].expected_output_len, \ + input_requests[0].multi_modal_data + if backend != "openai-chat" and test_mm_content is not None: # multi-modal benchmark is only available on OpenAI Chat backend. raise ValueError( "Multi-modal content is only supported on 'openai-chat' backend.") + assert test_mm_content is None or isinstance(test_mm_content, dict) test_input = RequestFuncInput( model=model_id, model_name=model_name, @@ -565,10 +286,10 @@ async def benchmark( prompt_len=test_prompt_len, output_len=test_output_len, logprobs=logprobs, - best_of=best_of, multi_modal_content=test_mm_content, ignore_eos=ignore_eos, ) + test_output = await request_func(request_func_input=test_input) if not test_output.success: raise ValueError( @@ -577,6 +298,12 @@ async def benchmark( else: print("Initial test run completed. Starting main benchmark run...") + if lora_modules: + # For each input request, choose a LoRA module at random. 
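        # Illustrative sketch (adapter names assumed): with
        # lora_modules=["sql-lora", "story-lora"] and three requests, the
        # iterator below might yield ["story-lora", "sql-lora", "sql-lora"];
        # each request is later sent with its drawn adapter as both model id
        # and served model name (see req_model_id / req_model_name below),
        # spreading traffic across the adapters roughly evenly at random.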
+ lora_modules = iter( + [random.choice(lora_modules) \ + for _ in range(len(input_requests))]) + if profile: print("Starting profiler...") profile_input = RequestFuncInput(model=model_id, @@ -586,7 +313,6 @@ async def benchmark( prompt_len=test_prompt_len, output_len=test_output_len, logprobs=logprobs, - best_of=best_of, multi_modal_content=test_mm_content, ignore_eos=ignore_eos) profile_output = await request_func(request_func_input=profile_input) @@ -620,24 +346,30 @@ async def benchmark( pbar=pbar) benchmark_start_time = time.perf_counter() - tasks: List[asyncio.Task] = [] + tasks: list[asyncio.Task] = [] async for request in get_request(input_requests, request_rate, burstiness): - prompt, prompt_len, output_len, mm_content = request - request_func_input = RequestFuncInput(model=model_id, - model_name=model_name, + prompt, prompt_len, output_len, mm_content = request.prompt, \ + request.prompt_len, request.expected_output_len, \ + request.multi_modal_data + req_model_id, req_model_name = model_id, model_name + if lora_modules: + req_lora_module = next(lora_modules) + req_model_id, req_model_name = req_lora_module, req_lora_module + + request_func_input = RequestFuncInput(model=req_model_id, + model_name=req_model_name, prompt=prompt, api_url=api_url, prompt_len=prompt_len, output_len=output_len, logprobs=logprobs, - best_of=best_of, multi_modal_content=mm_content, ignore_eos=ignore_eos) tasks.append( asyncio.create_task( limited_request_func(request_func_input=request_func_input, pbar=pbar))) - outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) + outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) if profile: print("Stopping profiler...") @@ -648,7 +380,6 @@ async def benchmark( prompt_len=test_prompt_len, output_len=test_output_len, logprobs=logprobs, - best_of=best_of, ) profile_output = await request_func(request_func_input=profile_input) if profile_output.success: @@ -782,6 +513,31 @@ def parse_goodput(slo_pairs): return goodput_config_dict +def save_to_pytorch_benchmark_format(args: argparse.Namespace, + results: dict[str, Any], + file_name: str) -> None: + metrics = [ + "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms", + "mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms", + "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms" + ] + # These raw data might be useful, but they are rather big. They can be added + # later if needed + ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"] + pt_records = convert_to_pytorch_benchmark_format( + args=args, + metrics={k: [results[k]] + for k in metrics}, + extra_info={ + k: results[k] + for k in results if k not in metrics and k not in ignored_metrics + }) + if pt_records: + # Don't use json suffix here as we don't want CI to pick it up + pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json" + write_to_json(pt_file, pt_records) + + def main(args: argparse.Namespace): print(args) random.seed(args.seed) @@ -804,81 +560,77 @@ def main(args: argparse.Namespace): tokenizer_mode=tokenizer_mode, trust_remote_code=args.trust_remote_code) - if args.dataset is not None: - warnings.warn( - "The '--dataset' argument will be deprecated in the next " - "release. 
Please use '--dataset-name' and " - "'--dataset-path' in the future runs.", - stacklevel=2) - input_requests = sample_sharegpt_requests( - dataset_path=args.dataset, - num_requests=args.num_prompts, - tokenizer=tokenizer, - fixed_output_len=args.sharegpt_output_len, - ) + if args.dataset_name is None: + raise ValueError( + "Please specify '--dataset-name' and the corresponding " + "'--dataset-path' if required.") - elif args.dataset_name == "sharegpt": - input_requests = sample_sharegpt_requests( - dataset_path=args.dataset_path, - num_requests=args.num_prompts, - tokenizer=tokenizer, - fixed_output_len=args.sharegpt_output_len, - ) - - elif args.dataset_name == "sonnet": - # Do not format the prompt, pass to message directly + if args.dataset_name == "sonnet": + dataset = SonnetDataset(dataset_path=args.dataset_path) + # For the "sonnet" dataset, formatting depends on the backend. if args.backend == "openai-chat": - input_requests = sample_sonnet_requests( - dataset_path=args.dataset_path, - num_requests=args.num_prompts, - input_len=args.sonnet_input_len, - output_len=args.sonnet_output_len, - prefix_len=args.sonnet_prefix_len, - tokenizer=tokenizer, - ) - input_requests = [(prompt, prompt_len, output_len, None) - for prompt, prompt_formatted, prompt_len, - output_len, _ in input_requests] + input_requests = dataset.sample(num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + return_prompt_formatted=False) else: - assert ( - tokenizer.chat_template or tokenizer.default_chat_template - ), "Tokenizer/model must have chat template for sonnet dataset." - input_requests = sample_sonnet_requests( - dataset_path=args.dataset_path, - num_requests=args.num_prompts, - input_len=args.sonnet_input_len, - output_len=args.sonnet_output_len, - prefix_len=args.sonnet_prefix_len, - tokenizer=tokenizer, - ) - input_requests = [(prompt_formatted, prompt_len, output_len, None) - for prompt, prompt_formatted, prompt_len, - output_len, _ in input_requests] + assert tokenizer.chat_template or tokenizer.default_chat_template, ( + "Tokenizer/model must have chat template for sonnet dataset.") + input_requests = dataset.sample(num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + return_prompt_formatted=True) elif args.dataset_name == "hf": - input_requests = sample_hf_requests( + # Choose between VisionArenaDataset + # and HuggingFaceDataset based on provided parameters. + dataset_class = (VisionArenaDataset if args.dataset_path + == VisionArenaDataset.VISION_ARENA_DATASET_PATH + and args.hf_subset is None else HuggingFaceDataset) + input_requests = dataset_class( dataset_path=args.dataset_path, dataset_subset=args.hf_subset, dataset_split=args.hf_split, + ).sample( num_requests=args.num_prompts, tokenizer=tokenizer, random_seed=args.seed, - fixed_output_len=args.hf_output_len, - ) - - elif args.dataset_name == "random": - input_requests = sample_random_requests( - prefix_len=args.random_prefix_len, - input_len=args.random_input_len, - output_len=args.random_output_len, - num_prompts=args.num_prompts, - range_ratio=args.random_range_ratio, - tokenizer=tokenizer, + output_len=args.hf_output_len, ) else: - raise ValueError(f"Unknown dataset: {args.dataset_name}") + # For datasets that follow a similar structure, use a mapping. 
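        # (Design note: the lambda values keep construction lazy, so only the
        # entry matching args.dataset_name is evaluated and no other dataset
        # file or HuggingFace repo is loaded.)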
+ dataset_mapping = { + "sharegpt": + lambda: ShareGPTDataset(random_seed=args.seed, + dataset_path=args.dataset_path).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + output_len=args.sharegpt_output_len, + ), + "burstgpt": + lambda: BurstGPTDataset(random_seed=args.seed, + dataset_path=args.dataset_path). + sample(tokenizer=tokenizer, num_requests=args.num_prompts), + "random": + lambda: RandomDataset(dataset_path=args.dataset_path).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + prefix_len=args.random_prefix_len, + input_len=args.random_input_len, + output_len=args.random_output_len, + range_ratio=args.random_range_ratio, + ) + } + try: + input_requests = dataset_mapping[args.dataset_name]() + except KeyError as err: + raise ValueError(f"Unknown dataset: {args.dataset_name}") from err goodput_config_dict = check_goodput_args(args) # Avoid GC processing "static" data - reduce pause times. @@ -895,7 +647,6 @@ def main(args: argparse.Namespace): tokenizer=tokenizer, input_requests=input_requests, logprobs=args.logprobs, - best_of=args.best_of, request_rate=args.request_rate, burstiness=args.burstiness, disable_tqdm=args.disable_tqdm, @@ -907,11 +658,12 @@ def main(args: argparse.Namespace): ignore_eos=args.ignore_eos, goodput_config_dict=goodput_config_dict, max_concurrency=args.max_concurrency, + lora_modules=args.lora_modules, )) # Save config and results to json if args.save_result: - result_json: Dict[str, Any] = {} + result_json: dict[str, Any] = {} # Setup current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") @@ -919,7 +671,6 @@ def main(args: argparse.Namespace): result_json["backend"] = backend result_json["model_id"] = model_id result_json["tokenizer_id"] = tokenizer_id - result_json["best_of"] = args.best_of result_json["num_prompts"] = args.num_prompts # Metadata @@ -933,9 +684,18 @@ def main(args: argparse.Namespace): "Invalid metadata format. Please use KEY=VALUE format." 
)
+        if not args.save_detailed:
+            # Remove fields with too many data points
+            for field in [
+                    "input_lens", "output_lens", "ttfts", "itls",
+                    "generated_texts", "errors"
+            ]:
+                if field in result_json:
+                    del result_json[field]
+
        # Traffic
-        result_json["request_rate"] = (
-            args.request_rate if args.request_rate < float("inf") else "inf")
+        result_json["request_rate"] = (args.request_rate if args.request_rate
+                                       < float("inf") else "inf")
        result_json["burstiness"] = args.burstiness
        result_json["max_concurrency"] = args.max_concurrency
@@ -953,6 +713,7 @@ def main(args: argparse.Namespace):
        file_name = os.path.join(args.result_dir, file_name)
        with open(file_name, "w", encoding='utf-8') as outfile:
            json.dump(result_json, outfile)
+        save_to_pytorch_benchmark_format(args, result_json, file_name)
if __name__ == "__main__":
@@ -970,7 +731,8 @@ if __name__ == "__main__":
        default=None,
        help="Server or API base url if not using http host and port.",
    )
-    parser.add_argument("--host", type=str, default="localhost")
+    # Use 127.0.0.1 here instead of localhost to force the use of ipv4
+    parser.add_argument("--host", type=str, default="127.0.0.1")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument(
        "--endpoint",
@@ -978,18 +740,11 @@ if __name__ == "__main__":
        default="/v1/completions",
        help="API endpoint.",
    )
-    parser.add_argument(
-        "--dataset",
-        type=str,
-        default=None,
-        help="Path to the ShareGPT dataset, will be deprecated in the "
-        "next release.",
-    )
    parser.add_argument(
        "--dataset-name",
        type=str,
        default="sharegpt",
-        choices=["sharegpt", "sonnet", "random", "hf"],
+        choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
        help="Name of the dataset to benchmark on.",
    )
    parser.add_argument("--dataset-path",
@@ -1022,13 +777,6 @@ if __name__ == "__main__":
        help=
        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
-    parser.add_argument(
-        "--best-of",
-        type=int,
-        default=1,
-        help="Generates `best_of` sequences per prompt and "
-        "returns the best one.",
-    )
    parser.add_argument("--use-beam-search", action="store_true")
    parser.add_argument(
        "--num-prompts",
        type=int,
@@ -1089,6 +837,12 @@ if __name__ == "__main__":
        action="store_true",
        help="Specify to save benchmark results to a json file",
    )
+    parser.add_argument(
+        "--save-detailed",
+        action="store_true",
+        help="When saving the results, whether to include per-request "
+        "information such as response, error, ttfts, tpots, etc.",
+    )
    parser.add_argument(
        "--metadata",
        metavar="KEY=VALUE",
@@ -1231,11 +985,12 @@ if __name__ == "__main__":
        '--tokenizer-mode',
        type=str,
        default="auto",
-        choices=['auto', 'slow', 'mistral'],
+        choices=['auto', 'slow', 'mistral', 'custom'],
        help='The tokenizer mode.\n\n* "auto" will use the '
        'fast tokenizer if available.\n* "slow" will '
        'always use the slow tokenizer. \n* '
-        '"mistral" will always use the `mistral_common` tokenizer.')
+        '"mistral" will always use the `mistral_common` tokenizer. \n* '
+        '"custom" will use --tokenizer to select the preregistered tokenizer.')
    parser.add_argument("--served-model-name",
        type=str,
@@ -1244,5 +999,13 @@ if __name__ == "__main__":
        "If not specified, the model name will be the "
        "same as the ``--model`` argument. ")
+    parser.add_argument("--lora-modules",
+                        nargs='+',
+                        default=None,
+                        help="A subset of LoRA module names passed in when 
For each request, the " + "script chooses a LoRA module at random.") + args = parser.parse_args() + main(args) diff --git a/benchmarks/benchmark_serving_guided.py b/benchmarks/benchmark_serving_structured_output.py similarity index 76% rename from benchmarks/benchmark_serving_guided.py rename to benchmarks/benchmark_serving_structured_output.py index 4435d87e18a8a..c79a93faff197 100644 --- a/benchmarks/benchmark_serving_guided.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -1,4 +1,5 @@ -r"""Benchmark online serving throughput with guided decoding. +# SPDX-License-Identifier: Apache-2.0 +r"""Benchmark online serving throughput with structured outputs. On the server side, run one of the following commands: (vLLM OpenAI API server) @@ -8,12 +9,12 @@ On the server side, run one of the following commands: ./launch_tgi_server.sh On the client side, run: - python benchmarks/benchmark_serving.py \ + python benchmarks/benchmark_serving_structured_output.py \ --backend \ --model \ --dataset json \ - --guided-decoding-ratio 1.0 \ - --guided-decoding-backend xgrammar \ + --structured-output-ratio 1.0 \ + --structured-output-backend xgrammar \ --request-rate 10 \ --num-prompts 1000 @@ -23,14 +24,17 @@ On the client side, run: """ import argparse import asyncio +import copy import dataclasses import json import os import random import time +import uuid import warnings +from collections.abc import AsyncGenerator from dataclasses import dataclass -from typing import AsyncGenerator, List, Optional, Tuple +from typing import Optional import datasets import numpy as np @@ -50,6 +54,9 @@ try: except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser +from vllm.v1.structured_output.utils import ( + has_xgrammar_unsupported_json_features) + MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -65,22 +72,22 @@ class BenchmarkMetrics: mean_ttft_ms: float median_ttft_ms: float std_ttft_ms: float - percentiles_ttft_ms: List[Tuple[float, float]] + percentiles_ttft_ms: list[tuple[float, float]] mean_tpot_ms: float median_tpot_ms: float std_tpot_ms: float - percentiles_tpot_ms: List[Tuple[float, float]] + percentiles_tpot_ms: list[tuple[float, float]] mean_itl_ms: float median_itl_ms: float std_itl_ms: float - percentiles_itl_ms: List[Tuple[float, float]] + percentiles_itl_ms: list[tuple[float, float]] # E2EL stands for end-to-end latency per request. # It is the time taken on the client side from sending # a request to receiving a complete response. 
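    # Like the TTFT/TPOT/ITL statistics above, the E2EL statistics below are
    # reported in milliseconds.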
mean_e2el_ms: float median_e2el_ms: float std_e2el_ms: float - percentiles_e2el_ms: List[Tuple[float, float]] + percentiles_e2el_ms: list[tuple[float, float]] @dataclasses.dataclass @@ -103,25 +110,44 @@ class SampleRequest: def sample_requests(tokenizer: PreTrainedTokenizerBase, - args: argparse.Namespace) -> List[SampleRequest]: - if args.dataset == 'json': + args: argparse.Namespace) -> list[SampleRequest]: + if args.dataset == 'json' or args.dataset == 'json-unique': if args.json_schema_path is None: dir_path = os.path.dirname(os.path.realpath(__file__)) args.json_schema_path = os.path.join(dir_path, "structured_schemas", "structured_schema_1.json") + json_schemas = [] with open(args.json_schema_path) as f: schema = json.load(f) - prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501 - input_len = len(tokenizer(prompt).input_ids) - print(f"Input length of the prompt: {input_len} tokens") + + if args.dataset == 'json-unique': + json_schemas = [ + copy.deepcopy(schema) for _ in range(args.num_prompts) + ] + for i in range(len(json_schemas)): + json_schemas[i]["properties"][ + f"__optional_field_{uuid.uuid4()}"] = { + "type": + "string", + "description": + "An unique optional field to avoid cached schemas" + } + + def gen_prompt(index: int): + schema = json_schemas[index % len(json_schemas)] + return f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501 + + def get_schema(index: int): + return json_schemas[index % len(json_schemas)] + requests = [ - SampleRequest(prompt=prompt, - prompt_len=input_len, + SampleRequest(prompt=gen_prompt(i), + prompt_len=len(tokenizer(gen_prompt(i)).input_ids), expected_output_len=args.output_len, - schema=schema, + schema=get_schema(i), structure_type=args.structure_type) - for _ in range(args.num_prompts) + for i in range(args.num_prompts) ] elif args.dataset == "grammar": @@ -186,10 +212,20 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, ] elif args.dataset == "xgrammar_bench": - requests: List[SampleRequest] = [] + requests: list[SampleRequest] = [] dataset = datasets.load_dataset("NousResearch/json-mode-eval", split="train") - print(f"dataset has {len(dataset)} entries") + full_dataset_len = len(dataset) + + def _filter_func(item): + import json + schema = json.loads(item["schema"]) + return not has_xgrammar_unsupported_json_features(schema) + + dataset = dataset.filter(_filter_func) + num_filtered_out = full_dataset_len - len(dataset) + print(f"dataset has {len(dataset)} entries after filtering " + f"out {num_filtered_out} entries with unsupported features") len_dataset = len(dataset) for data_point_idx in range(args.num_prompts): idx = data_point_idx @@ -213,26 +249,26 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, async def get_request( - input_requests: List[SampleRequest], + input_requests: list[SampleRequest], request_rate: float, burstiness: float = 1.0, -) -> AsyncGenerator[Tuple[int, SampleRequest], None]: +) -> AsyncGenerator[tuple[int, SampleRequest], None]: """ - Asynchronously generates requests at a specified rate + Asynchronously generates requests at a specified rate with OPTIONAL burstiness. - + Args: - input_requests: + input_requests: A list of input requests, each represented as a tuple. - request_rate: + request_rate: The rate at which requests are generated (requests/s). - burstiness (optional): - The burstiness factor of the request generation. 
+ burstiness (optional): + The burstiness factor of the request generation. Only takes effect when request_rate is not inf. Default value is 1, which follows a Poisson process. Otherwise, the request intervals follow a gamma distribution. - A lower burstiness value (0 < burstiness < 1) results - in more bursty requests, while a higher burstiness value + A lower burstiness value (0 < burstiness < 1) results + in more bursty requests, while a higher burstiness value (burstiness > 1) results in a more uniform arrival of requests. """ input_requests = iter(input_requests) @@ -257,22 +293,23 @@ async def get_request( def calculate_metrics( - input_requests: List[Tuple[str, int, int]], - outputs: List[RequestFuncOutput], + input_requests: list[tuple[str, int, int]], + outputs: list[RequestFuncOutput], dur_s: float, tokenizer: PreTrainedTokenizerBase, - selected_percentile_metrics: List[str], - selected_percentiles: List[float], -) -> Tuple[BenchmarkMetrics, List[int]]: - actual_output_lens: List[int] = [] + selected_percentile_metrics: list[str], + selected_percentiles: list[float], + goodput_config_dict: Optional[dict[str, float]] = None, +) -> tuple[BenchmarkMetrics, list[int]]: + actual_output_lens: list[int] = [] total_input = 0 completed = 0 good_completed = 0 - itls: List[float] = [] - tpots: List[float] = [] - all_tpots: List[float] = [] - ttfts: List[float] = [] - e2els: List[float] = [] + itls: list[float] = [] + tpots: list[float] = [] + all_tpots: list[float] = [] + ttfts: list[float] = [] + e2els: list[float] = [] for i in range(len(outputs)): if outputs[i].success: # We use the tokenizer to count the number of output tokens for all @@ -286,10 +323,10 @@ def calculate_metrics( total_input += input_requests[i].prompt_len tpot = 0 if output_len > 1: - tpot = (outputs[i].latency - outputs[i].ttft) / (output_len - - 1) + latency_minus_ttft = outputs[i].latency - outputs[i].ttft + tpot = latency_minus_ttft / (output_len - 1) tpots.append(tpot) - outputs[i].tpot = sum(tpots) / len(tpots) if len(tpots) else 0 + outputs[i].tpot = tpot # Note: if output_len <= 1, we regard tpot as 0 for goodput all_tpots.append(tpot) itls += outputs[i].itl @@ -299,6 +336,28 @@ def calculate_metrics( else: actual_output_lens.append(0) + if goodput_config_dict: + valid_metrics = [] + slo_values = [] + + if "ttft" in goodput_config_dict: + valid_metrics.append(ttfts) + slo_values.append(goodput_config_dict["ttft"] / + MILLISECONDS_TO_SECONDS_CONVERSION) + if "tpot" in goodput_config_dict: + valid_metrics.append(all_tpots) + slo_values.append(goodput_config_dict["tpot"] / + MILLISECONDS_TO_SECONDS_CONVERSION) + if "e2el" in goodput_config_dict: + valid_metrics.append(e2els) + slo_values.append(goodput_config_dict["e2el"] / + MILLISECONDS_TO_SECONDS_CONVERSION) + + for req_metric in zip(*valid_metrics): + is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) + if is_good_req: + good_completed += 1 + if completed == 0: warnings.warn( "All requests failed. 
This is likely due to a misconfiguration " @@ -344,17 +403,18 @@ async def benchmark( base_url: str, model_id: str, tokenizer: PreTrainedTokenizerBase, - input_requests: List[SampleRequest], + input_requests: list[SampleRequest], request_rate: float, burstiness: float, disable_tqdm: bool, profile: bool, - selected_percentile_metrics: List[str], - selected_percentiles: List[str], + selected_percentile_metrics: list[str], + selected_percentiles: list[str], ignore_eos: bool, max_concurrency: Optional[int], - guided_decoding_ratio: float, - guided_decoding_backend: str, + structured_output_ratio: float, + structured_output_backend: str, + goodput_config_dict: Optional[dict[str, float]] = None, ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -365,16 +425,18 @@ async def benchmark( extra_body = {} # Add the schema to the extra_body extra_body[request.structure_type] = request.schema - # Add the specific guided_decoding_backend - extra_body["guided_decoding_backend"] = guided_decoding_backend + # Add the specific structured_output_backend + extra_body["guided_decoding_backend"] = structured_output_backend return extra_body print("Starting initial single prompt test run...") - guided_decoding_req_idx = random.sample( + structured_output_req_idx = random.sample( range(len(input_requests)), - int(len(input_requests) * guided_decoding_ratio)) + int(len(input_requests) * structured_output_ratio)) test_request = input_requests[0] + test_req_extra_body = (prepare_extra_body(test_request) + if 0 in structured_output_req_idx else None) test_input = RequestFuncInput( model=model_id, prompt=test_request.prompt, @@ -382,7 +444,7 @@ async def benchmark( prompt_len=test_request.prompt_len, output_len=test_request.expected_output_len, ignore_eos=ignore_eos, - extra_body=prepare_extra_body(test_request), + extra_body=test_req_extra_body, ) test_output = await request_func(request_func_input=test_input) if not test_output.success: @@ -401,7 +463,7 @@ async def benchmark( prompt_len=test_request.prompt_len, output_len=test_request.expected_output_len, ignore_eos=ignore_eos, - extra_body=prepare_extra_body(test_request), + extra_body=test_req_extra_body, ) profile_output = await request_func(request_func_input=profile_input) if profile_output.success: @@ -434,12 +496,12 @@ async def benchmark( pbar=pbar) benchmark_start_time = time.perf_counter() - tasks: List[asyncio.Task] = [] - expected: List[str] = [] + tasks: list[asyncio.Task] = [] + expected: list[str] = [] async for i, request in get_request(input_requests, request_rate, burstiness): extra_body = prepare_extra_body( - request) if i in guided_decoding_req_idx else None + request) if i in structured_output_req_idx else None request_func_input = RequestFuncInput( model=model_id, prompt=request.prompt, @@ -454,7 +516,7 @@ async def benchmark( asyncio.create_task( limited_request_func(request_func_input=request_func_input, pbar=pbar))) - outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) + outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) if profile: print("Stopping profiler...") @@ -482,6 +544,7 @@ async def benchmark( tokenizer=tokenizer, selected_percentile_metrics=selected_percentile_metrics, selected_percentiles=selected_percentiles, + goodput_config_dict=goodput_config_dict, ) print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) @@ -493,6 +556,9 @@ async def benchmark( metrics.total_output)) print("{:<40} {:<10.2f}".format("Request throughput (req/s):", 
metrics.request_throughput)) + if goodput_config_dict: + print("{:<40} {:<10.2f}".format("Request goodput (req/s):", + metrics.request_goodput)) print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", metrics.output_throughput)) print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", @@ -616,6 +682,40 @@ def evaluate(ret, args): 100) if len(not_none_scores) > 0 else None +def parse_goodput(slo_pairs): + goodput_config_dict = {} + try: + for slo_pair in slo_pairs: + slo_name, slo_val = slo_pair.split(":") + goodput_config_dict[slo_name] = float(slo_val) + except ValueError as err: + raise argparse.ArgumentTypeError( + "Invalid format found for service level objectives. " + "Specify service level objectives for goodput as \"KEY:VALUE\" " + "pairs, where the key is a metric name, and the value is a " + "number in milliseconds.") from err + return goodput_config_dict + + +def check_goodput_args(args): + goodput_config_dict = {} + VALID_NAMES = ["ttft", "tpot", "e2el"] + if args.goodput: + goodput_config_dict = parse_goodput(args.goodput) + for slo_name, slo_val in goodput_config_dict.items(): + if slo_name not in VALID_NAMES: + raise ValueError( + f"Invalid metric name found, {slo_name}: {slo_val}. " + "The service level objective name should be one of " + f"{str(VALID_NAMES)}. ") + if slo_val < 0: + raise ValueError( + f"Invalid value found, {slo_name}: {slo_val}. " + "The service level objective value should be " + "non-negative.") + return goodput_config_dict + + def main(args: argparse.Namespace): print(args) random.seed(args.seed) @@ -632,8 +732,11 @@ def main(args: argparse.Namespace): api_url = f"http://{args.host}:{args.port}{args.endpoint}" base_url = f"http://{args.host}:{args.port}" - tokenizer = get_tokenizer(tokenizer_id, - trust_remote_code=args.trust_remote_code) + tokenizer = get_tokenizer( + tokenizer_id, + trust_remote_code=args.trust_remote_code, + tokenizer_mode=args.tokenizer_mode, + ) if args.dataset == 'grammar': args.structure_type = 'guided_grammar' @@ -644,10 +747,10 @@ def main(args: argparse.Namespace): else: args.structure_type = 'guided_json' - if args.no_guided_decoding: - args.guided_decoding_ratio = 0 + if args.no_structured_output: + args.structured_output_ratio = 0 if args.save_results: - result_file_name = f'{args.guided_decoding_ratio}guided' + result_file_name = f'{args.structured_output_ratio}guided' result_file_name += f"_{backend}" result_file_name += f"_{args.request_rate}qps" result_file_name += f"_{args.model.split('/')[-1]}" @@ -660,6 +763,8 @@ def main(args: argparse.Namespace): input_requests = sample_requests(tokenizer, args) + goodput_config_dict = check_goodput_args(args) + benchmark_result, ret = asyncio.run( benchmark( backend=backend, @@ -678,8 +783,9 @@ def main(args: argparse.Namespace): ], ignore_eos=args.ignore_eos, max_concurrency=args.max_concurrency, - guided_decoding_ratio=args.guided_decoding_ratio, - guided_decoding_backend=args.guided_decoding_backend, + structured_output_ratio=args.structured_output_ratio, + structured_output_backend=args.structured_output_backend, + goodput_config_dict=goodput_config_dict, )) # Save config and results to json @@ -730,7 +836,8 @@ if __name__ == "__main__": default=None, help="Server or API base url if not using http host and port.", ) - parser.add_argument("--host", type=str, default="localhost") + # Use 127.0.0.1 here instead of localhost to force the use of ipv4 + parser.add_argument("--host", type=str, default="127.0.0.1") parser.add_argument("--port", type=int, 
default=8000)
    parser.add_argument(
        "--endpoint",
@@ -738,10 +845,12 @@ if __name__ == "__main__":
        default="/v1/completions",
        help="API endpoint.",
    )
-    parser.add_argument(
-        "--dataset",
-        default='json',
-        choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench'])
+    parser.add_argument("--dataset",
+                        default='json',
+                        choices=[
+                            'json', 'json-unique', 'grammar', 'regex',
+                            'choice', 'xgrammar_bench'
+                        ])
    parser.add_argument("--json_schema_path",
                        type=str,
                        default=None,
@@ -770,6 +879,13 @@ if __name__ == "__main__":
        help=
        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
+    parser.add_argument(
+        "--tokenizer-mode",
+        type=str,
+        default="auto",
+        help="The tokenizer mode: 'auto', 'slow', 'mistral' or 'custom'.",
+    )
    parser.add_argument(
        "--num-prompts",
        type=int,
@@ -863,19 +979,32 @@ if __name__ == "__main__":
        "Default value is \"99\". "
        "Use \"--percentile-metrics\" to select metrics.",
    )
-    parser.add_argument("--no-guided-decoding",
+    parser.add_argument(
+        "--goodput",
+        nargs="+",
+        required=False,
+        help="Specify service level objectives for goodput as \"KEY:VALUE\" "
+        "pairs, where the key is a metric name, and the value is in "
+        "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
+        "separated by spaces. Allowed request level metric names are "
+        "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
+        "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
+        "and the blog: https://hao-ai-lab.github.io/blogs/distserve")
+
+    parser.add_argument("--no-structured-output",
                        action='store_true',
                        default=False,
                        help="Whether to disable JSON decoding or not.")
-    parser.add_argument("--guided-decoding-ratio",
+    parser.add_argument("--structured-output-ratio",
                        type=float,
                        default=1.0,
-                        help="Ratio of Guided Decoding requests")
-    parser.add_argument("--guided-decoding-backend",
-                        type=str,
-                        choices=["outlines", "lm-format-enforcer", "xgrammar"],
-                        default="xgrammar",
-                        help="Backend to use for guided decoding")
+                        help="Ratio of Structured Outputs requests")
+    parser.add_argument(
+        "--structured-output-backend",
+        type=str,
+        choices=["outlines", "lm-format-enforcer", "xgrammar", "guidance"],
+        default="xgrammar",
+        help="Backend to use for structured outputs")
    args = parser.parse_args()
    main(args)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index c1b10b3cf8f58..53869db478c51 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -1,15 +1,20 @@
+# SPDX-License-Identifier: Apache-2.0
"""Benchmark offline inference throughput."""
import argparse
import dataclasses
import json
+import os
import random
import time
-from functools import cache
-from typing import Dict, List, Optional, Tuple
+import warnings
+from typing import Any, Optional, Union
import torch
import uvloop
-from PIL import Image
+from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
+                               RandomDataset, SampleRequest, ShareGPTDataset,
+                               SonnetDataset, VisionArenaDataset)
+from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          PreTrainedTokenizerBase)
@@ -17,163 +22,35 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer,
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args)
-from vllm.inputs import TextPrompt
+from 
vllm.inputs import TextPrompt, TokensPrompt from vllm.lora.request import LoRARequest -from vllm.lora.utils import get_adapter_absolute_path -from vllm.multimodal import MultiModalDataDict +from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams -from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer from vllm.utils import FlexibleArgumentParser, merge_async_iterators -@dataclasses.dataclass -class SampleRequest: - """A class representing a single inference request for benchmarking. - - Attributes: - prompt: The input text prompt for the model. - prompt_len: The length of the prompt in tokens. - expected_output_len: The expected length of the output in tokens. - multi_modal_data: Optional dictionary containing multi-modal data (e.g. - images). - lora_request: Optional LoRARequest specifying the LoRA to use. - """ - prompt: str - prompt_len: int - expected_output_len: int - multi_modal_data: Optional[MultiModalDataDict] = None - lora_request: Optional[LoRARequest] = None - - -def _get_prompt_for_image_model(question: str, *, model: str) -> str: - """Prepend and append special tokens around the question to form a prompt. - - Args: - question: The input question text to wrap with special tokens - model: The name of the model being used, to determine which special - tokens to add - - Returns: - The formatted prompt string with appropriate special tokens for the - model - - Raises: - ValueError: If an unsupported model name is provided - """ - model = model.lower() - if "pixtral" in model: - return f"[INST]{question}\n[IMG][/INST]" - raise ValueError(f"Unsupported model {model}") - - -@cache -def lora_path_on_disk(lora_path: str) -> str: - return get_adapter_absolute_path(lora_path) - - -lora_tokenizer_cache: Dict[int, AnyTokenizer] = {} - - -def get_random_lora_request( - args: argparse.Namespace -) -> Tuple[LoRARequest, Optional[AnyTokenizer]]: - global lora_tokenizer_cache - lora_id = random.randint(1, args.max_loras) - lora_request = LoRARequest(lora_name=str(lora_id), - lora_int_id=lora_id, - lora_path=lora_path_on_disk(args.lora_path)) - if lora_id not in lora_tokenizer_cache: - lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request) - return lora_request, lora_tokenizer_cache[lora_id] - - -def sample_requests(tokenizer: PreTrainedTokenizerBase, - args: argparse.Namespace) -> List[SampleRequest]: - - dataset_path: str = args.dataset - num_requests: int = args.num_prompts - fixed_output_len: Optional[int] = args.output_len - model: str = args.model - if fixed_output_len is not None and fixed_output_len < 4: - raise ValueError("output_len too small") - - # Load the dataset. - with open(dataset_path) as f: - dataset = json.load(f) - # Filter out the conversations with less than 2 turns. - dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Shuffle the dataset. - random.shuffle(dataset) - - # Filter out sequences that are too long or too short - filtered_dataset: List[SampleRequest] = [] - for data in tqdm(dataset, - total=len(filtered_dataset), - desc="sampling requests"): - if len(filtered_dataset) == num_requests: - break - - # Only keep the first two turns of each conversation. - prompt = data["conversations"][0]["value"] - completion = data["conversations"][1]["value"] - - multi_modal_data: Optional[MultiModalDataDict] = None - if "image" in data: - multi_modal_data = multi_modal_data or {} - image_path = data["image"] - # TODO(vllm-project/vllm/issues/9778): Support multiple images. 
- assert isinstance(image_path, - str), "Only support single image input" - try: - multi_modal_data["image"] = Image.open(image_path).convert( - "RGB") - except FileNotFoundError: - # Ignore datapoint where asset is missing - continue - prompt = _get_prompt_for_image_model(question=prompt, model=model) - - request_tokenizer = tokenizer - lora_request: Optional[LoRARequest] = None - if args.enable_lora: - lora_request, lora_tokenizer = get_random_lora_request(args) - if lora_tokenizer: - request_tokenizer = lora_tokenizer - - # Tokenize the prompts and completions. - prompt_token_ids = request_tokenizer(prompt).input_ids - completion_token_ids = request_tokenizer(completion).input_ids - prompt_len = len(prompt_token_ids) - output_len = len(completion_token_ids - ) if fixed_output_len is None else fixed_output_len - if prompt_len < 4 or output_len < 4: - # Prune too short sequences. - continue - if prompt_len > 1024 or prompt_len + output_len > 2048: - # Prune too long sequences. - continue - filtered_dataset.append( - SampleRequest(prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - multi_modal_data=multi_modal_data, - lora_request=lora_request)) - - return filtered_dataset - - def run_vllm( - requests: List[SampleRequest], + requests: list[SampleRequest], n: int, engine_args: EngineArgs, -) -> float: + disable_detokenize: bool = False, +) -> tuple[float, Optional[list[RequestOutput]]]: from vllm import LLM, SamplingParams llm = LLM(**dataclasses.asdict(engine_args)) - + assert all( + llm.llm_engine.model_config.max_model_len >= ( + request.prompt_len + request.expected_output_len) + for request in requests), ( + "Please ensure that max_model_len is greater than the sum of" + " prompt_len and expected_output_len for all requests.") # Add the requests to the engine. - prompts: List[TextPrompt] = [] - sampling_params: List[SamplingParams] = [] + prompts: list[Union[TextPrompt, TokensPrompt]] = [] + sampling_params: list[SamplingParams] = [] for request in requests: prompts.append( + TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"], + multi_modal_data=request.multi_modal_data) + if "prompt_token_ids" in request.prompt else \ TextPrompt(prompt=request.prompt, multi_modal_data=request.multi_modal_data)) sampling_params.append( @@ -183,19 +60,21 @@ def run_vllm( top_p=1.0, ignore_eos=True, max_tokens=request.expected_output_len, + detokenize=not disable_detokenize, )) - lora_requests: Optional[List[LoRARequest]] = None + lora_requests: Optional[list[LoRARequest]] = None if engine_args.enable_lora: lora_requests = [request.lora_request for request in requests] use_beam_search = False + outputs = None if not use_beam_search: start = time.perf_counter() - llm.generate(prompts, - sampling_params, - lora_request=lora_requests, - use_tqdm=True) + outputs = llm.generate(prompts, + sampling_params, + lora_request=lora_requests, + use_tqdm=True) end = time.perf_counter() else: assert lora_requests is None, "BeamSearch API does not support LoRA" @@ -213,26 +92,75 @@ def run_vllm( ignore_eos=True, )) end = time.perf_counter() - return end - start + return end - start, outputs + + +def run_vllm_chat( + requests: list[SampleRequest], + n: int, + engine_args: EngineArgs, + disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]: + """ + Run vLLM chat benchmark. This function is recommended ONLY for benchmarking + multimodal models as it properly handles multimodal inputs and chat + formatting. For non-multimodal models, use run_vllm() instead. 
+ """ + from vllm import LLM, SamplingParams + llm = LLM(**dataclasses.asdict(engine_args)) + + assert all( + llm.llm_engine.model_config.max_model_len >= ( + request.prompt_len + request.expected_output_len) + for request in requests), ( + "Please ensure that max_model_len is greater than the sum of " + "prompt_len and expected_output_len for all requests.") + + prompts = [] + sampling_params: list[SamplingParams] = [] + for request in requests: + prompts.append(request.prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + detokenize=not disable_detokenize, + )) + start = time.perf_counter() + outputs = llm.chat(prompts, sampling_params, use_tqdm=True) + end = time.perf_counter() + return end - start, outputs async def run_vllm_async( - requests: List[SampleRequest], + requests: list[SampleRequest], n: int, engine_args: AsyncEngineArgs, disable_frontend_multiprocessing: bool = False, + disable_detokenize: bool = False, ) -> float: from vllm import SamplingParams async with build_async_engine_client_from_engine_args( engine_args, disable_frontend_multiprocessing) as llm: + assert all( + llm.model_config.max_model_len >= (request.prompt_len + + request.expected_output_len) + for request in requests), ( + "Please ensure that max_model_len is greater than the sum of" + " prompt_len and expected_output_len for all requests.") # Add the requests to the engine. - prompts: List[TextPrompt] = [] - sampling_params: List[SamplingParams] = [] - lora_requests: List[Optional[LoRARequest]] = [] + prompts: list[Union[TextPrompt, TokensPrompt]] = [] + sampling_params: list[SamplingParams] = [] + lora_requests: list[Optional[LoRARequest]] = [] for request in requests: prompts.append( + TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"], + multi_modal_data=request.multi_modal_data) + if "prompt_token_ids" in request.prompt else \ TextPrompt(prompt=request.prompt, multi_modal_data=request.multi_modal_data)) sampling_params.append( @@ -242,6 +170,7 @@ async def run_vllm_async( top_p=1.0, ignore_eos=True, max_tokens=request.expected_output_len, + detokenize=not disable_detokenize, )) lora_requests.append(request.lora_request) @@ -262,12 +191,13 @@ async def run_vllm_async( def run_hf( - requests: List[SampleRequest], + requests: list[SampleRequest], model: str, tokenizer: PreTrainedTokenizerBase, n: int, max_batch_size: int, trust_remote_code: bool, + disable_detokenize: bool = False, ) -> float: llm = AutoModelForCausalLM.from_pretrained( model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code) @@ -278,7 +208,7 @@ def run_hf( pbar = tqdm(total=len(requests)) start = time.perf_counter() - batch: List[str] = [] + batch: list[str] = [] max_prompt_len = 0 max_output_len = 0 for i in range(len(requests)): @@ -307,8 +237,9 @@ def run_hf( use_cache=True, max_new_tokens=max_output_len, ) - # Include the decoding time. - tokenizer.batch_decode(llm_outputs, skip_special_tokens=True) + if not disable_detokenize: + # Include the decoding time. + tokenizer.batch_decode(llm_outputs, skip_special_tokens=True) pbar.update(len(batch)) # Clear the batch. 
@@ -320,7 +251,7 @@ def run_hf( def run_mii( - requests: List[SampleRequest], + requests: list[SampleRequest], model: str, tensor_parallel_size: int, output_len: int, @@ -337,58 +268,86 @@ def run_mii( return end - start +def save_to_pytorch_benchmark_format(args: argparse.Namespace, + results: dict[str, Any]) -> None: + pt_records = convert_to_pytorch_benchmark_format( + args=args, + metrics={ + "requests_per_second": [results["requests_per_second"]], + "tokens_per_second": [results["tokens_per_second"]], + }, + extra_info={ + k: results[k] + for k in ["elapsed_time", "num_requests", "total_num_tokens"] + }) + if pt_records: + # Don't use json suffix here as we don't want CI to pick it up + pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" + write_to_json(pt_file, pt_records) + + +def get_requests(args, tokenizer): + # Common parameters for all dataset types. + common_kwargs = { + "dataset_path": args.dataset_path, + "random_seed": args.seed, + } + sample_kwargs = { + "tokenizer": tokenizer, + "lora_path": args.lora_path, + "max_loras": args.max_loras, + "num_requests": args.num_prompts, + "input_len": args.input_len, + "output_len": args.output_len, + } + if args.dataset_path is None or args.dataset_name == "random": + sample_kwargs["range_ratio"] = args.random_range_ratio + sample_kwargs["prefix_len"] = args.prefix_len + dataset_cls = RandomDataset + elif args.dataset_name == "sharegpt": + dataset_cls = ShareGPTDataset + if args.backend == "vllm-chat": + sample_kwargs["enable_multimodal_chat"] = True + elif args.dataset_name == "sonnet": + assert tokenizer.chat_template or tokenizer.default_chat_template, ( + "Tokenizer/model must have chat template for sonnet dataset.") + dataset_cls = SonnetDataset + sample_kwargs["prefix_len"] = args.prefix_len + sample_kwargs["return_prompt_formatted"] = True + elif args.dataset_name == "burstgpt": + dataset_cls = BurstGPTDataset + elif args.dataset_name == "hf": + if args.backend != "vllm-chat": + raise ValueError( + "hf datasets only are supported by vllm-chat backend") + # Choose between VisionArenaDataset and HuggingFaceDataset based on + # provided parameters. + dataset_cls = (VisionArenaDataset if args.dataset_path + == VisionArenaDataset.VISION_ARENA_DATASET_PATH + and args.hf_subset is None else HuggingFaceDataset) + common_kwargs['dataset_subset'] = args.hf_subset + common_kwargs['dataset_split'] = args.hf_split + sample_kwargs["enable_multimodal_chat"] = True + + else: + raise ValueError(f"Unknown dataset name: {args.dataset_name}") + # Remove None values + sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None} + return dataset_cls(**common_kwargs).sample(**sample_kwargs) + + def main(args: argparse.Namespace): + if args.seed is None: + args.seed = 0 print(args) random.seed(args.seed) - # Sample the requests. tokenizer = AutoTokenizer.from_pretrained( args.tokenizer, trust_remote_code=args.trust_remote_code) - if args.dataset is None: - vocab_size = tokenizer.vocab_size - requests = [] - for _ in range(args.num_prompts): - - request_tokenizer = tokenizer - lora_request: Optional[LoRARequest] = None - if args.enable_lora: - lora_request, lora_tokenizer = get_random_lora_request(args) - if lora_tokenizer: - request_tokenizer = lora_tokenizer - - # Synthesize a prompt with the given input length. - candidate_ids = [ - random.randint(0, vocab_size - 1) - for _ in range(args.input_len) - ] - # As tokenizer may add additional tokens like BOS, we need to try - # different lengths to get the desired input length. 
- for _ in range(5): # Max attempts to correct - candidate_prompt = request_tokenizer.decode(candidate_ids) - tokenized_len = len(request_tokenizer.encode(candidate_prompt)) - - if tokenized_len == args.input_len: - break - - # Adjust length based on difference - diff = args.input_len - tokenized_len - if diff > 0: - candidate_ids.extend([ - random.randint(100, vocab_size - 100) - for _ in range(diff) - ]) - else: - candidate_ids = candidate_ids[:diff] - requests.append( - SampleRequest(prompt=candidate_prompt, - prompt_len=args.input_len, - expected_output_len=args.output_len, - lora_request=lora_request)) - else: - requests = sample_requests(tokenizer, args) - + requests = get_requests(args, tokenizer) is_multi_modal = any(request.multi_modal_data is not None for request in requests) + request_outputs: Optional[list[RequestOutput]] = None if args.backend == "vllm": if args.async_engine: elapsed_time = uvloop.run( @@ -397,31 +356,59 @@ def main(args: argparse.Namespace): args.n, AsyncEngineArgs.from_cli_args(args), args.disable_frontend_multiprocessing, + args.disable_detokenize, )) else: - elapsed_time = run_vllm(requests, args.n, - EngineArgs.from_cli_args(args)) + elapsed_time, request_outputs = run_vllm( + requests, args.n, EngineArgs.from_cli_args(args), + args.disable_detokenize) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, - args.hf_max_batch_size, args.trust_remote_code) + args.hf_max_batch_size, args.trust_remote_code, + args.disable_detokenize) elif args.backend == "mii": elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size, args.output_len) + elif args.backend == "vllm-chat": + elapsed_time, request_outputs = run_vllm_chat( + requests, args.n, EngineArgs.from_cli_args(args), + args.disable_detokenize) else: raise ValueError(f"Unknown backend: {args.backend}") - total_num_tokens = sum(request.prompt_len + request.expected_output_len - for request in requests) - total_output_tokens = sum(request.expected_output_len - for request in requests) - if is_multi_modal: - print("\033[91mWARNING\033[0m: Multi-modal request detected. The " + + if request_outputs: + # Note: with the vllm and vllm-chat backends, + # we have request_outputs, which we use to count tokens. + total_prompt_tokens = 0 + total_output_tokens = 0 + for ro in request_outputs: + if not isinstance(ro, RequestOutput): + continue + total_prompt_tokens += len( + ro.prompt_token_ids) if ro.prompt_token_ids else 0 + total_output_tokens += sum( + len(o.token_ids) for o in ro.outputs if o) + total_num_tokens = total_prompt_tokens + total_output_tokens + else: + total_num_tokens = sum(r.prompt_len + r.expected_output_len + for r in requests) + total_output_tokens = sum(r.expected_output_len for r in requests) + total_prompt_tokens = total_num_tokens - total_output_tokens + + if is_multi_modal and args.backend != "vllm-chat": + print("\033[91mWARNING\033[0m: Multi-modal request with " + f"{args.backend} backend detected. The " "following metrics are not accurate because image tokens are not" " counted. See vllm-project/vllm/issues/9778 for details.") - # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length. + # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length. 
+ # vllm-chat backend counts the image tokens now + print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " f"{total_output_tokens / elapsed_time:.2f} output tokens/s") + print(f"Total num prompt tokens: {total_prompt_tokens}") + print(f"Total num output tokens: {total_output_tokens}") # Output JSON results if specified if args.output_json: @@ -434,20 +421,115 @@ def main(args: argparse.Namespace): } with open(args.output_json, "w") as f: json.dump(results, f, indent=4) + save_to_pytorch_benchmark_format(args, results) + + +def validate_args(args): + """ + Validate command-line arguments. + """ + + # === Deprecation and Defaulting === + if args.dataset is not None: + warnings.warn( + "The '--dataset' argument will be deprecated in the next release. " + "Please use '--dataset-name' and '--dataset-path' instead.", + stacklevel=2) + args.dataset_path = args.dataset + + if not getattr(args, "tokenizer", None): + args.tokenizer = args.model + + # === Backend Validation === + valid_backends = {"vllm", "hf", "mii", "vllm-chat"} + if args.backend not in valid_backends: + raise ValueError(f"Unsupported backend: {args.backend}") + + # === Dataset Configuration === + if not args.dataset and not args.dataset_path: + print( + "When dataset path is not set, it will default to random dataset") + args.dataset_name = 'random' + if args.input_len is None: + raise ValueError("input_len must be provided for a random dataset") + + # === Dataset Name Specific Checks === + # --hf-subset and --hf-split: only used + # when dataset_name is 'hf' + if args.dataset_name != "hf" and ( + getattr(args, "hf_subset", None) is not None + or getattr(args, "hf_split", None) is not None): + warnings.warn("--hf-subset and --hf-split will be ignored \ + since --dataset-name is not 'hf'.", + stacklevel=2) + elif args.dataset_name == "hf" and args.backend != "vllm-chat": + raise ValueError( + "When --dataset-name is 'hf', backend must be 'vllm-chat'") + + # --random-range-ratio: only used when dataset_name is 'random' + if args.dataset_name != 'random' and args.random_range_ratio is not None: + warnings.warn("--random-range-ratio will be ignored since \ + --dataset-name is not 'random'.", + stacklevel=2) + + # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not + # set. 
+ if args.dataset_name not in {"random", "sonnet", None + } and args.prefix_len is not None: + warnings.warn("--prefix-len will be ignored since --dataset-name\ + is not 'random', 'sonnet', or not set.", + stacklevel=2) + + # === LoRA Settings === + if getattr(args, "enable_lora", False) and args.backend != "vllm": + raise ValueError( + "LoRA benchmarking is only supported for vLLM backend") + if getattr(args, "enable_lora", False) and args.lora_path is None: + raise ValueError("LoRA path must be provided when enable_lora is True") + + # === Backend-specific Validations === + if args.backend == "hf" and args.hf_max_batch_size is None: + raise ValueError("HF max batch size is required for HF backend") + if args.backend != "hf" and args.hf_max_batch_size is not None: + raise ValueError("HF max batch size is only for HF backend.") + + if args.backend in {"hf", "mii"} and getattr(args, "quantization", + None) is not None: + raise ValueError("Quantization is only for vLLM backend.") + + if args.backend == "mii" and args.dtype != "auto": + raise ValueError("dtype must be auto for MII backend.") + if args.backend == "mii" and args.n != 1: + raise ValueError("n must be 1 for MII backend.") + if args.backend == "mii" and args.tokenizer != args.model: + raise ValueError( + "Tokenizer must be the same as the model for MII backend.") if __name__ == "__main__": parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser.add_argument("--backend", type=str, - choices=["vllm", "hf", "mii"], + choices=["vllm", "hf", "mii", "vllm-chat"], default="vllm") - parser.add_argument("--dataset", + parser.add_argument( + "--dataset-name", + type=str, + choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"], + help="Name of the dataset to benchmark on.", + default="sharegpt") + parser.add_argument( + "--dataset", + type=str, + default=None, + help="Path to the ShareGPT dataset, will be deprecated in\ + the next release. The dataset is expected to " + "be a json in form of list[dict[..., conversations: " + "list[dict[..., value: ]]]]") + parser.add_argument("--dataset-path", type=str, default=None, - help="Path to the dataset. The dataset is expected to " - "be a json in form of List[Dict[..., conversations: " - "List[Dict[..., value: ]]]]") + help="Path to the dataset") parser.add_argument("--input-len", type=int, default=None, @@ -482,6 +564,11 @@ if __name__ == "__main__": action='store_true', default=False, help="Disable decoupled async engine frontend.") + parser.add_argument( + "--disable-detokenize", + action="store_true", + help=("Do not detokenize the response (i.e. do not include " + "detokenization time in the measurement)")) # LoRA parser.add_argument( "--lora-path", @@ -489,43 +576,33 @@ if __name__ == "__main__": default=None, help="Path to the lora adapters to use. This can be an absolute path, " "a relative path, or a Hugging Face model identifier.") + parser.add_argument("--prefix-len", + type=int, + default=None, + help="Number of prefix tokens per request." 
+ "This is for the RandomDataset and SonnetDataset") + # random dataset + parser.add_argument( + "--random-range-ratio", + type=float, + default=None, + help="Range of sampled ratio of input/output length, " + "used only for RandomDataSet.", + ) + + # hf dtaset + parser.add_argument("--hf-subset", + type=str, + default=None, + help="Subset of the HF dataset.") + parser.add_argument("--hf-split", + type=str, + default=None, + help="Split of the HF dataset.") parser = AsyncEngineArgs.add_cli_args(parser) args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model - if args.dataset is None: - assert args.input_len is not None - assert args.output_len is not None - else: - assert args.input_len is None - if args.enable_lora: - assert args.lora_path is not None - - if args.backend == "vllm": - if args.hf_max_batch_size is not None: - raise ValueError("HF max batch size is only for HF backend.") - elif args.backend == "hf": - if args.hf_max_batch_size is None: - raise ValueError("HF max batch size is required for HF backend.") - if args.quantization is not None: - raise ValueError("Quantization is only for vLLM backend.") - if args.enable_lora is not None: - raise ValueError("LoRA benchmarking is only supported for vLLM" - " backend") - elif args.backend == "mii": - if args.dtype != "auto": - raise ValueError("dtype must be auto for MII backend.") - if args.n != 1: - raise ValueError("n must be 1 for MII backend.") - if args.quantization is not None: - raise ValueError("Quantization is only for vLLM backend.") - if args.hf_max_batch_size is not None: - raise ValueError("HF max batch size is only for HF backend.") - if args.tokenizer != args.model: - raise ValueError("Tokenizer must be the same as the model for MII " - "backend.") - if args.enable_lora is not None: - raise ValueError("LoRA benchmarking is only supported for vLLM" - " backend") + validate_args(args) main(args) diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py new file mode 100644 index 0000000000000..45a0ddbd5d08d --- /dev/null +++ b/benchmarks/benchmark_utils.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import json +import math +import os +from typing import Any + + +def convert_to_pytorch_benchmark_format(args: argparse.Namespace, + metrics: dict[str, list], + extra_info: dict[str, Any]) -> list: + """ + Save the benchmark results in the format used by PyTorch OSS benchmark with + on metric per record + https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database + """ + records = [] + if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False): + return records + + for name, benchmark_values in metrics.items(): + record = { + "benchmark": { + "name": "vLLM benchmark", + "extra_info": { + "args": vars(args), + }, + }, + "model": { + "name": args.model, + }, + "metric": { + "name": name, + "benchmark_values": benchmark_values, + "extra_info": extra_info, + }, + } + + tp = record["benchmark"]["extra_info"]["args"].get( + "tensor_parallel_size") + # Save tensor_parallel_size parameter if it's part of the metadata + if not tp and "tensor_parallel_size" in extra_info: + record["benchmark"]["extra_info"]["args"][ + "tensor_parallel_size"] = extra_info["tensor_parallel_size"] + + records.append(record) + + return records + + +class InfEncoder(json.JSONEncoder): + + def clear_inf(self, o: Any): + if isinstance(o, dict): + return {k: self.clear_inf(v) for k, v in o.items()} + elif isinstance(o, list): + return 
[self.clear_inf(v) for v in o] + elif isinstance(o, float) and math.isinf(o): + return "inf" + return o + + def iterencode(self, o: Any, *args, **kwargs) -> Any: + return super().iterencode(self.clear_inf(o), *args, **kwargs) + + +def write_to_json(filename: str, records: list) -> None: + with open(filename, "w") as f: + json.dump(records, f, cls=InfEncoder) diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py index 3d1c5e392f9e2..9e36b0a9d3bb9 100644 --- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py @@ -1,9 +1,12 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import copy import itertools import pickle as pkl import time -from typing import Callable, Iterable, List, Tuple +from collections.abc import Iterable +from typing import Callable import torch import torch.utils.benchmark as TBenchmark @@ -226,7 +229,7 @@ def print_timers(timers: Iterable[TMeasurement]): def run(dtype: torch.dtype, - MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: + MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]: results = [] for m, k, n in MKNs: timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm", @@ -239,7 +242,7 @@ def run(dtype: torch.dtype, # output makers def make_output(data: Iterable[TMeasurement], - MKNs: Iterable[Tuple[int, int, int]], + MKNs: Iterable[tuple[int, int, int]], base_description: str, timestamp=None): print(f"== All Results {base_description} ====") @@ -280,7 +283,7 @@ def run_model_bench(args): for i, model in enumerate(args.models): print(f"[{i}] {model}") - def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: + def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]: KNs = [] for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): KN[tp_split_dim] = KN[tp_split_dim] // tp_size diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py index ef06fcd6604dd..fe4d8fdfc0669 100644 --- a/benchmarks/cutlass_benchmarks/utils.py +++ b/benchmarks/cutlass_benchmarks/utils.py @@ -1,5 +1,7 @@ +# SPDX-License-Identifier: Apache-2.0 + # Cutlass bench utils -from typing import Iterable, Tuple +from collections.abc import Iterable import torch @@ -25,7 +27,7 @@ def to_fp16(tensor: torch.Tensor) -> torch.Tensor: def make_rand_tensors(dtype: torch.dtype, m: int, n: int, - k: int) -> Tuple[torch.Tensor, torch.Tensor]: + k: int) -> tuple[torch.Tensor, torch.Tensor]: a = torch.randn((m, k), device='cuda') * 5 b = torch.randn((n, k), device='cuda').t() * 5 @@ -61,7 +63,7 @@ def prune_to_2_4(tensor): def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int, - k: int) -> Tuple[torch.Tensor, torch.Tensor]: + k: int) -> tuple[torch.Tensor, torch.Tensor]: a = torch.randn((m, k), device='cuda') * 5 b = torch.randn((n, k), device='cuda').t() * 5 @@ -86,7 +88,7 @@ def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int, def make_n_rand_sparse_tensors(num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int) -> \ - Tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]: + tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]: ABs = [] for _ in range(num_tensors): b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k) diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index d0353bc8cb42a..e7b742d8bec93 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ 
b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -1,9 +1,12 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import copy import itertools import pickle as pkl import time -from typing import Callable, Iterable, List, Tuple +from collections.abc import Iterable +from typing import Callable, Optional import torch import torch.utils.benchmark as TBenchmark @@ -12,6 +15,8 @@ from utils import make_rand_tensors from weight_shapes import WEIGHT_SHAPES from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + w8a8_block_fp8_matmul) from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) @@ -38,8 +43,15 @@ def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args, ).blocked_autorange(min_run_time=min_run_time) -def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, - sub_label: str) -> Iterable[TMeasurement]: +def bench_int8( + dtype: torch.dtype, + m: int, + k: int, + n: int, + label: str, + sub_label: str, + bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]: + """Benchmark INT8-based kernels.""" assert dtype == torch.int8 a, b = make_rand_tensors(torch.int8, m, n, k) scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) @@ -48,155 +60,132 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, azp = torch.zeros((m, ), device="cuda", dtype=torch.int32) azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32) + bench_fns = { + "pytorch_bf16_bf16_bf16_matmul-no-scales": + lambda: torch.mm(a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16) + ), + "pytorch_fp16_fp16_fp16_matmul-no-scales": + lambda: torch.mm(a.to(dtype=torch.float16), b.to(dtype=torch.float16)), + "cutlass_i8_i8_bf16_scaled_mm": + lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16), + "cutlass_i8_i8_bf16_scaled_mm_bias": + lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16, + bias), + "cutlass_i8_i8_bf16_scaled_mm_azp": + lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch. + bfloat16, azp_adj), + "cutlass_i8_i8_bf16_scaled_mm_azp_bias": + lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch. + bfloat16, azp_adj, None, bias), + "cutlass_i8_i8_bf16_scaled_mm_azp_pt": + lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch. + bfloat16, azp_adj, azp), + "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias": + lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch. 
+ bfloat16, azp_adj, azp, bias), + } + timers = [] - # pytorch impl - bfloat16 - timers.append( - bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", - torch.mm, a.to(dtype=torch.bfloat16), - b.to(dtype=torch.bfloat16))) - - # pytorch impl - float16 - timers.append( - bench_fn(label, sub_label, - "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm, - a.to(dtype=torch.float16), b.to(dtype=torch.float16))) - - # cutlass impl - timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm", - ops.cutlass_scaled_mm, a, b, scale_a, scale_b, - torch.bfloat16)) - - # cutlass with bias - timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias", - ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16, - bias)) - - # cutlass with azp per-tensor - timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp", - ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b, - torch.bfloat16, azp_adj)) - - # cutlass with azp per-tensor + bias - timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_bias", - ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b, - torch.bfloat16, azp_adj, None, bias)) - - # cutlass with azp per-token - timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt", - ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b, - torch.bfloat16, azp_adj, azp)) - - # cutlass with azp per-token + bias - timers.append( - bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias", - ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b, - torch.bfloat16, azp_adj, azp, bias)) + for name, fn in bench_fns.items(): + # If bench_kernels is None, run all. Otherwise, run only exact matches. + if bench_kernels is None or name in bench_kernels: + print(f"Running {name}") + timers.append(bench_fn(label, sub_label, name, fn)) return timers -def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, - sub_label: str) -> Iterable[TMeasurement]: +def bench_fp8( + dtype: torch.dtype, + m: int, + k: int, + n: int, + label: str, + sub_label: str, + bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]: + """Benchmark FP8-based kernels.""" assert dtype == torch.float8_e4m3fn a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) + a_cont = a.contiguous() scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) + block_scale_a = torch.rand((m, k // 128), + device="cuda", + dtype=torch.float32) + block_scale_b = torch.rand((k // 128, n // 128), + device="cuda", + dtype=torch.float32) + block_scale_a_M_major = block_scale_a.t().contiguous().t() + block_scale_b_K_major = block_scale_b.t().contiguous().t() bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16) + print(m, k, n) + + bench_fns = { + "pytorch_bf16_bf16_bf16_matmul-no-scales": + lambda: torch.mm(a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16) + ), + "pytorch_fp16_fp16_fp16_matmul-no-scales": + lambda: torch.mm(a.to(dtype=torch.float16), b.to(dtype=torch.float16)), + "pytorch_fp8_fp8_fp16_scaled_mm": + lambda: torch._scaled_mm( + a, b, scale_a, scale_b, out_dtype=torch.float16), + "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum": + lambda: torch._scaled_mm(a, + b, + scale_a, + scale_b, + out_dtype=torch.float16, + use_fast_accum=True), + "pytorch_fp8_fp8_bf16_scaled_mm": + lambda: torch._scaled_mm( + a, b, scale_a, scale_b, out_dtype=torch.bfloat16), + "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum": + lambda: torch._scaled_mm(a, + b, + 
scale_a, + scale_b, + out_dtype=torch.bfloat16, + use_fast_accum=True), + "cutlass_fp8_fp8_bf16_scaled_mm": + lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16), + "cutlass_fp8_fp8_fp16_scaled_mm": + lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.float16), + "cutlass_fp8_fp8_bf16_scaled_mm_bias": + lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16, + bias), + "cutlass_fp8_fp8_fp16_scaled_mm_bias": + lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.float16, + bias.to(dtype=torch.float16)), + "triton_fp8_fp8_fp16_scaled_mm_blockwise": + lambda: w8a8_block_fp8_matmul(a_cont, b.t(), block_scale_a, + block_scale_b.t(), (128, 128)), + "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": + lambda: ops.cutlass_scaled_mm(a, b, block_scale_a_M_major, + block_scale_b_K_major, torch.float16), + } + timers = [] - - # pytorch impl w. bf16 - timers.append( - bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales", - torch.mm, a.to(dtype=torch.bfloat16, device="cuda"), - b.to(dtype=torch.bfloat16, device="cuda"))) - - # pytorch impl: bf16 output, without fp8 fast accum - timers.append( - bench_fn(label, - sub_label, - "pytorch_fp8_fp8_bf16_scaled_mm", - torch._scaled_mm, - a, - b, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=torch.bfloat16)) - - # pytorch impl: bf16 output, with fp8 fast accum - timers.append( - bench_fn(label, - sub_label, - "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum", - torch._scaled_mm, - a, - b, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=torch.bfloat16, - use_fast_accum=True)) - - # pytorch impl: fp16 output, without fp8 fast accum - timers.append( - bench_fn(label, - sub_label, - "pytorch_fp8_fp8_fp16_scaled_mm", - torch._scaled_mm, - a, - b, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=torch.float16)) - - # pytorch impl: fp16 output, with fp8 fast accum - timers.append( - bench_fn(label, - sub_label, - "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum", - torch._scaled_mm, - a, - b, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=torch.float16, - use_fast_accum=True)) - - # cutlass impl: bf16 output - timers.append( - bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm", - ops.cutlass_scaled_mm, a, b, scale_a, scale_b, - torch.bfloat16)) - # cutlass impl: fp16 output - timers.append( - bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm", - ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16)) - - # cutlass impl: bf16 output, with bias - timers.append( - bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm_bias", - ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16, - bias)) - - # cutlass impl: fp16 output, with bias - timers.append( - bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm_bias", - ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16, - bias.to(dtype=torch.float16))) + for name, fn in bench_fns.items(): + # If bench_kernels is None, run all. Otherwise, run only exact matches. 
+ if bench_kernels is None or name in bench_kernels: + print(f"Running {name}") + timers.append(bench_fn(label, sub_label, name, fn)) return timers -def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str, - sub_label: str) -> Iterable[TMeasurement]: +def bench(dtype: torch.dtype, + m: int, + k: int, + n: int, + label: str, + sub_label: str, + bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]: if dtype == torch.int8: - return bench_int8(dtype, m, k, n, label, sub_label) + return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels) if dtype == torch.float8_e4m3fn: - return bench_fp8(dtype, m, k, n, label, sub_label) + return bench_fp8(dtype, m, k, n, label, sub_label, bench_kernels) raise ValueError("unsupported type") @@ -207,20 +196,24 @@ def print_timers(timers: Iterable[TMeasurement]): def run(dtype: torch.dtype, - MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: + MKNs: Iterable[tuple[int, int, int]], + bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]: results = [] for m, k, n in MKNs: - timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm", - f"MKN=({m}x{k}x{n})") + timers = bench(dtype, + m, + k, + n, + f"scaled-{dtype}-gemm", + f"MKN=({m}x{k}x{n})", + bench_kernels=bench_kernels) print_timers(timers) results.extend(timers) - return results -# output makers def make_output(data: Iterable[TMeasurement], - MKNs: Iterable[Tuple[int, int, int]], + MKNs: Iterable[tuple[int, int, int]], base_description: str, timestamp=None): print(f"== All Results {base_description} ====") @@ -232,15 +225,11 @@ def make_output(data: Iterable[TMeasurement], pkl.dump(data, f) -# argparse runners - - def run_square_bench(args): dim_sizes = list( range(args.dim_start, args.dim_end + 1, args.dim_increment)) MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) - data = run(args.dtype, MKNs) - + data = run(args.dtype, MKNs, bench_kernels=args.kernels) make_output(data, MKNs, f"square_bench-{args.dtype}") @@ -251,8 +240,7 @@ def run_range_bench(args): Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes MKNs = list(zip(Ms, Ks, Ns)) - data = run(args.dtype, MKNs) - + data = run(args.dtype, MKNs, bench_kernels=args.kernels) make_output(data, MKNs, f"range_bench-{args.dtype}") @@ -261,7 +249,7 @@ def run_model_bench(args): for i, model in enumerate(args.models): print(f"[{i}] {model}") - def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: + def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]: KNs = [] for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): KN[tp_split_dim] = KN[tp_split_dim] // tp_size @@ -278,7 +266,7 @@ def run_model_bench(args): for k, n in KNs: MKNs.append((m, k, n)) - data = run(args.dtype, MKNs) + data = run(args.dtype, MKNs, bench_kernels=args.kernels) model_bench_data.append(data) # Print all results @@ -328,6 +316,15 @@ Benchmark Cutlass GEMM. type=to_torch_dtype, required=True, help="Available options are ['int8', 'fp8']") + parser.add_argument( + "--kernels", + nargs="+", + type=str, + default=None, + help= + "Exact names of the kernels to benchmark. If not set, runs all kernels." + ) + subparsers = parser.add_subparsers(dest="cmd") square_parser = subparsers.add_parser("square_bench") @@ -362,4 +359,4 @@ Benchmark Cutlass GEMM. 
model_parser.set_defaults(func=run_model_bench) args = parser.parse_args() - args.func(args) \ No newline at end of file + args.func(args) diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py index d58fb0bf86374..3d1121df40d01 100644 --- a/benchmarks/cutlass_benchmarks/weight_shapes.py +++ b/benchmarks/cutlass_benchmarks/weight_shapes.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Weight Shapes are in the format # ([K, N], TP_SPLIT_DIM) # Example: diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index 4058b1c0a3b79..980e68668911f 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import aiohttp diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py index 6eb5f63980070..c2ad4916bf077 100644 --- a/benchmarks/disagg_benchmarks/round_robin_proxy.py +++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import itertools diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py index e59d8bb0e6c8c..a7b4b9e8bf302 100644 --- a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py +++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import matplotlib.pyplot as plt diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py index ef91f9f8eb529..3da583a334480 100644 --- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py +++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py @@ -1,8 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 + import pickle as pkl import time +from collections.abc import Iterable from dataclasses import dataclass from itertools import product -from typing import Callable, Iterable, List, Optional +from typing import Callable, Optional import torch import torch.utils.benchmark as TBenchmark @@ -27,7 +30,7 @@ class bench_params_t: f'x DT {self.dtype}') -def get_bench_params() -> List[bench_params_t]: +def get_bench_params() -> list[bench_params_t]: ## Test Fixtures NUM_TOKENS = [2**x for x in range(11)] HIDDEN_SIZES = list(range(1024, 8129, 1024)) diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index 601c4ea439aea..8d20b91560dd6 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import sys from typing import Optional diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py index 7acea6087fdfd..e12d74c01e43c 100644 --- a/benchmarks/kernels/benchmark_layernorm.py +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time import torch @@ -38,7 +40,7 @@ def main(num_tokens: int, end_time = time.perf_counter() if profile: - torch.cuda.cudart().cudaProfilerStart() + torch.cuda.cudart().cudaProfilerStop() return (end_time - start_time) / num_iters # Warmup. 
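The `cudaProfilerStart()` → `cudaProfilerStop()` change in `benchmark_layernorm.py` above (the same fix appears in `benchmark_quant.py` and `benchmark_paged_attention.py` further down) restores the usual pairing: the profiler range is opened before the timed loop and closed after it, so only the measured iterations are captured when the script runs under Nsight/nvprof. A minimal sketch of that pattern — the loop body and names here are placeholders, not the actual benchmark code:

```python
import time

import torch


def run_timed(num_iters: int, profile: bool = False) -> float:
    # Sketch only: open the profiler range before timing, close it after.
    torch.cuda.synchronize()
    if profile:
        torch.cuda.cudart().cudaProfilerStart()
    start_time = time.perf_counter()
    for _ in range(num_iters):
        ...  # kernel launches being measured go here
    torch.cuda.synchronize()
    end_time = time.perf_counter()
    if profile:
        # Must be Stop(), not Start(): this call ends the profiling range.
        torch.cuda.cudart().cudaProfilerStop()
    return (end_time - start_time) / num_iters
```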
diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index e1f613e1da509..b4b91eda28440 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import copy import json @@ -7,7 +9,7 @@ from dataclasses import dataclass from enum import Enum, auto from itertools import product from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Tuple +from typing import Any, Callable, Optional import torch import torch.utils.benchmark as TBenchmark @@ -15,11 +17,7 @@ from torch.utils.benchmark import Measurement as TMeasurement from utils import ArgPool, Bench, CudaGraphBenchParams from weight_shapes import WEIGHT_SHAPES -from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand -from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice -from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink -from vllm.lora.ops.triton_ops.sgmv_expand import sgmv_expand -from vllm.lora.ops.triton_ops.sgmv_shrink import sgmv_shrink +from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT from vllm.utils import FlexibleArgumentParser @@ -59,15 +57,15 @@ def make_rand_lora_weight_tensor(k: int, def make_rand_tensors( - a_shape: Tuple[int], - b_shape: Tuple[int], - c_shape: Tuple[int], + a_shape: tuple[int], + b_shape: tuple[int], + c_shape: tuple[int], a_dtype: torch.dtype, b_dtype: torch.dtype, c_dtype: torch.dtype, num_slices: int, device: str = "cuda", -) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]: +) -> tuple[torch.Tensor, list[torch.Tensor], torch.Tensor]: """ Make LoRA input/output matrices. """ @@ -87,7 +85,7 @@ def make_prompt_lora_mapping(num_prompts: int, num_active_loras: int, sort_by_lora_id: bool, device: str) -> torch.Tensor: """ - All prompts are mapped to a Lora ID in range [0, num_active_loras). + All prompts are mapped to a LoRA ID in range [0, num_active_loras). where 0 refers to first lora, 1 refers to second lora and so on. """ assert num_active_loras > 0 @@ -133,7 +131,7 @@ def make_token_lora_mapping(num_tokens: int, num_prompts: int, def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor, - lora_weights: List[torch.Tensor], + lora_weights: list[torch.Tensor], seq_lens_cpu: torch.Tensor, prompt_lora_mapping_cpu: torch.Tensor, scaling: float, add_inputs: Optional[bool]): @@ -151,7 +149,6 @@ def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor, result = torch.nn.functional.linear(x, w) result *= scaling out_list.append(result) - torch.cat(out_list, dim=0) cat_result = torch.cat(out_list, dim=0) @@ -165,62 +162,35 @@ class OpType(Enum): """ LoRA Ops to benchmark and its properties. 
""" - SGMV_SHRINK = auto() - BGMV_SHRINK = auto() - SGMV_EXPAND = auto() - BGMV_EXPAND = auto() - BGMV_EXPAND_SLICE = auto() + LORA_SHRINK = auto() + LORA_EXPAND = auto() @staticmethod def from_str(s: str) -> "OpType": - if s.lower() == 'sgmv_shrink': - return OpType.SGMV_SHRINK - if s.lower() == 'sgmv_expand': - return OpType.SGMV_EXPAND - if s.lower() == 'bgmv_shrink': - return OpType.BGMV_SHRINK - if s.lower() == 'bgmv_expand': - return OpType.BGMV_EXPAND - if s.lower() == "bgmv_expand_slice": - return OpType.BGMV_EXPAND_SLICE + if s.lower() == "lora_shrink": + return OpType.LORA_SHRINK + if s.lower() == "lora_expand": + return OpType.LORA_EXPAND raise ValueError(f"Unrecognized str {s} to convert to OpType") def is_shrink_fn(self) -> bool: - return self in [OpType.SGMV_SHRINK, OpType.BGMV_SHRINK] + return self in [OpType.LORA_SHRINK] def is_expand_fn(self) -> bool: - return self in [OpType.SGMV_EXPAND, OpType.BGMV_EXPAND] + return self in [OpType.LORA_EXPAND] - def is_prefill_op(self) -> bool: - return self in [OpType.SGMV_SHRINK, OpType.SGMV_EXPAND] - - def is_decode_op(self) -> bool: - return self in [ - OpType.BGMV_SHRINK, OpType.BGMV_EXPAND, OpType.BGMV_EXPAND_SLICE - ] - - def is_expand_slice_fn(self) -> bool: - return self in [OpType.BGMV_EXPAND_SLICE] - - def num_slices(self) -> List[int]: - if self in [OpType.SGMV_EXPAND, OpType.SGMV_SHRINK]: - # SGMV kernels supports slices - return [1, 2, 3] - if self in [OpType.BGMV_SHRINK, OpType.BGMV_EXPAND]: - return [1] - if self in [OpType.BGMV_EXPAND_SLICE]: - return [2, 3] - raise ValueError(f"Unrecognized OpType {self}") + def num_slices(self) -> list[int]: + return [1, 2, 3] def mkn(self, batch_size: int, seq_length: int, hidden_size: int, - lora_rank: int) -> Tuple[int, int, int]: + lora_rank: int) -> tuple[int, int, int]: num_tokens = batch_size * seq_length if self.is_shrink_fn(): m = num_tokens k = hidden_size n = lora_rank else: - assert self.is_expand_fn() or self.is_expand_slice_fn() + assert self.is_expand_fn() m = num_tokens k = lora_rank n = hidden_size @@ -228,20 +198,20 @@ class OpType(Enum): def matmul_dtypes( self, op_dtype: torch.dtype - ) -> Tuple[torch.dtype, torch.dtype, torch.dtype]: + ) -> tuple[torch.dtype, torch.dtype, torch.dtype]: """ return a type, b type and c type for A x B = C """ if self.is_shrink_fn(): return op_dtype, op_dtype, torch.float32 else: - assert self.is_expand_fn() or self.is_expand_slice_fn() + assert self.is_expand_fn() return torch.float32, op_dtype, op_dtype def matmul_shapes( self, batch_size: int, seq_length: int, hidden_size: int, lora_rank: int, num_loras: int, - num_slices: int) -> Tuple[Tuple[int], Tuple[int], Tuple[int]]: + num_slices: int) -> tuple[tuple[int], tuple[int], tuple[int]]: """ Given num_slices, return the shapes of the A, B, and C matrices in A x B = C, for the op_type @@ -249,56 +219,39 @@ class OpType(Enum): m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank) b_shape = (num_loras, n, k) # col-major - if self == OpType.SGMV_SHRINK: - # SGMV shrink supports num_slices inherently in the kernel + if self in [OpType.LORA_SHRINK]: + # LoRA shrink kernels support num_slices inherently in the kernel. 
return ((m, k), b_shape, (num_slices, m, n)) - if self == OpType.SGMV_EXPAND: - # SGMV expand supports num_slices inherently in the kernel + if self in [OpType.LORA_EXPAND]: + # LoRA expand kernels support num_slices inherently in the kernel return ((num_slices, m, k), b_shape, (m, n * num_slices)) - if self == OpType.BGMV_SHRINK: - return ((m, k), b_shape, (m, n)) - if self == OpType.BGMV_EXPAND: - return ((m, k), b_shape, (m, n)) - if self == OpType.BGMV_EXPAND_SLICE: - return ((num_slices, m, k), b_shape, (m, n * num_slices)) - raise ValueError(f"Unrecognized op_type {self}") def bench_fn(self) -> Callable: + if self == OpType.LORA_SHRINK: + return lora_shrink + if self == OpType.LORA_EXPAND: + return lora_expand - def emulate_bgmv_expand_slice(kwargs_list: List[Dict[str, Any]]): - for x in kwargs_list: - bgmv_expand_slice(**x) - - if self == OpType.SGMV_SHRINK: - return sgmv_shrink - if self == OpType.SGMV_EXPAND: - return sgmv_expand - if self == OpType.BGMV_SHRINK: - return bgmv_shrink - if self == OpType.BGMV_EXPAND: - return bgmv_expand - if self == OpType.BGMV_EXPAND_SLICE: - return emulate_bgmv_expand_slice raise ValueError(f"Unrecognized optype {self}") def run_ref_group_gemm(self, output: torch.Tensor, input: torch.Tensor, - lora_weights: List[torch.Tensor], + lora_weights: list[torch.Tensor], **kwargs) -> Callable: - """Each benchmark operation expected the input, lora_weights and outputs + """Each benchmark operation expects the input, lora_weights and outputs in a slightly different format. Refer to self.matmul_shapes(). run_ref_group_gemm accounts for those differences in executing a reference group gemm for correctness testing. """ w_dtype = lora_weights[0].dtype num_slices = len(lora_weights) - if self == OpType.SGMV_SHRINK: + if self in [OpType.LORA_SHRINK]: for slice_idx in range(num_slices): ref_group_gemm(ref_out=output[slice_idx, :], input=input, lora_weights=lora_weights[slice_idx], **kwargs) - if self == OpType.SGMV_EXPAND: + elif self in [OpType.LORA_EXPAND]: hidden_size = lora_weights[0].shape[1] for slice_idx in range(num_slices): slice_offset = slice_idx * hidden_size @@ -307,28 +260,8 @@ class OpType(Enum): input=input[slice_idx].clone().to(dtype=w_dtype), lora_weights=lora_weights[slice_idx], **kwargs) - if self == OpType.BGMV_SHRINK: - assert num_slices == 1 - ref_group_gemm(ref_out=output, - input=input, - lora_weights=lora_weights[0], - **kwargs) - if self == OpType.BGMV_EXPAND: - assert num_slices == 1 - ref_group_gemm(ref_out=output, - input=input.clone().to(dtype=w_dtype), - lora_weights=lora_weights[0], - **kwargs) - if self == OpType.BGMV_EXPAND_SLICE: - hidden_size = lora_weights[0].shape[1] - for slice_idx in range(num_slices): - slice_offset = slice_idx * hidden_size - ref_group_gemm( - ref_out=output[:, slice_offset:slice_offset + hidden_size], - input=input[slice_idx].clone().to(dtype=w_dtype), - lora_weights=lora_weights[slice_idx], - **kwargs) - raise ValueError(f"Unrecognized optype {self}") + else: + raise ValueError(f"Unrecognized optype {self}") @dataclass @@ -382,13 +315,13 @@ class BenchmarkTensors: """ # matmul tensors input: torch.Tensor - lora_weights_lst: List[torch.Tensor] + lora_weights_lst: list[torch.Tensor] output: torch.Tensor - # metadata tensors + # LoRA kernel metadata + lora_kernel_meta: LoRAKernelMeta + # Metadata tensors used in testing correctness seq_lens: torch.Tensor - seq_start_loc: torch.Tensor prompt_lora_mapping: torch.Tensor - token_lora_mapping: torch.Tensor def io_types(self) -> str: return 
(f"{dtype_to_str(self.input.dtype)}x" @@ -415,26 +348,29 @@ class BenchmarkTensors: assert ctx.num_active_loras <= ctx.num_loras total_tokens = ctx.batch_size * ctx.seq_length + # Make metadata tensors involved in correctness testing. # Prepare seq lens tensor seq_len_tensor = torch.randint(ctx.seq_length, ctx.seq_length + 1, (ctx.batch_size, )) - # Prepare seq_start_loc tensor - seq_start_loc_tensor = torch.cumsum(torch.tensor( - [0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), - dim=0) assert total_tokens == seq_len_tensor.sum() # Prepare prompt lora indices tensor prompt_lora_indices_tensor = make_prompt_lora_mapping( ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu") - # Prepare token lora indices tensor + + # Make LoRAKernelMeta token_lora_indices_tensor = make_token_lora_mapping( total_tokens, ctx.batch_size, prompt_lora_indices_tensor, seq_len_tensor, "cpu") + lora_kernel_meta = LoRAKernelMeta.make( + max_loras=ctx.num_loras, + max_num_tokens=token_lora_indices_tensor.size(0), + device="cpu") + lora_kernel_meta.prepare_tensors( + token_lora_mapping=token_lora_indices_tensor) return BenchmarkTensors(input_tensor, lora_weights, output_tensor, - seq_len_tensor, seq_start_loc_tensor, - prompt_lora_indices_tensor, - token_lora_indices_tensor) + lora_kernel_meta, seq_len_tensor, + prompt_lora_indices_tensor) def sanity_check(self) -> None: """ @@ -444,9 +380,9 @@ class BenchmarkTensors: # check metadata tensors assert torch.sum(self.seq_lens) == num_tokens num_seqs = self.seq_lens.shape[0] - assert self.seq_start_loc.shape[0] == num_seqs + #assert self.seq_start_loc.shape[0] == num_seqs assert self.prompt_lora_mapping.shape[0] == num_seqs - assert self.token_lora_mapping.shape[0] == num_tokens + assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens def to_device(self, device: str): """ @@ -461,54 +397,31 @@ class BenchmarkTensors: self.input = to_device(self.input) self.output = to_device(self.output) self.seq_lens = to_device(self.seq_lens) - self.seq_start_loc = to_device(self.seq_start_loc) self.prompt_lora_mapping = to_device(self.prompt_lora_mapping) - self.token_lora_mapping = to_device(self.token_lora_mapping) for i in range(len(self.lora_weights_lst)): self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i]) - def metadata(self) -> Tuple[int, int, int]: + # LoRA meta + for field_name in LoRAKernelMeta.__dataclass_fields__: + field = getattr(self.lora_kernel_meta, field_name) + assert isinstance(field, torch.Tensor) + setattr(self.lora_kernel_meta, field_name, to_device(field)) + + def metadata(self) -> tuple[int, int, int]: """ Return num_seqs, num_tokens and max_seq_len """ num_seqs = self.seq_lens.shape[0] - num_tokens = self.token_lora_mapping.shape[0] + num_tokens = self.lora_kernel_meta.token_lora_mapping.shape[0] max_seq_len = torch.max(self.seq_lens).item() num_slices = len(self.lora_weights_lst) return num_seqs, num_tokens, max_seq_len, num_slices - def convert_to_sgmv_benchmark_tensors(self): - """ - For sgmv punica kernels, when consecutive sequences have the - same LoRA ID, we just merge them together. 
- This happens in punica.py::compute_metadata - """ - - # Collapse seq_lens and seq_start_loc - _, seq_lens = torch.unique_consecutive(self.token_lora_mapping, - return_counts=True) - cum_result = torch.cumsum(seq_lens, dim=0) - seq_start_loc = torch.zeros_like(seq_lens) - seq_start_loc[1:].copy_(cum_result[:-1]) - - # Collapse prompt mapping - prompt_lora_mapping = torch.unique_consecutive( - self.prompt_lora_mapping) - - assert torch.sum(seq_lens) == torch.sum(self.seq_lens), \ - f"dont match - new {torch.sum(seq_lens)} vs {torch.sum(self.seq_lens)}" - - self.prompt_lora_mapping = prompt_lora_mapping.to( - dtype=self.prompt_lora_mapping.dtype) - self.seq_lens = seq_lens.to(dtype=self.seq_lens.dtype) - self.seq_start_loc = seq_start_loc.to(dtype=self.seq_start_loc.dtype) - - def as_sgmv_shrink_kwargs(self) -> Dict[str, Any]: - self.convert_to_sgmv_benchmark_tensors() + def as_lora_shrink_kwargs(self) -> dict[str, Any]: self.sanity_check() self.to_device(self.input.device) - num_seqs, num_tokens, max_seq_len, num_slices = self.metadata() + _, num_tokens, _, num_slices = self.metadata() # Sanity check matrix shapes. i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ @@ -529,22 +442,20 @@ class BenchmarkTensors: 'inputs': self.input, 'lora_a_weights': self.lora_weights_lst, 'output_tensor': self.output, - 'b_seq_start_loc': self.seq_start_loc, - 'seq_len_tensor': self.seq_lens, - 'lora_indices_tensor': self.prompt_lora_mapping, - 'batches': num_seqs, - 'max_seq_length': max_seq_len, - 'token_nums': num_tokens, + 'token_lora_mapping': self.lora_kernel_meta.token_lora_mapping, + 'token_indices_sorted_by_lora_ids': + self.lora_kernel_meta.token_indices_sorted_by_lora_ids, + 'num_tokens_per_lora': self.lora_kernel_meta.num_tokens_per_lora, + 'lora_token_start_loc': self.lora_kernel_meta.lora_token_start_loc, + 'lora_ids': self.lora_kernel_meta.active_lora_ids, 'scaling': 1.0, } - def as_sgmv_expand_kwargs(self, add_inputs: bool) -> Dict[str, Any]: - - self.convert_to_sgmv_benchmark_tensors() + def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]: self.sanity_check() self.to_device(self.input.device) - num_seqs, num_tokens, max_seq_len, num_slices = self.metadata() + _, num_tokens, _, num_slices = self.metadata() # Sanity check matrix shapes. 
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ @@ -566,124 +477,28 @@ class BenchmarkTensors: 'inputs': self.input, 'lora_b_weights': self.lora_weights_lst, 'output_tensor': self.output, - 'b_seq_start_loc': self.seq_start_loc, - 'seq_len_tensor': self.seq_lens, - 'lora_indices_tensor': self.prompt_lora_mapping, - 'batches': num_seqs, - 'max_seq_length': max_seq_len, - 'token_nums': num_tokens, + 'token_lora_mapping': self.lora_kernel_meta.token_lora_mapping, + 'token_indices_sorted_by_lora_ids': + self.lora_kernel_meta.token_indices_sorted_by_lora_ids, + 'num_tokens_per_lora': self.lora_kernel_meta.num_tokens_per_lora, + 'lora_token_start_loc': self.lora_kernel_meta.lora_token_start_loc, + 'lora_ids': self.lora_kernel_meta.active_lora_ids, 'offset_start': 0, 'add_inputs': add_inputs, } - def as_bgmv_shrink_kwargs(self) -> Dict[str, Any]: - assert len(self.lora_weights_lst) == 1 - self.to_device(self.input.device) - - _, num_tokens, _, _ = self.metadata() - # Sanity check shapes - i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ - 0].shape, self.output.shape - # Expected input shape [num_tokens, hidden_size] - assert len(i_shape) == 2 - assert i_shape[0] == num_tokens - hidden_size = i_shape[1] - # Expected lora weight shape [num_loras, lora_rank, hidden_size] - assert len(lw_shape) == 3 - assert lw_shape[2] == hidden_size - lora_rank = lw_shape[1] - # Expected output shape [num_tokens, lora_rank] - assert len(o_shape) == 2 - assert o_shape == (num_tokens, lora_rank) - - return { - 'inputs': self.input, - 'lora_a_weights': self.lora_weights_lst[0], - 'output_tensor': self.output, - 'lora_indices_tensor': self.token_lora_mapping, - 'scaling': 1.0 - } - - def as_bgmv_expand_kwargs(self, add_inputs: bool): - assert len(self.lora_weights_lst) == 1 - self.to_device(self.input.device) - - _, num_tokens, _, _ = self.metadata() - # Sanity check shapes - i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ - 0].shape, self.output.shape - # Expected input shape [num_tokens, lora_rank] - assert len(i_shape) == 2 - assert i_shape[0] == num_tokens - lora_rank = i_shape[1] - # Expected lora weight shape [num_loras, hidden_size, lora_rank] - assert len(lw_shape) == 3 - assert lw_shape[2] == lora_rank - hidden_size = lw_shape[1] - # Expected output shape [num_tokens, hidden_size] - assert len(o_shape) == 2 - assert o_shape == (num_tokens, hidden_size) - - return { - 'inputs': self.input, - 'lora_b_weights': self.lora_weights_lst[0], - 'output_tensor': self.output, - 'lora_indices_tensor': self.token_lora_mapping, - 'add_inputs': add_inputs - } - - def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> Dict[str, Any]: - - _, num_tokens, _, num_slices = self.metadata() - # Sanity check shapes - i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ - 0].shape, self.output.shape - # Expected input shape [num_slices, num_tokens, lora_rank] - assert len(i_shape) == 3 - assert i_shape[0] == num_slices - assert i_shape[1] == num_tokens - lora_rank = i_shape[2] - # Expected lora weight shape [num_loras, hidden_size, lora_rank] - assert len(lw_shape) == 3 - assert lw_shape[2] == lora_rank - hidden_size = lw_shape[1] - # Expected output shape [num_tokens, hidden_size * num_slices] - assert len(o_shape) == 2 - assert o_shape == (num_tokens, hidden_size * num_slices) - - self.to_device(self.input.device) - - kwargs_list = [] - for i in range(num_slices): - kwargs_list.append({ - 'inputs': self.input[i], - 'lora_b_weights': 
self.lora_weights_lst[i], - 'output_tensor': self.output, - 'lora_indices_tensor': self.token_lora_mapping, - 'slice_offset': i * hidden_size, - 'slice_size': hidden_size, - 'add_inputs': add_inputs, - }) - return {'kwargs_list': kwargs_list} - def bench_fn_kwargs(self, op_type: OpType, - add_inputs: Optional[bool] = None) -> Dict[str, Any]: + add_inputs: Optional[bool] = None) -> dict[str, Any]: if op_type.is_shrink_fn(): assert add_inputs is None else: assert add_inputs is not None - if op_type == OpType.SGMV_SHRINK: - return self.as_sgmv_shrink_kwargs() - if op_type == OpType.SGMV_EXPAND: - return self.as_sgmv_expand_kwargs(add_inputs) - if op_type == OpType.BGMV_SHRINK: - return self.as_bgmv_shrink_kwargs() - if op_type == OpType.BGMV_EXPAND: - return self.as_bgmv_expand_kwargs(add_inputs) - if op_type == OpType.BGMV_EXPAND_SLICE: - return self.as_bgmv_expand_slice_kwargs(add_inputs) + if op_type == OpType.LORA_SHRINK: + return self.as_lora_shrink_kwargs() + if op_type == OpType.LORA_EXPAND: + return self.as_lora_expand_kwargs(add_inputs) raise ValueError(f"Unrecognized optype {self}") def test_correctness(self, op_type: OpType, @@ -732,7 +547,7 @@ def bench_optype(ctx: BenchmarkContext, assert expand_fn_add_inputs is not None # BenchmarkContext -> BenchmarkTensors - bench_tensors : List[BenchmarkTensors] = \ + bench_tensors : list[BenchmarkTensors] = \ [BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size)] for bt in bench_tensors: bt.sanity_check() @@ -744,7 +559,7 @@ def bench_optype(ctx: BenchmarkContext, for bt in bench_tensors ]) - # BenchmarkTensors -> Dict (kwargs) + # BenchmarkTensors -> dict (kwargs) kwargs_list = [ bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs) for bt in bench_tensors @@ -839,7 +654,7 @@ def use_cuda_graph_recommendation() -> str: """ -def print_timers(timers: List[TMeasurement], +def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None): compare = TBenchmark.Compare(timers) compare.print() @@ -859,7 +674,7 @@ def print_timers(timers: List[TMeasurement], "small num_loras the goal should be to match the torch.mm numbers.") -def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]): +def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]): if args.cuda_graph_nops is not None: assert args.cuda_graph_nops > 0 @@ -871,14 +686,7 @@ def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]): timers = [] for bench_ctx in bench_ctxs: for seq_len in args.seq_lengths: - bench_ops: List[OpType] = [] - if seq_len == 1: - # bench all decode ops - bench_ops = [op for op in args.op_types if op.is_decode_op()] - else: - # bench all prefill ops - bench_ops = [op for op in args.op_types if op.is_prefill_op()] - + bench_ops: list[OpType] = args.op_types seq_len_timers = [] for bench_op in bench_ops: for num_slices in bench_op.num_slices(): @@ -919,10 +727,10 @@ def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]): pickle.dump(timers, f) -def as_benchmark_contexts(hidden_sizes: List[int], lora_ranks: List[int], - args: argparse.Namespace) -> List[BenchmarkContext]: +def as_benchmark_contexts(hidden_sizes: list[int], lora_ranks: list[int], + args: argparse.Namespace) -> list[BenchmarkContext]: - ctxs: List[BenchmarkContext] = [] + ctxs: list[BenchmarkContext] = [] for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product( # noqa args.batch_sizes, list(hidden_sizes), lora_ranks, args.num_loras, args.sort_by_lora_id): @@ -952,7 +760,7 @@ def 
run_list_bench(args: argparse.Namespace): f" LoRA Ranks {args.lora_ranks}") # Get all benchmarking contexts - bench_contexts: List[BenchmarkContext] = as_benchmark_contexts( + bench_contexts: list[BenchmarkContext] = as_benchmark_contexts( hidden_sizes=args.hidden_sizes, lora_ranks=args.lora_ranks, args=args) run(args, bench_contexts) @@ -973,7 +781,7 @@ def run_range_bench(args: argparse.Namespace): f" LoRA Ranks {lora_ranks}") # Get all benchmarking contexts - bench_contexts: List[BenchmarkContext] = as_benchmark_contexts( + bench_contexts: list[BenchmarkContext] = as_benchmark_contexts( hidden_sizes=hidden_sizes, lora_ranks=lora_ranks, args=args) run(args, bench_contexts) @@ -1000,7 +808,7 @@ def run_model_bench(args: argparse.Namespace): f" LoRA Ranks {args.lora_ranks}") # Get all benchmarking contexts - bench_contexts: List[BenchmarkContext] = as_benchmark_contexts( + bench_contexts: list[BenchmarkContext] = as_benchmark_contexts( hidden_sizes=hidden_sizes, lora_ranks=args.lora_ranks, args=args) run(args, bench_contexts) @@ -1088,13 +896,13 @@ Benchmark LoRA kernels: {use_cuda_graph_recommendation()} list_bench example: - python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 + python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 model_bench example: - python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 + python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 range_bench example: - python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8 + python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8 """, # noqa: E501 formatter_class=argparse.RawTextHelpFormatter) diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index 46bab74ae8adf..a661ea9d7e60b 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import copy import 
itertools @@ -5,9 +7,10 @@ import math import os import pickle as pkl import time +from collections.abc import Iterable from dataclasses import dataclass from itertools import product -from typing import Callable, Iterable, List, Optional, Tuple +from typing import Callable, Optional import pandas as pd import torch @@ -42,7 +45,6 @@ def terse_type_name(dt): torch.float16: "fp16", torch.int8: "int8", torch.float8_e4m3fn: "fp8", - torch.bfloat16: "bf16", torch.float: "float", torch.int: "int", }[dt] @@ -100,8 +102,8 @@ def quantize_and_pack(atype: torch.dtype, return w_ref, w_q, w_s, w_zp -def create_bench_tensors(shape: Tuple[int, int, int], types: TypeConfig, - group_size: Optional[int]) -> List[BenchmarkTensors]: +def create_bench_tensors(shape: tuple[int, int, int], types: TypeConfig, + group_size: Optional[int]) -> list[BenchmarkTensors]: m, n, k = shape # we want to make sure that weights don't fit into L2 cache between runs so @@ -112,7 +114,7 @@ def create_bench_tensors(shape: Tuple[int, int, int], types: TypeConfig, a = rand_data((m, k), types.act_type, scale=5) - benchmark_tensors: List[BenchmarkTensors] = [] + benchmark_tensors: list[BenchmarkTensors] = [] for _ in range(num_weights): w = rand_data((k, n), types.act_type, scale=5) @@ -256,7 +258,7 @@ def machete_create_bench_fn(bt: BenchmarkTensors, return lambda: ops.machete_mm( a=bt.a, - b_q=bt.w_q, + b_q=w_q, b_type=bt.wtype, b_group_scales=bt.w_g_s, b_group_zeros=w_g_zp, @@ -274,7 +276,7 @@ def machete_create_bench_fn(bt: BenchmarkTensors, def bench_fns(label: str, sub_label: str, description: str, - fns: List[Callable]): + fns: list[Callable]): min_run_time = 1 if not NVTX_PROFILE else 0.1 res = TBenchmark.Timer( @@ -309,7 +311,7 @@ def bench(types: TypeConfig, n: int, label: str, sub_label: str, - sweep_schedules: bool = True) -> List[TMeasurement]: + sweep_schedules: bool = True) -> list[TMeasurement]: benchmark_tensors = create_bench_tensors((m, n, k), types, group_size) sub_label += f", L={len(benchmark_tensors)}" @@ -412,12 +414,12 @@ def bench(types: TypeConfig, # runner -def print_timers(timers: List[TMeasurement]): +def print_timers(timers: list[TMeasurement]): compare = TBenchmark.Compare(timers) compare.print() -def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: +def run(args, MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]: types = TypeConfig( act_type=args.act_type, weight_type=scalar_types.uint4b8 if args.group_zero_type is None \ @@ -429,7 +431,7 @@ def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: token_scale_type=args.token_scale_type, ) - results: List[TMeasurement] = [] + results: list[TMeasurement] = [] for m, k, n in MKNs: timers = bench(types, args.group_size, @@ -447,8 +449,8 @@ def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: # output makers def make_output( - data: List[TMeasurement], - MKNs: Iterable[Tuple[int, int, int]], + data: list[TMeasurement], + MKNs: Iterable[tuple[int, int, int]], base_description: str, timestamp=None, ): @@ -495,7 +497,7 @@ def run_model_bench(args): for i, model in enumerate(args.models): print(f"[{i}] {model}") - def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: + def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]: KNs = [] for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): KN[tp_split_dim] = KN[tp_split_dim] // tp_size diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py 
index 8fb44e3a3dbd8..1e785ac8fc73a 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -1,4 +1,4 @@ -from typing import List +# SPDX-License-Identifier: Apache-2.0 import torch import torch.utils.benchmark as benchmark @@ -8,6 +8,8 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES) +from vllm.model_executor.layers.quantization.utils.allspark_utils import ( + ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, ALLSPARK_SUPPORTED_QUANT_TYPES) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, MARLIN_SUPPORTED_GROUP_SIZES, query_marlin_supported_quant_types) @@ -16,18 +18,18 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import ( marlin_24_quantize) from vllm.model_executor.layers.quantization.utils.quant_utils import ( - gptq_pack, gptq_quantize_weights, sort_weights) + gptq_pack, gptq_quantize_weights, quantize_weights, sort_weights) from vllm.scalar_type import ScalarType from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"] -DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] +DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192] ACT_ORDER_OPTS = [False, True] K_FULL_OPTS = [False, True] -def bench_run(results: List[benchmark.Measurement], model: str, +def bench_run(results: list[benchmark.Measurement], model: str, act_order: bool, is_k_full: bool, quant_type: ScalarType, group_size: int, size_m: int, size_k: int, size_n: int): label = "Quant Matmul" @@ -79,6 +81,27 @@ def bench_run(results: List[benchmark.Measurement], model: str, GPTQ_MARLIN_24_MAX_PARALLEL) marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int) + # AllSpark W8A16 quant + as_supported_case = (quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES + and group_size == -1 and not act_order and is_k_full) + if as_supported_case: + properties = torch.cuda.get_device_properties(b.device.index) + sm_count = properties.multi_processor_count + sm_version = properties.major * 10 + properties.minor + + supported_arch = (sm_version >= 80 and sm_version < 90) + as_supported_case = as_supported_case and supported_arch + if supported_arch: + has_zp = False + w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, + has_zp) + qw = qw.to(torch.uint8) + + qw_reorder, s_reorder, zp_reorder = \ + ops.allspark_repack_weight( + qw, s, zp, has_zp) + CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD + globals = { # Gen params "quant_type": quant_type, @@ -107,10 +130,19 @@ def bench_run(results: List[benchmark.Measurement], model: str, # GPTQ params "q_w_gptq": q_w_gptq, "repack_sort_indices": repack_sort_indices, + # AllSpark W8A16 params + "qw_reorder": qw_reorder if as_supported_case else None, + "s_reorder": s_reorder if as_supported_case else None, + "zp_reorder": zp_reorder if as_supported_case else None, + "sm_count": sm_count if as_supported_case else None, + "sm_version": sm_version if as_supported_case else None, + "CUBLAS_M_THRESHOLD": + CUBLAS_M_THRESHOLD if as_supported_case else None, # Kernels "gptq_marlin_gemm": ops.gptq_marlin_gemm, "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm, "gptq_marlin_repack": ops.gptq_marlin_repack, + "allspark_w8a16_gemm": 
ops.allspark_w8a16_gemm, } min_run_time = 1 @@ -170,13 +202,24 @@ def bench_run(results: List[benchmark.Measurement], model: str, description="gptq_marlin_repack", ).blocked_autorange(min_run_time=min_run_time)) + if as_supported_case: + results.append( + benchmark.Timer( + stmt= + "output = allspark_w8a16_gemm(a, qw_reorder, s_reorder, zp_reorder, size_n, group_size, sm_count, sm_version, CUBLAS_M_THRESHOLD, False, True)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="allspark_w8a16_gemm_fp32", + ).blocked_autorange(min_run_time=min_run_time)) + def main(args): print("Benchmarking models:") for i, model in enumerate(args.models): print(f"[{i}] {model}") - results: List[benchmark.Measurement] = [] + results: list[benchmark.Measurement] = [] for model in args.models: for layer in WEIGHT_SHAPES[model]: diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 1fa0da75c79d2..491f8c3962f73 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -1,8 +1,12 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse +import json import time +from contextlib import nullcontext from datetime import datetime from itertools import product -from typing import Any, Dict, List, Tuple, TypedDict +from typing import Any, TypedDict import ray import torch @@ -14,8 +18,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import * from vllm.platforms import current_platform from vllm.utils import FlexibleArgumentParser -FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm( -) else torch.float8_e4m3fn +FP8_DTYPE = current_platform.fp8_dtype() class BenchmarkConfig(TypedDict): @@ -38,6 +41,7 @@ def benchmark_config( use_fp8_w8a8: bool, use_int8_w8a16: bool, num_iters: int = 100, + block_quant_shape: list[int] = None, ) -> float: init_dtype = torch.float16 if use_fp8_w8a8 else dtype x = torch.randn(num_tokens, hidden_size, dtype=dtype) @@ -79,8 +83,24 @@ def benchmark_config( dtype=torch.float32) w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32) if use_fp8_w8a8: - w1_scale = torch.randn(num_experts, dtype=torch.float32) - w2_scale = torch.randn(num_experts, dtype=torch.float32) + if block_quant_shape: + block_n, block_k = block_quant_shape[0], block_quant_shape[1] + E = num_experts + N = shard_intermediate_size // 2 + K = hidden_size + factor_for_scale = 1e-2 + n_tiles_w1 = (2 * N + block_n - 1) // block_n + n_tiles_w2 = (K + block_n - 1) // block_n + k_tiles_w1 = (K + block_k - 1) // block_k + k_tiles_w2 = (N + block_k - 1) // block_k + w1_scale = torch.rand((E, n_tiles_w1, k_tiles_w1), + dtype=torch.float32) * factor_for_scale + w2_scale = torch.rand((E, n_tiles_w2, k_tiles_w2), + dtype=torch.float32) * factor_for_scale + else: + w1_scale = torch.randn(num_experts, dtype=torch.float32) + w2_scale = torch.randn(num_experts, dtype=torch.float32) + a1_scale = torch.randn(1, dtype=torch.float32) a2_scale = torch.randn(1, dtype=torch.float32) @@ -109,6 +129,7 @@ def benchmark_config( w2_scale=w2_scale, a1_scale=a1_scale, a2_scale=a2_scale, + block_shape=block_quant_shape, ) # JIT compilation & warmup @@ -130,7 +151,7 @@ def benchmark_config( start_event = torch.cuda.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True) - latencies: List[float] = [] + latencies: list[float] = [] for i in range(num_iters): prepare(i) torch.cuda.synchronize() @@ -173,8 +194,9 @@ def get_rocm_tuning_space(use_fp16): return param_ranges -def
get_configs_compute_bound(use_fp16) -> List[Dict[str, int]]: - configs: List[BenchmarkConfig] = [] +def get_configs_compute_bound(use_fp16, + block_quant_shape) -> list[dict[str, int]]: + configs: list[BenchmarkConfig] = [] if current_platform.is_rocm(): param_ranges = get_rocm_tuning_space(use_fp16) @@ -202,17 +224,27 @@ def get_configs_compute_bound(use_fp16) -> List[Dict[str, int]]: for config_values in product(*values): config = dict(zip(keys, config_values)) configs.append(config) + + # Remove configs that are not compatible with fp8 block quantization + # BLOCK_SIZE_K must be a multiple of block_k + # BLOCK_SIZE_N must be a multiple of block_n + if block_quant_shape is not None and not use_fp16: + block_n, block_k = block_quant_shape[0], block_quant_shape[1] + for config in configs[:]: + if config["BLOCK_SIZE_K"] % block_k != 0 or config[ + "BLOCK_SIZE_N"] % block_n != 0: + configs.remove(config) return configs def prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size, - search_space, is_fp16): + search_space, is_fp16, topk): N1, K1 = shard_intermediate_size, hidden_size N2, K2 = hidden_size, shard_intermediate_size // 2 - pruned_space_1 = prune_rocm_configs(num_tokens * 2, N1, K1, search_space, - is_fp16) - pruned_space_2 = prune_rocm_configs(num_tokens * 2, N2, K2, search_space, - is_fp16) + pruned_space_1 = prune_rocm_configs(num_tokens * topk, N1, K1, + search_space, is_fp16) + pruned_space_2 = prune_rocm_configs(num_tokens * topk, N2, K2, + search_space, is_fp16) search_space = merge_unique_dicts(pruned_space_1, pruned_space_2) return search_space @@ -333,7 +365,8 @@ class BenchmarkWorker: dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, - ) -> Tuple[Dict[str, int], float]: + block_quant_shape: list[int] = None, + ) -> tuple[dict[str, int], float]: current_platform.seed_everything(self.seed) dtype_str = get_config_dtype_str(dtype, use_int8_w8a16=use_int8_w8a16, @@ -343,16 +376,27 @@ class BenchmarkWorker: op_config = get_moe_configs(num_experts, shard_intermediate_size // 2, dtype_str) if op_config is None: - config = get_default_config(num_tokens, num_experts, - shard_intermediate_size, hidden_size, - topk, dtype_str) + config = get_default_config(num_tokens, + num_experts, + shard_intermediate_size, + hidden_size, + topk, + dtype_str, + is_marlin=False) else: config = op_config[min(op_config.keys(), key=lambda x: abs(x - num_tokens))] - kernel_time = benchmark_config(config, num_tokens, num_experts, - shard_intermediate_size, hidden_size, - topk, dtype, use_fp8_w8a8, - use_int8_w8a16) + kernel_time = benchmark_config(config, + num_tokens, + num_experts, + shard_intermediate_size, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + num_iters=100, + block_quant_shape=block_quant_shape) return config, kernel_time def tune( @@ -365,8 +409,9 @@ class BenchmarkWorker: dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, - search_space: List[Dict[str, int]], - ) -> Dict[str, int]: + search_space: list[dict[str, int]], + block_quant_shape: list[int], + ) -> dict[str, int]: best_config = None best_time = float("inf") if current_platform.is_rocm(): @@ -374,21 +419,24 @@ class BenchmarkWorker: search_space = prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size, search_space, - is_fp16) + is_fp16, topk) - with torch.cuda.device(self.device_id): + with torch.cuda.device(self.device_id) if current_platform.is_rocm( + ) else nullcontext(): for config in tqdm(search_space): try: - kernel_time =
benchmark_config(config, - num_tokens, - num_experts, - shard_intermediate_size, - hidden_size, - topk, - dtype, - use_fp8_w8a8, - use_int8_w8a16, - num_iters=20) + kernel_time = benchmark_config( + config, + num_tokens, + num_experts, + shard_intermediate_size, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + num_iters=20, + block_quant_shape=block_quant_shape) except triton.runtime.autotuner.OutOfResources: # Some configurations may be invalid and fail to compile. continue @@ -428,10 +476,10 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: } -def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int, +def save_configs(configs: dict[int, BenchmarkConfig], num_experts: int, shard_intermediate_size: int, hidden_size: int, topk: int, - dtype: torch.dtype, use_fp8_w8a8: bool, - use_int8_w8a16: bool) -> None: + dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, + block_quant_shape: list[int]) -> None: dtype_str = get_config_dtype_str(dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8) @@ -439,7 +487,7 @@ def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int, # NOTE(woosuk): The current naming convention uses w2.shape[2], which # is the intermediate size after silu_and_mul. filename = get_config_file_name(num_experts, shard_intermediate_size // 2, - dtype_str) + dtype_str, block_quant_shape) print(f"Writing best config to {filename}...") with open(filename, "w") as f: @@ -447,10 +495,19 @@ def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int, f.write("\n") +def get_weight_block_size_safety(config, default_value=None): + + quantization_config = getattr(config, 'quantization_config', {}) + if isinstance(quantization_config, dict): + return quantization_config.get('weight_block_size', default_value) + return default_value + + def main(args: argparse.Namespace): print(args) - - config = AutoConfig.from_pretrained(args.model) + block_quant_shape = None + config = AutoConfig.from_pretrained( + args.model, trust_remote_code=args.trust_remote_code) if config.architectures[0] == "DbrxForCausalLM": E = config.ffn_config.moe_num_experts topk = config.ffn_config.moe_top_k @@ -461,6 +518,18 @@ def main(args: argparse.Namespace): topk = config.num_experts_per_tok intermediate_size = config.intermediate_size shard_intermediate_size = 2 * intermediate_size // args.tp_size + elif (config.architectures[0] == "DeepseekV3ForCausalLM" + or config.architectures[0] == "DeepseekV2ForCausalLM"): + E = config.n_routed_experts + topk = config.num_experts_per_tok + intermediate_size = config.moe_intermediate_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size + block_quant_shape = get_weight_block_size_safety(config) + elif config.architectures[0] == "Qwen2MoeForCausalLM": + E = config.num_experts + topk = config.num_experts_per_tok + intermediate_size = config.moe_intermediate_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size else: # Default: Mixtral.
E = config.num_local_experts @@ -485,7 +554,7 @@ def main(args: argparse.Namespace): num_gpus = int(ray.available_resources()["GPU"]) workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)] - def _distribute(method: str, inputs: List[Any]) -> List[Any]: + def _distribute(method: str, inputs: list[Any]) -> list[Any]: outputs = [] worker_idx = 0 for input_args in inputs: @@ -498,27 +567,30 @@ def main(args: argparse.Namespace): if args.tune: is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) - search_space = get_configs_compute_bound(is_fp16) + search_space = get_configs_compute_bound(is_fp16, block_quant_shape) print(f"Start tuning over {len(search_space)} configurations...") start = time.time() configs = _distribute( - "tune", [(batch_size, E, shard_intermediate_size, hidden_size, - topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space) - for batch_size in batch_sizes]) + "tune", + [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype, + use_fp8_w8a8, use_int8_w8a16, search_space, block_quant_shape) + for batch_size in batch_sizes]) best_configs = { M: sort_config(config) for M, config in zip(batch_sizes, configs) } save_configs(best_configs, E, shard_intermediate_size, hidden_size, - topk, dtype, use_fp8_w8a8, use_int8_w8a16) + topk, dtype, use_fp8_w8a8, use_int8_w8a16, + block_quant_shape) end = time.time() print(f"Tuning took {end - start:.2f} seconds") else: outputs = _distribute( - "benchmark", [(batch_size, E, shard_intermediate_size, hidden_size, - topk, dtype, use_fp8_w8a8, use_int8_w8a16) - for batch_size in batch_sizes]) + "benchmark", + [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype, + use_fp8_w8a8, use_int8_w8a16, block_quant_shape) + for batch_size in batch_sizes]) for batch_size, (config, kernel_time) in zip(batch_sizes, outputs): print(f"Batch size: {batch_size}, config: {config}") @@ -530,7 +602,11 @@ if __name__ == "__main__": parser.add_argument("--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1") - parser.add_argument("--tp-size", "-tp", type=int, default=2) + parser.add_argument("--tp-size", + "-tp", + "--tensor-parallel-size", + type=int, + default=2) parser.add_argument("--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], @@ -538,6 +614,7 @@ if __name__ == "__main__": parser.add_argument("--seed", type=int, default=0) parser.add_argument("--batch-size", type=int, required=False) parser.add_argument("--tune", action="store_true") + parser.add_argument("--trust-remote-code", action="store_true") args = parser.parse_args() main(args) diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 219013a38134b..48b351bc48141 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -1,6 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 + import random import time -from typing import List, Optional +from typing import Optional import torch @@ -9,8 +11,9 @@ from vllm.platforms import current_platform from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, create_kv_caches_with_random) -NUM_BLOCKS = 1024 +NUM_BLOCKS = 128 * 1024 PARTITION_SIZE = 512 +PARTITION_SIZE_ROCM = 256 @torch.inference_mode() @@ -52,7 +55,7 @@ def main( # Create the block tables. 
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size - block_tables_lst: List[List[int]] = [] + block_tables_lst: list[list[int]] = [] for _ in range(num_seqs): block_table = [ random.randint(0, NUM_BLOCKS - 1) @@ -78,6 +81,12 @@ def main( # Prepare for the paged attention kernel. output = torch.empty_like(query) if version == "v2": + if current_platform.is_rocm(): + global PARTITION_SIZE + if not args.custom_paged_attn: + PARTITION_SIZE = 1024 + else: + PARTITION_SIZE = PARTITION_SIZE_ROCM num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE) tmp_output = torch.empty( size=(num_seqs, num_query_heads, num_partitions, head_size), @@ -121,32 +130,53 @@ def main( v_scale, ) elif version == "v2": - ops.paged_attention_v2( - output, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - block_tables, - seq_lens, - block_size, - max_seq_len, - alibi_slopes, - kv_cache_dtype, - k_scale, - v_scale, - ) + if not args.custom_paged_attn: + ops.paged_attention_v2( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + seq_lens, + block_size, + max_seq_len, + alibi_slopes, + kv_cache_dtype, + k_scale, + v_scale, + ) + else: + ops.paged_attention_rocm( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + seq_lens, + block_size, + max_seq_len, + alibi_slopes, + kv_cache_dtype, + k_scale, + v_scale, + ) else: raise ValueError(f"Invalid version: {version}") torch.cuda.synchronize() end_time = time.perf_counter() if profile: - torch.cuda.cudart().cudaProfilerStart() + torch.cuda.cudart().cudaProfilerStop() return (end_time - start_time) / num_iters # Warmup. @@ -193,6 +223,9 @@ if __name__ == '__main__': help="Data type for kv cache storage. If 'auto', will use model " "data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. " "ROCm (AMD GPU) supports fp8 (=fp8_e4m3)") + parser.add_argument("--custom-paged-attn", + action="store_true", + help="Use custom paged attention") args = parser.parse_args() print(args) diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py index 1d62483448946..b643897a60eef 100644 --- a/benchmarks/kernels/benchmark_quant.py +++ b/benchmarks/kernels/benchmark_quant.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time import torch @@ -38,7 +40,7 @@ def main(num_tokens: int, end_time = time.perf_counter() if profile: - torch.cuda.cudart().cudaProfilerStart() + torch.cuda.cudart().cudaProfilerStop() return (end_time - start_time) / num_iters # Warmup. 
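Both benchmark_paged_attention.py and benchmark_quant.py above flip the trailing profiler call from `cudaProfilerStart()` to `cudaProfilerStop()`, so the capture window opened before the timed loop is closed once timing finishes. A minimal sketch of that bracketing pattern (hypothetical helper; `time_kernel` and `run_once` are illustrative names, and the matching start call sits earlier in those run functions, outside this hunk):

```python
import time

import torch


def time_kernel(run_once, num_iters: int, profile: bool = False) -> float:
    """Average seconds per iteration, optionally wrapped in a CUDA profiler range."""
    torch.cuda.synchronize()
    if profile:
        # Opens the capture window, e.g. for `nsys profile --capture-range=cudaProfilerApi`.
        torch.cuda.cudart().cudaProfilerStart()
    start = time.perf_counter()
    for _ in range(num_iters):
        run_once()
    torch.cuda.synchronize()
    end = time.perf_counter()
    if profile:
        # This is the call the Start -> Stop fix restores: close the window after timing.
        torch.cuda.cudart().cudaProfilerStop()
    return (end - start) / num_iters
```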
diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py index baa5de0fff1bd..eaf6b25e8ca4f 100644 --- a/benchmarks/kernels/benchmark_rmsnorm.py +++ b/benchmarks/kernels/benchmark_rmsnorm.py @@ -1,5 +1,7 @@ +# SPDX-License-Identifier: Apache-2.0 + import itertools -from typing import Optional, Tuple, Union +from typing import Optional, Union import torch import triton @@ -20,7 +22,7 @@ class HuggingFaceRMSNorm(nn.Module): self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: orig_dtype = x.dtype x = x.to(torch.float32) if residual is not None: @@ -137,7 +139,7 @@ def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True): print(f"Naive output={output_naive}") print(f"FlashInfer output={output_flashinfer}") - print(f"VLLM output={output_vllm}") + print(f"vLLM output={output_vllm}") if torch.allclose(output_naive, output_flashinfer, atol=1e-2, rtol=1e-2) and torch.allclose( diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 250d505168d09..05d24fc4b16d4 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -1,5 +1,7 @@ +# SPDX-License-Identifier: Apache-2.0 + from itertools import accumulate -from typing import List, Optional +from typing import Optional import nvtx import torch @@ -37,7 +39,7 @@ def benchmark_rope_kernels_multi_lora( }) # non-batched RoPE takes only one scaling factor, we create multiple # instances to simulate the same behavior - non_batched_ropes: List[RotaryEmbedding] = [] + non_batched_ropes: list[RotaryEmbedding] = [] for scaling_factor in scaling_factors: non_batched_ropes.append( get_rope(head_size, rotary_dim, max_position, base, is_neox_style, diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py index 4eeeca35a37cc..c375e61e41873 100644 --- a/benchmarks/kernels/benchmark_shapes.py +++ b/benchmarks/kernels/benchmark_shapes.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + WEIGHT_SHAPES = { "ideal": [[4 * 256 * 32, 256 * 32]], "mistralai/Mistral-7B-v0.1/TP1": [ diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py new file mode 100644 index 0000000000000..8f07bc8ca52eb --- /dev/null +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -0,0 +1,420 @@ +# SPDX-License-Identifier: Apache-2.0 +# Adapted from sglang quantization/tuning_block_wise_kernel.py + +import argparse +import json +import multiprocessing as mp +import os +import time +from datetime import datetime +from typing import Any + +import torch +import tqdm +import triton + +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + _w8a8_block_fp8_matmul) +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser + +mp.set_start_method("spawn", force=True) + +assert current_platform.is_cuda( +), "Only support tune w8a8 block fp8 kernel on CUDA device." + +DTYPE_MAP = { + "float32": torch.float32, + "float16": torch.float16, + "half": torch.half, + "bfloat16": torch.bfloat16, +} + + +def w8a8_block_matmul( + A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + block_size: list[int], + config: dict[str, Any], + output_dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + """This function performs matrix multiplication with + block-wise quantization. 
+ + It takes two input tensors `A` and `B` with scales `As` and `Bs`. + The output is returned in the specified `output_dtype`. + + Args: + A: The input tensor, e.g., activation. + B: The input tensor, e.g., weight. + As: The per-token-group quantization scale for `A`. + Bs: The per-block quantization scale for `B`. + block_size: The block size for per-block quantization. + It should be 2-dim, e.g., [128, 128]. + output_dytpe: The dtype of the returned tensor. + + Returns: + torch.Tensor: The result of matmul. + """ + assert len(block_size) == 2 + block_n, block_k = block_size[0], block_size[1] + + assert A.shape[-1] == B.shape[-1] + assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous() + assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1] + M = A.numel() // A.shape[-1] + + assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2 + N, K = B.shape + assert triton.cdiv(N, block_n) == Bs.shape[0] + assert triton.cdiv(K, block_k) == Bs.shape[1] + + C_shape = A.shape[:-1] + (N, ) + C = A.new_empty(C_shape, dtype=output_dtype) + + def grid(META): + return (triton.cdiv(M, META["BLOCK_SIZE_M"]) * + triton.cdiv(N, META["BLOCK_SIZE_N"]), ) + + if A.dtype == torch.float8_e4m3fn: + kernel = _w8a8_block_fp8_matmul + else: + raise RuntimeError( + "Currently, only support tune w8a8 block fp8 kernel.") + + kernel[grid]( + A, + B, + C, + As, + Bs, + M, + N, + K, + block_n, + block_k, + A.stride(-2), + A.stride(-1), + B.stride(1), + B.stride(0), + C.stride(-2), + C.stride(-1), + As.stride(-2), + As.stride(-1), + Bs.stride(1), + Bs.stride(0), + **config, + ) + + return C + + +def get_configs_compute_bound(): + configs = [] + for num_stages in [2, 3, 4, 5]: + for block_m in [16, 32, 64, 128, 256]: + for block_k in [64, 128]: + for block_n in [32, 64, 128, 256]: + for num_warps in [4, 8]: + for group_size in [1, 16, 32, 64]: + configs.append({ + "BLOCK_SIZE_M": block_m, + "BLOCK_SIZE_N": block_n, + "BLOCK_SIZE_K": block_k, + "GROUP_SIZE_M": group_size, + "num_warps": num_warps, + "num_stages": num_stages, + }) + return configs + + +def get_weight_shapes(tp_size): + # NOTE(HandH1998): The weight shapes only works for DeepSeek-V3. + # Modify them, if you tune for another different model. 
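+    # For example, with --tp-size 8 the N-sharded shape (18432 * 2, 7168) below
+    # becomes (4608, 7168) and the K-sharded shape (7168, 18432) becomes
+    # (7168, 2304); the shapes in `total` are never split.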
+ # cannot TP + total = [ + (512 + 64, 7168), + ((128 + 64) * 128, 7168), + (128 * (128 + 128), 512), + (7168, 16384), + (7168, 18432), + ] + # N can TP + n_tp = [ + (18432 * 2, 7168), + ((128 + 64) * 128, 7168), + (128 * (128 + 128), 512), + (24576, 1536), + (12288, 7168), + (4096, 7168), + ] + # K can TP + k_tp = [(7168, 18432), (7168, 16384), (7168, 2048)] + + weight_shapes = [] + for t in total: + weight_shapes.append(t) + for n_t in n_tp: + new_t = (n_t[0] // tp_size, n_t[1]) + weight_shapes.append(new_t) + for k_t in k_tp: + new_t = (k_t[0], k_t[1] // tp_size) + weight_shapes.append(new_t) + return weight_shapes + + +def benchmark_config(A, + B, + As, + Bs, + block_size, + config, + out_dtype=torch.float16, + num_iters=10): + + def run(): + w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype) + + torch.cuda.synchronize() + # JIT complication & warmup + for _ in range(5): + run() + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + latencies: list[float] = [] + for i in range(num_iters): + torch.cuda.synchronize() + start_event.record() + run() + end_event.record() + end_event.synchronize() + latencies.append(start_event.elapsed_time(end_event)) + avg = sum(latencies) / (num_iters * 10) * 1000 # us + return avg + + +def tune(M, N, K, block_size, out_dtype, search_space, input_type): + factor_for_scale = 1e-2 + + if input_type == "fp8": + fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_max, fp8_min = fp8_info.max, fp8_info.min + + A_fp32 = ( + (torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * + fp8_max) + A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + + B_fp32 = ( + (torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * + fp8_max) + B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + else: + raise RuntimeError( + "Currently, only support tune w8a8 block fp8 kernel.") + + block_n, block_k = block_size[0], block_size[1] + n_tiles = (N + block_n - 1) // block_n + k_tiles = (K + block_k - 1) // block_k + + As = torch.rand(M, k_tiles, dtype=torch.float32, + device="cuda") * factor_for_scale + Bs = (torch.rand(n_tiles, k_tiles, dtype=torch.float32, device="cuda") * + factor_for_scale) + + best_config = None + best_time = float("inf") + for config in tqdm(search_space): + try: + kernel_time = benchmark_config( + A, + B, + As, + Bs, + block_size, + config, + out_dtype, + num_iters=10, + ) + except triton.runtime.autotuner.OutOfResources: + # Some configurations may be invalid and fail to compile. 
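+            # (Triton's autotuner typically raises OutOfResources when a tile shape
+            #  needs more shared memory or other launch resources than the device
+            #  offers, so such configs are skipped rather than treated as errors.)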
+ continue + + if kernel_time < best_time: + best_time = kernel_time + best_config = config + now = datetime.now() + print(f"{now.ctime()}] Completed tuning for batch_size={M}") + assert best_config is not None + return best_config + + +def save_configs( + N, + K, + block_n, + block_k, + configs, + save_path, + input_type="fp8", +) -> None: + os.makedirs(save_path, exist_ok=True) + device_name = current_platform.get_device_name().replace(" ", "_") + json_file_name = ( + f"N={N},K={K},device_name={device_name},dtype={input_type}_w8a8," + f"block_shape=[{block_n},{block_k}].json") + + config_file_path = os.path.join(save_path, json_file_name) + print(f"Writing best config to {config_file_path}...") + + with open(config_file_path, "w") as f: + json.dump(configs, f, indent=4) + f.write("\n") + + +def tune_on_gpu(args_dict): + """Run tuning on a specific GPU.""" + gpu_id = args_dict["gpu_id"] + batch_sizes = args_dict["batch_sizes"] + weight_shapes = args_dict["weight_shapes"] + args = args_dict["args"] + + torch.cuda.set_device(gpu_id) + print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}") + + block_n = args.block_n + block_k = args.block_k + out_dtype = DTYPE_MAP[args.out_dtype] + save_path = args.save_path + input_type = args.input_type + + search_space = get_configs_compute_bound() + search_space = [ + config for config in search_space + if block_k % config["BLOCK_SIZE_K"] == 0 + ] + + start = time.time() + for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"): + N, K = shape[0], shape[1] + print(f"[GPU {gpu_id}] Tune for weight shape of `N: {N}, K: {K}`") + benchmark_results = [ + tune( + batch_size, + N, + K, + [block_n, block_k], + out_dtype, + search_space, + input_type, + ) for batch_size in tqdm(batch_sizes, + desc=f"GPU {gpu_id} - Batch sizes") + ] + best_configs = { + M: config + for M, config in zip(batch_sizes, benchmark_results) + } + save_configs(N, K, block_n, block_k, best_configs, save_path, + input_type) + + end = time.time() + print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds") + + +def distribute_batch_sizes(batch_sizes, num_gpus): + """Distribute batch sizes across available GPUs.""" + batches_per_gpu = [] + for i in range(num_gpus): + start_idx = i * len(batch_sizes) // num_gpus + end_idx = (i + 1) * len(batch_sizes) // num_gpus + batches_per_gpu.append(batch_sizes[start_idx:end_idx]) + return batches_per_gpu + + +def main(args): + print(args) + num_gpus = torch.cuda.device_count() + if num_gpus == 0: + raise RuntimeError("No GPU available for tuning") + print(f"Found {num_gpus} GPUs for parallel tuning") + + torch.cuda.init() + + if args.batch_size is None: + batch_sizes = [ + 1, + 2, + 4, + 8, + 16, + 24, + 32, + 48, + 64, + 96, + 128, + 256, + 512, + 1024, + 1536, + 2048, + 3072, + 4096, + ] + else: + batch_sizes = [args.batch_size] + num_gpus = 1 # If only one batch size, use only one GPU + + weight_shapes = get_weight_shapes(args.tp_size) + + batches_per_gpu = distribute_batch_sizes(batch_sizes, num_gpus) + + process_args = [] + for gpu_id in range(num_gpus): + process_args.append({ + "gpu_id": gpu_id, + "batch_sizes": batches_per_gpu[gpu_id], + "weight_shapes": + weight_shapes, # Each GPU processes all weight shapes + "args": args, + }) + + ctx = mp.get_context("spawn") + with ctx.Pool(num_gpus) as pool: + pool.map(tune_on_gpu, process_args) + + print("Multi-GPU tuning completed") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description=""" +Tune triton w8a8 block fp8 for DeepSeek-V3/DeepSeek-R1: + 
python3 benchmark_w8a8_block_fp8.py --tp-size 8 --input-type fp8 +Then copy to model_executor/layers/quantization/utils/configs + """, + formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument("--tp-size", "-tp", type=int, default=8) + parser.add_argument("--input-type", + type=str, + choices=["fp8"], + default="fp8") + parser.add_argument( + "--out-dtype", + type=str, + choices=["float32", "float16", "bfloat16", "half"], + default="float16", + ) + parser.add_argument("--block-n", type=int, default=128) + parser.add_argument("--block-k", type=int, default=128) + parser.add_argument("--batch-size", type=int, required=False) + parser.add_argument("--save-path", type=str, default="./") + args = parser.parse_args() + + main(args) diff --git a/benchmarks/kernels/deepgemm/README.md b/benchmarks/kernels/deepgemm/README.md new file mode 100644 index 0000000000000..917e814010f89 --- /dev/null +++ b/benchmarks/kernels/deepgemm/README.md @@ -0,0 +1,129 @@ +# DeepSeek DeepGEMM Kernels Benchmark + +This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels. + +Currently this just includes dense GEMMs and only works on Hopper GPUs. + +## Setup + +You need to install vLLM in your usual fashion, then install DeepGEMM from source in its own directory: + +``` +git clone --recursive https://github.com/deepseek-ai/DeepGEMM +cd DeepGEMM +python setup.py install +uv pip install -e . +``` + +## Usage + +``` +python benchmark_fp8_block_dense_gemm.py +INFO 02-26 21:55:13 [__init__.py:207] Automatically detected platform cuda. +===== STARTING FP8 GEMM BENCHMARK ===== +PyTorch version: 2.5.1+cu124 +CUDA version: 12.4 +Triton version: 3.1.0 +Using device: NVIDIA H100 80GB HBM3 +WARNING 02-26 21:55:15 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +INFO 02-26 21:55:15 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel. +WARNING 02-26 21:55:16 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=18432,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +WARNING 02-26 21:55:17 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +INFO 02-26 21:55:17 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel. +INFO 02-26 21:55:17 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel. 
+ +===== PERFORMANCE COMPARISON ===== + +DeepGEMM Implementation: ++------+-------+-------+-----------+--------+--------+ +| m | n | k | Time (μs) | TFLOPS | GB/s | ++------+-------+-------+-----------+--------+--------+ +| 8 | 4096 | 7168 | 102.9 | 4.6 | 286.4 | +| 8 | 7168 | 18432 | 70.8 | 29.8 | 1868.8 | +| 8 | 18432 | 7168 | 69.3 | 30.5 | 1911.8 | +| 64 | 4096 | 7168 | 69.1 | 54.4 | 439.0 | +| 64 | 7168 | 18432 | 69.4 | 243.6 | 1933.6 | +| 64 | 18432 | 7168 | 70.4 | 240.3 | 1917.2 | +| 64 | 24576 | 1536 | 70.1 | 68.9 | 584.6 | +| 64 | 32768 | 512 | 68.4 | 31.4 | 307.1 | +| 64 | 7168 | 16384 | 69.5 | 216.3 | 1718.5 | +| 128 | 4096 | 7168 | 141.1 | 53.3 | 222.1 | +| 128 | 7168 | 18432 | 71.9 | 470.5 | 1896.1 | +| 128 | 18432 | 7168 | 69.3 | 488.2 | 1988.2 | +| 1024 | 4096 | 7168 | 89.7 | 670.1 | 502.5 | +| 1024 | 18432 | 7168 | 279.0 | 969.8 | 635.2 | +| 2048 | 4096 | 7168 | 175.1 | 687.0 | 347.4 | +| 4096 | 4096 | 7168 | 335.4 | 717.0 | 275.1 | ++------+-------+-------+-----------+--------+--------+ + +vLLM Triton Implementation: ++------+-------+-------+-----------+--------+--------+--------------+ +| m | n | k | Time (μs) | TFLOPS | GB/s | vs DeepGEMM | ++------+-------+-------+-----------+--------+--------+--------------+ +| 8 | 4096 | 7168 | 74.0 | 6.3 | 398.2 | 1.39x faster | +| 8 | 7168 | 18432 | 89.6 | 23.6 | 1478.1 | 0.79x slower | +| 8 | 18432 | 7168 | 113.2 | 18.7 | 1170.4 | 0.61x slower | +| 64 | 4096 | 7168 | 79.4 | 47.3 | 382.2 | 0.87x slower | +| 64 | 7168 | 18432 | 98.5 | 171.7 | 1363.0 | 0.70x slower | +| 64 | 18432 | 7168 | 119.5 | 141.5 | 1129.4 | 0.59x slower | +| 64 | 24576 | 1536 | 37.6 | 128.4 | 1089.7 | 1.86x faster | +| 64 | 32768 | 512 | 38.7 | 55.5 | 542.6 | 1.77x faster | +| 64 | 7168 | 16384 | 86.1 | 174.5 | 1386.4 | 0.81x slower | +| 128 | 4096 | 7168 | 90.7 | 82.9 | 345.4 | 1.56x faster | +| 128 | 7168 | 18432 | 144.0 | 234.9 | 946.9 | 0.50x slower | +| 128 | 18432 | 7168 | 229.5 | 147.4 | 600.1 | 0.30x slower | +| 1024 | 4096 | 7168 | 242.3 | 248.2 | 186.1 | 0.37x slower | +| 1024 | 18432 | 7168 | 897.8 | 301.4 | 197.4 | 0.31x slower | +| 2048 | 4096 | 7168 | 463.0 | 259.7 | 131.4 | 0.38x slower | +| 4096 | 4096 | 7168 | 901.8 | 266.7 | 102.3 | 0.37x slower | ++------+-------+-------+-----------+--------+--------+--------------+ + +vLLM CUTLASS Implementation: ++------+-------+-------+-----------+--------+--------+--------------+--------------+ +| m | n | k | Time (μs) | TFLOPS | GB/s | vs DeepGEMM | vs Triton | ++------+-------+-------+-----------+--------+--------+--------------+--------------+ +| 8 | 4096 | 7168 | 34.6 | 13.6 | 852.3 | 2.98x faster | 2.14x faster | +| 8 | 7168 | 18432 | 78.9 | 26.8 | 1677.3 | 0.90x slower | 1.13x faster | +| 8 | 18432 | 7168 | 81.2 | 26.0 | 1631.1 | 0.85x slower | 1.39x faster | +| 64 | 4096 | 7168 | 36.9 | 101.9 | 822.9 | 1.87x faster | 2.15x faster | +| 64 | 7168 | 18432 | 87.4 | 193.4 | 1535.2 | 0.79x slower | 1.13x faster | +| 64 | 18432 | 7168 | 85.0 | 199.0 | 1587.6 | 0.83x slower | 1.41x faster | +| 64 | 24576 | 1536 | 28.0 | 172.8 | 1465.8 | 2.51x faster | 1.35x faster | +| 64 | 32768 | 512 | 28.8 | 74.5 | 728.5 | 2.37x faster | 1.34x faster | +| 64 | 7168 | 16384 | 77.9 | 193.0 | 1532.8 | 0.89x slower | 1.11x faster | +| 128 | 4096 | 7168 | 39.1 | 192.4 | 802.0 | 3.61x faster | 2.32x faster | +| 128 | 7168 | 18432 | 93.7 | 360.8 | 1454.2 | 0.77x slower | 1.54x faster | +| 128 | 18432 | 7168 | 85.7 | 394.8 | 1608.0 | 0.81x slower | 2.68x faster | +| 1024 | 4096 | 7168 | 99.7 | 603.1 | 452.2 | 0.90x slower | 
2.43x faster | +| 1024 | 18432 | 7168 | 331.3 | 816.7 | 534.9 | 0.84x slower | 2.71x faster | +| 2048 | 4096 | 7168 | 198.3 | 606.6 | 306.7 | 0.88x slower | 2.34x faster | +| 4096 | 4096 | 7168 | 392.2 | 613.2 | 235.3 | 0.86x slower | 2.30x faster | ++------+-------+-------+-----------+--------+--------+--------------+--------------+ + +===== AVERAGE PERFORMANCE ===== ++----------------+------------+----------+---------------+ +| Implementation | Avg TFLOPS | Avg GB/s | Avg Time (ms) | ++----------------+------------+----------+---------------+ +| DeepGEMM | 310.98 | 1052.10 | 0.11 | +| vLLM Triton | 144.30 | 715.60 | 0.23 | +| vLLM CUTLASS | 286.78 | 1076.67 | 0.11 | ++----------------+------------+----------+---------------+ + +===== AVERAGE SPEEDUPS ===== ++-----------------------------+--------------+ +| Comparison | Speedup | ++-----------------------------+--------------+ +| DeepGEMM vs vLLM Triton | 1.71x faster | +| DeepGEMM vs vLLM CUTLASS | 0.94x slower | +| vLLM CUTLASS vs vLLM Triton | 1.84x faster | ++-----------------------------+--------------+ + +===== ACCURACY COMPARISON ===== ++----------------+-----------------------+ +| Implementation | Avg Diff vs Reference | ++----------------+-----------------------+ +| DeepGEMM | 0.000684 | +| vLLM Triton | 0.000684 | +| vLLM CUTLASS | 0.000684 | ++----------------+-----------------------+ +``` diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py new file mode 100644 index 0000000000000..7892f126e7d69 --- /dev/null +++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py @@ -0,0 +1,464 @@ +# SPDX-License-Identifier: Apache-2.0 +# fmt: off +# ruff: noqa: E501 +import time + +# Import DeepGEMM functions +import deep_gemm +import torch +import triton +from deep_gemm import calc_diff, ceil_div, get_col_major_tma_aligned_tensor + +# Import vLLM functions +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + per_token_group_quant_fp8, w8a8_block_fp8_matmul) + + +# Copied from +# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L9 +def per_token_cast_to_fp8( + x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """Convert tensor to FP8 format with per-token scaling.""" + assert x.dim() == 2 and x.size(1) % 128 == 0 + m, n = x.shape + x_view = x.view(m, -1, 128) + x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) + return (x_view * (448.0 / x_amax.unsqueeze(2))).to( + torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1) + + +# Copied from +# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L17 +def per_block_cast_to_fp8( + x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """Convert tensor to FP8 format with per-block scaling.""" + assert x.dim() == 2 + m, n = x.shape + x_padded = torch.zeros((ceil_div(m, 128) * 128, ceil_div(n, 128) * 128), + dtype=x.dtype, + device=x.device) + x_padded[:m, :n] = x + x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128) + x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) + x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) + return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( + x_amax / 448.0).view(x_view.size(0), x_view.size(2)) + + +def benchmark_shape(m: int, + n: int, + k: int, + warmup: int = 100, + repeat: int = 10000, + verbose: bool = False) -> dict: + 
"""Benchmark all implementations for a specific (m, n, k) shape.""" + if verbose: + print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===") + + # Create test tensors + A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16) + B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16) + + # Reference result in BF16 + torch.cuda.synchronize() + C_ref = A @ B.t() + + # Pre-quantize B for all implementations + # (weights can be pre-quantized offline) + B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B) + B_vllm, B_scale_vllm = per_block_cast_to_fp8(B) + + # Block size configuration + block_size = [128, 128] + + # Pre-quantize A for all implementations + A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A) + A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm) + C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) + A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) + A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8( + A, block_size[1], column_major_scales=True) + + # === DeepGEMM Implementation === + def deepgemm_gemm(): + # A quantization is inside the loop as it depends on activations + # A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A) + # A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8( + # A, block_size[1]) + # A_scale_aligned = get_col_major_tma_aligned_tensor(A_scale_deepgemm) + # C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) + deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm), + (B_deepgemm, B_scale_deepgemm), + C_deepgemm) + return C_deepgemm + + # === vLLM Triton Implementation === + def vllm_triton_gemm(): + # A quantization is inside the loop as it depends on activations + # A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) + return w8a8_block_fp8_matmul(A_vllm, + B_vllm, + A_scale_vllm, + B_scale_vllm, + block_size, + output_dtype=torch.bfloat16) + + # === vLLM CUTLASS Implementation === + def vllm_cutlass_gemm(): + # A quantization is inside the loop as it depends on activations + # A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8( + # A, block_size[1], column_major_scales=True) + return ops.cutlass_scaled_mm(A_vllm_cutlass, + B_vllm.T, + scale_a=A_scale_vllm_cutlass, + scale_b=B_scale_vllm.T, + out_dtype=torch.bfloat16) + + # Run correctness check first + if verbose: + print("Running correctness check...") + C_deepgemm = deepgemm_gemm() + C_vllm_triton = vllm_triton_gemm() + C_vllm_cutlass = vllm_cutlass_gemm() + + deepgemm_diff = calc_diff(C_deepgemm, C_ref) + vllm_triton_diff = calc_diff(C_vllm_triton, C_ref) + vllm_cutlass_diff = calc_diff(C_vllm_cutlass, C_ref) + + if verbose: + print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}") + print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}") + print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}") + print("vLLM Triton vs DeepGEMM difference: " + f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}") + print("vLLM CUTLASS vs DeepGEMM difference: " + f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}") + + # Benchmark implementations + implementations = { + "DeepGEMM": deepgemm_gemm, + "vLLM Triton": vllm_triton_gemm, + "vLLM CUTLASS": vllm_cutlass_gemm + } + + benchmark_results = { + "shape": { + "m": m, + "n": n, + "k": k + }, + "implementations": {} + } + + for name, func in implementations.items(): + # Warmup + for _ in range(warmup): + func() + torch.cuda.synchronize() + + # Timing loop + torch.cuda.synchronize() + start = 
time.time() + for _ in range(repeat): + func() + torch.cuda.synchronize() + end = time.time() + + # Calculate timing and TFLOPS + avg_time_ms = (end - start) / repeat * 1000 + avg_time_us = avg_time_ms * 1000 + tflops = 2 * m * n * k / (avg_time_ms * 1e-3) / 1e12 + gb_s = (m * k + k * n + m * n * 2) / 1e9 / (avg_time_ms * 1e-3) + + benchmark_results["implementations"][name] = { + "time_ms": avg_time_ms, + "time_us": avg_time_us, + "tflops": tflops, + "gb_s": gb_s, + "diff": { + "DeepGEMM": + 0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm), + "Reference": + deepgemm_diff if name == "DeepGEMM" else + (vllm_triton_diff + if name == "vLLM Triton" else vllm_cutlass_diff) + } + } + + if verbose: + print( + f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s" + ) + + # Calculate speedups + baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"] + for name, data in benchmark_results["implementations"].items(): + if name != "DeepGEMM": + speedup = baseline / data["time_ms"] + benchmark_results["implementations"][name][ + "speedup_vs_deepgemm"] = speedup + if verbose: + print(f"DeepGEMM is {1/speedup:.2f}x " + f"{'faster' if 1/speedup > 1 else 'slower'} than {name}") + + vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][ + "time_ms"] + vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][ + "time_ms"] + cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time + benchmark_results["implementations"]["vLLM CUTLASS"][ + "speedup_vs_triton"] = cutlass_vs_triton + if verbose: + print( + f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x " + f"{'faster' if cutlass_vs_triton > 1 else 'slower'} than vLLM Triton" + ) + + return benchmark_results + + +def format_table_row(values, widths): + """Format a row with specified column widths.""" + return "| " + " | ".join(f"{val:{w}}" + for val, w in zip(values, widths)) + " |" + + +def print_table(headers, rows, title=None): + """Print a table with headers and rows.""" + if title: + print(f"\n{title}") + + # Calculate column widths based on headers and data + widths = [ + max(len(str(h)), max(len(str(row[i])) for row in rows)) + for i, h in enumerate(headers) + ] + + # Create separator line + separator = "+-" + "-+-".join("-" * w for w in widths) + "-+" + + # Print table + print(separator) + print(format_table_row(headers, widths)) + print(separator) + for row in rows: + print(format_table_row(row, widths)) + print(separator) + + +def format_speedup(value): + """Format speedup value with indicator if it's faster or slower.""" + return f"{value:.2f}x {'faster' if value > 1.0 else 'slower'}" + + +def run_benchmarks(verbose: bool = False): + """Run benchmarks for a set of common shapes.""" + print("===== STARTING FP8 GEMM BENCHMARK =====") + + # Make sure we're using the GPU + if not torch.cuda.is_available(): + print("CUDA not available! 
Tests require GPU.") + return + + # Print system information + print(f"PyTorch version: {torch.__version__}") + print(f"CUDA version: {torch.version.cuda}") + print(f"Triton version: {triton.__version__}") + print(f"Using device: {torch.cuda.get_device_name()}") + + # Enable TF32 for better performance + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + + # Set seeds for reproducibility + torch.manual_seed(42) + torch.cuda.manual_seed(42) + + # Define benchmark shapes (m, n, k) + shapes = [ + (8, 4096, 7168), + (8, 7168, 18432), + (8, 18432, 7168), + (64, 4096, 7168), + (64, 7168, 18432), + (64, 18432, 7168), + (64, 24576, 1536), + (64, 32768, 512), + (64, 7168, 16384), + (128, 4096, 7168), + (128, 7168, 18432), + (128, 18432, 7168), + (1024, 4096, 7168), + (1024, 18432, 7168), + (2048, 4096, 7168), + (4096, 4096, 7168), + ] + shapes = [ + # (64, 2112, 7168), + (64, 24576, 1536), + (64, 32768, 512), + (64, 7168, 16384), + (64, 4096, 7168), + (64, 7168, 2048), + # (128, 2112, 7168), + (128, 24576, 1536), + (128, 32768, 512), + (128, 7168, 16384), + (128, 4096, 7168), + (128, 7168, 2048), + # (4096, 2112, 7168), + (4096, 24576, 1536), + (4096, 32768, 512), + (4096, 7168, 16384), + (4096, 4096, 7168), + (4096, 7168, 2048), + ] + + all_results = [] + for m, n, k in shapes: + result = benchmark_shape(m, n, k, verbose=verbose) + all_results.append(result) + + # Print results in a nicely formatted table + print("\n===== PERFORMANCE COMPARISON =====") + + # Print DeepGEMM table + deepgemm_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s"] + deepgemm_rows = [] + for result in all_results: + shape = result["shape"] + impl_data = result["implementations"]["DeepGEMM"] + deepgemm_rows.append([ + shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", + f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}" + ]) + + print_table(deepgemm_headers, + deepgemm_rows, + title="DeepGEMM Implementation:") + + # Print vLLM Triton table + triton_headers = [ + "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM" + ] + triton_rows = [] + for result in all_results: + shape = result["shape"] + impl_data = result["implementations"]["vLLM Triton"] + speedup = impl_data.get("speedup_vs_deepgemm", 1.0) + triton_rows.append([ + shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", + f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}", + format_speedup(speedup) + ]) + + print_table(triton_headers, + triton_rows, + title="vLLM Triton Implementation:") + + # Print vLLM CUTLASS table + cutlass_headers = [ + "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM", + "vs Triton" + ] + cutlass_rows = [] + for result in all_results: + shape = result["shape"] + impl_data = result["implementations"]["vLLM CUTLASS"] + vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0) + vs_triton = impl_data.get("speedup_vs_triton", 1.0) + cutlass_rows.append([ + shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", + f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}", + format_speedup(vs_deepgemm), + format_speedup(vs_triton) + ]) + + print_table(cutlass_headers, + cutlass_rows, + title="vLLM CUTLASS Implementation:") + + # Calculate and print averages + print("\n===== AVERAGE PERFORMANCE =====") + + implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"] + avg_metrics = { + impl: { + "tflops": 0, + "gb_s": 0, + "time_ms": 0 + } + for impl in implementations + } + + for result in all_results: + for impl in implementations: + impl_data 
= result["implementations"][impl] + avg_metrics[impl]["tflops"] += impl_data["tflops"] + avg_metrics[impl]["gb_s"] += impl_data["gb_s"] + avg_metrics[impl]["time_ms"] += impl_data["time_ms"] + + num_shapes = len(all_results) + avg_headers = ["Implementation", "Avg TFLOPS", "Avg GB/s", "Avg Time (ms)"] + avg_rows = [] + + for impl in implementations: + avg_tflops = avg_metrics[impl]["tflops"] / num_shapes + avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes + avg_time = avg_metrics[impl]["time_ms"] / num_shapes + avg_rows.append([ + impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}" + ]) + + print_table(avg_headers, avg_rows) + + # Calculate average speedups + avg_speedups = { + "DeepGEMM vs vLLM Triton": 0, + "DeepGEMM vs vLLM CUTLASS": 0, + "vLLM CUTLASS vs vLLM Triton": 0 + } + + for result in all_results: + deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"] + vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"] + vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][ + "time_ms"] + + avg_speedups[ + "DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time + avg_speedups[ + "DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time + avg_speedups[ + "vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time + + print("\n===== AVERAGE SPEEDUPS =====") + speedup_headers = ["Comparison", "Speedup"] + speedup_rows = [] + for comparison, total in avg_speedups.items(): + avg_speedup = total / num_shapes + status = "faster" if avg_speedup > 1 else "slower" + speedup_rows.append([comparison, f"{avg_speedup:.2f}x {status}"]) + + print_table(speedup_headers, speedup_rows) + + # Average accuracy comparison + print("\n===== ACCURACY COMPARISON =====") + avg_diff = {impl: 0 for impl in implementations} + + for result in all_results: + for impl in implementations: + avg_diff[impl] += result["implementations"][impl]["diff"][ + "Reference"] + + diff_headers = ["Implementation", "Avg Diff vs Reference"] + diff_rows = [] + for impl in implementations: + diff_rows.append([impl, f"{avg_diff[impl] / num_shapes:.6f}"]) + + print_table(diff_headers, diff_rows) + + +if __name__ == "__main__": + run_benchmarks(verbose=False) diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py index 7d0bd84150a27..bd62173a7b3a6 100644 --- a/benchmarks/kernels/graph_machete_bench.py +++ b/benchmarks/kernels/graph_machete_bench.py @@ -1,8 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 + import math import pickle import re from collections import defaultdict -from typing import List import matplotlib.pyplot as plt import pandas as pd @@ -21,7 +22,7 @@ if __name__ == "__main__": with open(args.filename, 'rb') as f: data = pickle.load(f) - raw_results: List[TMeasurement] = data["results"] + raw_results: list[TMeasurement] = data["results"] results = defaultdict(lambda: list()) for v in raw_results: diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py index fee877b6f76fa..ac64f786f1840 100644 --- a/benchmarks/kernels/utils.py +++ b/benchmarks/kernels/utils.py @@ -1,5 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses -from typing import Any, Callable, Iterable, Optional +from collections.abc import Iterable +from typing import Any, Callable, Optional import torch import torch.utils.benchmark as TBenchmark diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py index 51f24f3ba1774..89b05d5882a38 100644 --- a/benchmarks/kernels/weight_shapes.py +++ 
b/benchmarks/kernels/weight_shapes.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Weight Shapes are in the format # ([K, N], TP_SPLIT_DIM) # Example: diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py index d16d6f9fba442..5f94552e9dc85 100644 --- a/benchmarks/overheads/benchmark_hashing.py +++ b/benchmarks/overheads/benchmark_hashing.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import cProfile import pstats diff --git a/benchmarks/run_structured_output_benchmark.sh b/benchmarks/run_structured_output_benchmark.sh new file mode 100755 index 0000000000000..126dfbc244161 --- /dev/null +++ b/benchmarks/run_structured_output_benchmark.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +# Define the model to use +MODEL=${1:-"Qwen/Qwen2.5-7B-Instruct"} + +# Define the backend to use +BACKEND=${2:-"vllm"} + +# Define the dataset to use +DATASET=${3:-"xgrammar_bench"} + +# Define the guided decoding backend +GUIDED_BACKEND=${4:-"xgrammar"} + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +OUTPUT_DIR=${5:-"$SCRIPT_DIR/structured_output_benchmark_results"} + +GUIDED_RATIO=${6:-0.5} + +# Create output directory if it doesn't exist +mkdir -p "$OUTPUT_DIR" + +# Define QPS values to test +QPS_VALUES=(70 60 50 25 20 15 10) + +# Common parameters +COMMON_PARAMS="--backend $BACKEND \ + --model $MODEL \ + --dataset $DATASET \ + --structured-output-backend $GUIDED_BACKEND \ + --structured-output-ratio $GUIDED_RATIO \ + --save-results \ + --result-dir $OUTPUT_DIR" + +echo "Starting structured output benchmark with model: $MODEL" +echo "Backend: $BACKEND" +echo "Dataset: $DATASET" +echo "Structured output backend: $GUIDED_BACKEND" +echo "Results will be saved to: $OUTPUT_DIR" +echo "----------------------------------------" + +# Run benchmarks with different QPS values +for qps in "${QPS_VALUES[@]}"; do + echo "Running benchmark with QPS: $qps" + + # Get git hash and branch for the filename + GIT_HASH=$(git rev-parse --short HEAD 2>/dev/null || echo "unknown") + GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown") + + # Construct filename for this run + FILENAME="${GUIDED_BACKEND}_${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json" + + # Run the benchmark + python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \ + --request-rate $qps \ + --result-filename "$FILENAME" \ + --tokenizer-mode ${TOKENIZER_MODE:-"auto"} \ + --port ${PORT:-8000} + + echo "Completed benchmark with QPS: $qps" + echo "----------------------------------------" +done + +echo "All benchmarks completed!" +echo "Results saved to: $OUTPUT_DIR" diff --git a/benchmarks/structured_schemas/structured_schema_1.json b/benchmarks/structured_schemas/structured_schema_1.json index 6003698469e8d..13bd6b6d16c60 100644 --- a/benchmarks/structured_schemas/structured_schema_1.json +++ b/benchmarks/structured_schemas/structured_schema_1.json @@ -1,113 +1,19 @@ { - "$schema": - "https://json-schema.org/draft/2020-12/schema", - "title": - "User Profile", - "type": - "object", + "type": "object", "properties": { - "userId": { - "type": "string", - "description": "Unique identifier for the user." - }, - "personalInfo": { - "type": "object", - "properties": { - "firstName": { - "type": "string", - "description": "The user's first name." - }, - "lastName": { - "type": "string", - "description": "The user's last name." - }, - "age": { - "type": "integer", - "minimum": 0, - "description": "The user's age." 
- }, - "phoneNumbers": { - "type": - "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": ["home", "work", "mobile"], - "description": "Type of phone number." - }, - "number": { - "type": "string", - "pattern": "^\\+?[1-9]\\d{1,14}$", - "description": "Phone number in E.164 format." - } - }, - "required": ["type", "number"] - }, - "description": - "List of phone numbers associated with the user." - } - }, - "required": ["firstName", "lastName"] - }, - "address": { - "type": "object", - "properties": { - "street": { - "type": "string", - "description": "Street address." - }, - "city": { - "type": "string", - "description": "City name." - }, - "state": { - "type": "string", - "description": "State or province." - }, - "postalCode": { - "type": "string", - "pattern": "^\\d{5}(-\\d{4})?$", - "description": "Postal code." - }, - "country": { - "type": "string", - "description": "Country name." - } - }, - "required": ["street", "city", "state", "postalCode", "country"] - }, - "preferences": { - "type": "object", - "properties": { - "newsletterSubscribed": { - "type": - "boolean", - "description": - "Indicates if the user is subscribed to the newsletter." - }, - "favoriteCategories": { - "type": "array", - "items": { - "type": "string" - }, - "description": "List of user's favorite categories." - } - }, - "required": ["newsletterSubscribed"] - }, - "accountStatus": { - "type": "string", - "enum": ["active", "inactive", "suspended"], - "description": "Current status of the user's account." - }, - "registrationDate": { - "type": "string", - "format": "date-time", - "description": "ISO 8601 formatted date-time of user registration." - } + "name": { "type": "string" }, + "email": { "type": "string" }, + "street": { "type": "string" }, + "city": { "type": "string" }, + "state": { "type": "string" }, + "zip": { "type": "string" }, + "phone": { "type": "string" }, + "website": { "type": "string" }, + "company": { "type": "string" }, + "age": { "type": "integer" } }, - "required": - ["userId", "personalInfo", "address", "accountStatus", "registrationDate"] -} \ No newline at end of file + "required": [ + "name", + "email" + ] +} diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 714abca2a5ff7..b57d9e2263109 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -81,6 +81,7 @@ else() find_isa(${CPUINFO} "POWER9" POWER9_FOUND) find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support + find_isa(${CPUINFO} "S390" S390_FOUND) endif() @@ -129,8 +130,16 @@ elseif (ASIMD_FOUND) elseif(APPLE_SILICON_FOUND) message(STATUS "Apple Silicon Detected") set(ENABLE_NUMA OFF) +elseif (S390_FOUND) + message(STATUS "S390 detected") + # Check for S390 VXE support + list(APPEND CXX_COMPILE_FLAGS + "-mvx" + "-mzvector" + "-march=native" + "-mtune=native") else() - message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.") + message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA or ARMv8 support.") endif() # @@ -140,7 +149,7 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED) FetchContent_Declare( oneDNN GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git - GIT_TAG v3.6 + GIT_TAG v3.7.1 GIT_PROGRESS TRUE GIT_SHALLOW TRUE ) @@ -181,6 +190,7 @@ set(VLLM_EXT_SRC "csrc/cpu/cache.cpp" "csrc/cpu/utils.cpp" "csrc/cpu/layernorm.cpp" + "csrc/cpu/mla_decode.cpp" "csrc/cpu/pos_encoding.cpp" 
"csrc/cpu/torch_bindings.cpp") diff --git a/cmake/external_projects/flashmla.cmake b/cmake/external_projects/flashmla.cmake new file mode 100644 index 0000000000000..6291475164baa --- /dev/null +++ b/cmake/external_projects/flashmla.cmake @@ -0,0 +1,66 @@ +include(FetchContent) + +# If FLASH_MLA_SRC_DIR is set, flash-mla is installed from that directory +# instead of downloading. +# It can be set as an environment variable or passed as a cmake argument. +# The environment variable takes precedence. +if (DEFINED ENV{FLASH_MLA_SRC_DIR}) + set(FLASH_MLA_SRC_DIR $ENV{FLASH_MLA_SRC_DIR}) +endif() + +if(FLASH_MLA_SRC_DIR) + FetchContent_Declare( + flashmla + SOURCE_DIR ${FLASH_MLA_SRC_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + ) +else() + FetchContent_Declare( + flashmla + GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git + GIT_TAG 575f7724b9762f265bbee5889df9c7d630801845 + GIT_PROGRESS TRUE + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + ) +endif() + + +FetchContent_MakeAvailable(flashmla) +message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}") + +# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later. +# Only build FlashMLA kernels if we are building for something compatible with +# sm90a +cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}") +if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS) + set(FlashMLA_SOURCES + ${flashmla_SOURCE_DIR}/csrc/flash_api.cpp + ${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_bf16_sm90.cu + ${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_fp16_sm90.cu + ${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_metadata.cu) + + set(FlashMLA_INCLUDES + ${flashmla_SOURCE_DIR}/csrc/cutlass/include + ${flashmla_SOURCE_DIR}/csrc/include) + + set_gencode_flags_for_srcs( + SRCS "${FlashMLA_SOURCES}" + CUDA_ARCHS "${FLASH_MLA_ARCHS}") + + define_gpu_extension_target( + _flashmla_C + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${FlashMLA_SOURCES} + COMPILE_FLAGS ${VLLM_GPU_FLAGS} + ARCHITECTURES ${VLLM_GPU_ARCHES} + INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES} + USE_SABI 3 + WITH_SOABI) +else() + # Create an empty target for setup.py when not targeting sm90a systems + add_custom_target(_flashmla_C) +endif() + diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake new file mode 100644 index 0000000000000..afd7c47e8ac00 --- /dev/null +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -0,0 +1,67 @@ +# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target +# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the +# arches in the CUDA case (and instead set the gencodes on a per file basis) +# we need to manually set VLLM_GPU_ARCHES here. +if(VLLM_GPU_LANG STREQUAL "CUDA") + foreach(_ARCH ${CUDA_ARCHS}) + string(REPLACE "." "" _ARCH "${_ARCH}") + list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real") + endforeach() +endif() + +# +# Build vLLM flash attention from source +# +# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM. +# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs. +# They should be identical but if they aren't, this is a massive footgun. +# +# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place. +# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3). 
+# If no component is specified, vllm-flash-attn is still installed. + +# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading. +# This is to enable local development of vllm-flash-attn within vLLM. +# It can be set as an environment variable or passed as a cmake argument. +# The environment variable takes precedence. +if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR}) + set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR}) +endif() + +if(VLLM_FLASH_ATTN_SRC_DIR) + FetchContent_Declare( + vllm-flash-attn SOURCE_DIR + ${VLLM_FLASH_ATTN_SRC_DIR} + BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn + ) +else() + FetchContent_Declare( + vllm-flash-attn + GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git + GIT_TAG dc9d410b3e2d6534a4c70724c2515f4def670a22 + GIT_PROGRESS TRUE + # Don't share the vllm-flash-attn build between build types + BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn + ) +endif() + + +# Fetch the vllm-flash-attn library +FetchContent_MakeAvailable(vllm-flash-attn) +message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}") + +# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in +# case only one is built, in the case both are built redundant work is done) +install( + DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ + DESTINATION vllm_flash_attn + COMPONENT _vllm_fa2_C + FILES_MATCHING PATTERN "*.py" +) + +install( + DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ + DESTINATION vllm_flash_attn + COMPONENT _vllm_fa3_C + FILES_MATCHING PATTERN "*.py" +) diff --git a/cmake/hipify.py b/cmake/hipify.py index 340e41c8179e3..a15577125eb1f 100755 --- a/cmake/hipify.py +++ b/cmake/hipify.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 # # A command line tool for running pytorch's hipify preprocessor on CUDA diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 15b09395a889f..c9cd099b82a75 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -257,9 +257,9 @@ endmacro() # where `<=` is the version comparison operator. # In other words, for each version in `TGT_CUDA_ARCHS` find the highest version # in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`. -# We have special handling for 9.0a, if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is -# in `TGT_CUDA_ARCHS` then we should remove 9.0a from `SRC_CUDA_ARCHS` and add -# 9.0a to the result. +# We have special handling for x.0a, if x.0a is in `SRC_CUDA_ARCHS` and x.0 is +# in `TGT_CUDA_ARCHS` then we should remove x.0a from `SRC_CUDA_ARCHS` and add +# x.0a to the result (and remove x.0 from TGT_CUDA_ARCHS). # The result is stored in `OUT_CUDA_ARCHS`. 
# # Example: @@ -270,34 +270,55 @@ endmacro() # function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS) list(REMOVE_DUPLICATES SRC_CUDA_ARCHS) + set(TGT_CUDA_ARCHS_ ${TGT_CUDA_ARCHS}) - # if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in CUDA_ARCHS then we should - # remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS + # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should + # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS set(_CUDA_ARCHS) if ("9.0a" IN_LIST SRC_CUDA_ARCHS) list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a") - if ("9.0" IN_LIST TGT_CUDA_ARCHS) + if ("9.0" IN_LIST TGT_CUDA_ARCHS_) + list(REMOVE_ITEM TGT_CUDA_ARCHS_ "9.0") set(_CUDA_ARCHS "9.0a") endif() endif() + if ("10.0a" IN_LIST SRC_CUDA_ARCHS) + list(REMOVE_ITEM SRC_CUDA_ARCHS "10.0a") + if ("10.0" IN_LIST TGT_CUDA_ARCHS) + list(REMOVE_ITEM TGT_CUDA_ARCHS_ "10.0") + set(_CUDA_ARCHS "10.0a") + endif() + endif() + list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) - # for each ARCH in CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that is - # less or eqault to ARCH - foreach(_ARCH ${CUDA_ARCHS}) - set(_TMP_ARCH) - foreach(_SRC_ARCH ${SRC_CUDA_ARCHS}) - if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH) - set(_TMP_ARCH ${_SRC_ARCH}) - else() - break() + # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that + # is less or equal to ARCH (but has the same major version since SASS binary + # compatibility is only forward compatible within the same major version). + foreach(_ARCH ${TGT_CUDA_ARCHS_}) + set(_TMP_ARCH) + # Extract the major version of the target arch + string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" TGT_ARCH_MAJOR "${_ARCH}") + foreach(_SRC_ARCH ${SRC_CUDA_ARCHS}) + # Extract the major version of the source arch + string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" SRC_ARCH_MAJOR "${_SRC_ARCH}") + # Check major-version match AND version-less-or-equal + if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH) + if (SRC_ARCH_MAJOR STREQUAL TGT_ARCH_MAJOR) + set(_TMP_ARCH "${_SRC_ARCH}") + endif() + else() + # If we hit a version greater than the target, we can break + break() + endif() + endforeach() + + # If we found a matching _TMP_ARCH, append it to _CUDA_ARCHS + if (_TMP_ARCH) + list(APPEND _CUDA_ARCHS "${_TMP_ARCH}") endif() endforeach() - if (_TMP_ARCH) - list(APPEND _CUDA_ARCHS ${_TMP_ARCH}) - endif() - endforeach() list(REMOVE_DUPLICATES _CUDA_ARCHS) set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE) diff --git a/collect_env.py b/collect_env.py index 254c19b19a5ac..0ec9d4cae4ba7 100644 --- a/collect_env.py +++ b/collect_env.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # ruff: noqa # code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py diff --git a/csrc/cache.h b/csrc/cache.h index eedad9fafa3c0..0970b704be3ab 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -15,6 +15,9 @@ void copy_blocks(std::vector const& key_caches, std::vector const& value_caches, const torch::Tensor& block_mapping); +void copy_blocks_mla(std::vector const& kv_caches, + const torch::Tensor& block_mapping); + void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, @@ -28,6 +31,18 @@ void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value, const std::string& kv_cache_dtype, torch::Tensor& k_scale, torch::Tensor& v_scale); +void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe, + torch::Tensor& kv_cache, torch::Tensor& 
slot_mapping, + const std::string& kv_cache_dtype, + torch::Tensor& scale); + // Just for unittest void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, const double scale, const std::string& kv_cache_dtype); + +void gather_cache( + torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...] + torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...] + torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] + torch::Tensor const& cu_seq_lens, // [BATCH+1] + int64_t batch_size, std::optional seq_starts = std::nullopt); \ No newline at end of file diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 21a0aec0ececc..0b3f6fc8c19a8 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -2,6 +2,7 @@ #include #include +#include "cuda_utils.h" #include "cuda_compat.h" #include "dispatch_utils.h" @@ -46,7 +47,10 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst, char* src_ptr = static_cast(src.data_ptr()); char* dst_ptr = static_cast(dst.data_ptr()); - const int64_t block_size_in_bytes = src.element_size() * src[0].numel(); + // We use the stride instead of numel in case the cache is padded for memory + // alignment reasons, we assume the blocks data (inclusive of any padding) + // is contiguous in memory + const int64_t block_size_in_bytes = src.element_size() * src.stride(0); const at::cuda::OptionalCUDAGuard device_guard( src_device.is_cuda() ? src_device : dst_device); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); @@ -93,6 +97,24 @@ __global__ void copy_blocks_kernel(int64_t* key_cache_ptrs, } } +// Kernel for MLA, which works on a single joint kv_cache +// Grid: (num_layers, num_pairs) +template +__global__ void copy_blocks_mla_kernel( + int64_t* cache_ptrs, const int64_t* __restrict__ block_mapping, + const int mem_footprint_per_block) { + const int layer_idx = blockIdx.x; + const int pair_idx = blockIdx.y; + scalar_t* cache = reinterpret_cast(cache_ptrs[layer_idx]); + int64_t src_block = block_mapping[2 * pair_idx]; + int64_t dst_block = block_mapping[2 * pair_idx + 1]; + int64_t src_offset = src_block * mem_footprint_per_block; + int64_t dst_offset = dst_block * mem_footprint_per_block; + for (int i = threadIdx.x; i < mem_footprint_per_block; i += blockDim.x) { + cache[dst_offset + i] = cache[src_offset + i]; + } +} + } // namespace vllm // Note: the key_caches and value_caches vectors are constant but @@ -147,6 +169,42 @@ void copy_blocks(std::vector const& key_caches, })); } +// copy blocks kernel for MLA (assumes a joint KV-cache) +void copy_blocks_mla(std::vector const& kv_caches, + const torch::Tensor& block_mapping) { + int num_layers = kv_caches.size(); + if (num_layers == 0) { + return; + } + torch::Device cache_device = kv_caches[0].device(); + TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA"); + + std::vector cache_ptrs(num_layers); + for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) { + cache_ptrs[layer_idx] = + reinterpret_cast(kv_caches[layer_idx].data_ptr()); + } + torch::Tensor cache_ptrs_tensor = + torch::from_blob(cache_ptrs.data(), {num_layers}, torch::kInt64) + .to(cache_device); + + int num_pairs = block_mapping.size(0); + // We use the stride instead of numel in case the cache is padded for memory + // alignment reasons, we assume the blocks data (inclusive of any padding) + // is contiguous in memory + int mem_footprint_per_block = kv_caches[0].stride(0); + dim3 grid(num_layers, num_pairs); + dim3 block(std::min(1024, mem_footprint_per_block)); + const 
at::cuda::OptionalCUDAGuard device_guard(cache_device); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES( + kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] { + vllm::copy_blocks_mla_kernel<<>>( + cache_ptrs_tensor.data_ptr(), + block_mapping.data_ptr(), mem_footprint_per_block); + })); +} + namespace vllm { template @@ -245,10 +303,55 @@ __global__ void reshape_and_cache_flash_kernel( } } } + +template +__global__ void concat_and_cache_mla_kernel( + const scalar_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank] + const scalar_t* __restrict__ k_pe, // [num_tokens, pe_dim] + cache_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank + // + pe_dim)] + const int64_t* __restrict__ slot_mapping, // [num_tokens] + const int block_stride, // + const int entry_stride, // + const int kv_c_stride, // + const int k_pe_stride, // + const int kv_lora_rank, // + const int pe_dim, // + const int block_size, // + const float* scale // +) { + const int64_t token_idx = blockIdx.x; + const int64_t slot_idx = slot_mapping[token_idx]; + // NOTE: slot_idx can be -1 if the token is padded + if (slot_idx < 0) { + return; + } + const int64_t block_idx = slot_idx / block_size; + const int64_t block_offset = slot_idx % block_size; + + auto copy = [&](const scalar_t* __restrict__ src, cache_t* __restrict__ dst, + int src_stride, int dst_stride, int size, int offset) { + for (int i = threadIdx.x; i < size; i += blockDim.x) { + const int64_t src_idx = token_idx * src_stride + i; + const int64_t dst_idx = + block_idx * block_stride + block_offset * entry_stride + i + offset; + if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { + dst[dst_idx] = src[src_idx]; + } else { + dst[dst_idx] = + fp8::scaled_convert(src[src_idx], *scale); + } + } + }; + + copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0); + copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank); +} + } // namespace vllm -// KV_T is the stored data type of kv-cache. -// CACHE_T is the data type of key and value tensors. +// KV_T is the data type of key and value tensors. +// CACHE_T is the stored data type of kv-cache. // KV_DTYPE is the real data type of kv-cache. #define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \ vllm::reshape_and_cache_kernel \ @@ -272,7 +375,7 @@ void reshape_and_cache( torch::Tensor& slot_mapping, // [num_tokens] const std::string& kv_cache_dtype, torch::Tensor& k_scale, torch::Tensor& v_scale) { - int num_tokens = key.size(0); + int num_tokens = slot_mapping.size(0); int num_heads = key.size(1); int head_size = key.size(2); int block_size = key_cache.size(3); @@ -290,8 +393,8 @@ void reshape_and_cache( CALL_RESHAPE_AND_CACHE) } -// KV_T is the stored data type of kv-cache. -// CACHE_T is the data type of key and value tensors. +// KV_T is the data type of key and value tensors. +// CACHE_T is the stored data type of kv-cache. // KV_DTYPE is the real data type of kv-cache. #define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \ vllm::reshape_and_cache_flash_kernel \ @@ -343,6 +446,57 @@ void reshape_and_cache_flash( CALL_RESHAPE_AND_CACHE_FLASH); } +// KV_T is the data type of key and value tensors. +// CACHE_T is the stored data type of kv-cache. +// KV_DTYPE is the real data type of kv-cache. 
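// Illustrative note (added for clarity, not part of the original patch):
// each entry of the joint MLA cache is a contiguous run of
// (kv_lora_rank + pe_dim) elements, with the compressed KV ("kv_c") stored at
// offsets [0, kv_lora_rank) and the rotary part ("k_pe") at
// [kv_lora_rank, kv_lora_rank + pe_dim). For example, assuming
// block_size = 16, kv_lora_rank = 512 and pe_dim = 64, a token mapped to
// slot 37 lands in block_idx = 37 / 16 = 2 at block_offset = 37 % 16 = 5,
// so its kv_c values are written to
//   kv_cache[2 * block_stride + 5 * entry_stride + (0 .. 511)]
// and its k_pe values to
//   kv_cache[2 * block_stride + 5 * entry_stride + (512 .. 575)].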
+#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \ + vllm::concat_and_cache_mla_kernel \ + <<>>( \ + reinterpret_cast(kv_c.data_ptr()), \ + reinterpret_cast(k_pe.data_ptr()), \ + reinterpret_cast(kv_cache.data_ptr()), \ + slot_mapping.data_ptr(), block_stride, entry_stride, \ + kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \ + reinterpret_cast(scale.data_ptr())); + +void concat_and_cache_mla( + torch::Tensor& kv_c, // [num_tokens, kv_lora_rank] + torch::Tensor& k_pe, // [num_tokens, pe_dim] + torch::Tensor& kv_cache, // [num_blocks, block_size, (kv_lora_rank + + // pe_dim)] + torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens] + const std::string& kv_cache_dtype, torch::Tensor& scale) { + // NOTE(woosuk): In vLLM V1, key.size(0) can be different from + // slot_mapping.size(0) because of padding for CUDA graphs. + // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because + // both include padding. + // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0) + // since key includes padding for CUDA graphs, while slot_mapping does not. + // In this case, slot_mapping.size(0) represents the actual number of tokens + // before padding. + // For compatibility with both cases, we use slot_mapping.size(0) as the + // number of tokens. + int num_tokens = slot_mapping.size(0); + int kv_lora_rank = kv_c.size(1); + int pe_dim = k_pe.size(1); + int block_size = kv_cache.size(1); + + TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim); + + int kv_c_stride = kv_c.stride(0); + int k_pe_stride = k_pe.stride(0); + int block_stride = kv_cache.stride(0); + int entry_stride = kv_cache.stride(1); + + dim3 grid(num_tokens); + dim3 block(std::min(kv_lora_rank, 512)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype, + CALL_CONCAT_AND_CACHE_MLA); +} + namespace vllm { template @@ -417,3 +571,161 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, TORCH_CHECK(false, "Unsupported data type: ", kv_cache_dtype); } } + +namespace vllm { + +// grid is launched with dimensions (batch, num_splits) +template +__global__ void gather_cache( + const scalar_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, + // ENTRIES...] + scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...] 
+ const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES] + const int32_t* __restrict__ cu_seq_lens, // [BATCH+1] + const int32_t block_size, const int32_t entry_size, + const int64_t block_table_stride, const int64_t cache_block_stride, + const int64_t cache_entry_stride, const int64_t dst_entry_stride, + const int32_t* __restrict__ seq_starts) { // Optional: starting offsets per + // batch + + const int64_t bid = blockIdx.x; // Batch ID + const int32_t num_splits = gridDim.y; + const int32_t split = blockIdx.y; + const int32_t seq_start = cu_seq_lens[bid]; + const int32_t seq_end = cu_seq_lens[bid + 1]; + const int32_t seq_len = seq_end - seq_start; + const int32_t tot_blocks = cuda_utils::ceil_div(seq_len, block_size); + const int32_t split_blocks = cuda_utils::ceil_div(tot_blocks, num_splits); + + const int32_t split_start = split * split_blocks; + const int32_t split_end = min((split + 1) * split_blocks, tot_blocks); + + const bool is_active_split = (split_start < tot_blocks); + const bool is_last_split = (split_end == tot_blocks); + + if (!is_active_split) return; + + int32_t full_blocks_end = split_end; + int32_t partial_block_size = 0; + + // Adjust the pointer for the block_table for this batch. + // If seq_starts is provided, compute an offset based on (seq_starts[bid] / + // page_size) + const int32_t batch_offset = bid * block_table_stride; + int32_t offset = 0; + if (seq_starts != nullptr) { + offset = seq_starts[bid] / block_size; + } + const int32_t* batch_block_table = block_table + batch_offset + offset; + + // Adjust dst pointer based on the cumulative sequence lengths. + dst += seq_start * dst_entry_stride; + + if (is_last_split) { + partial_block_size = seq_len % block_size; + if (partial_block_size) full_blocks_end -= 1; + } + + auto copy_entry = [&](const scalar_t* __restrict__ _src, + scalar_t* __restrict__ _dst) { + for (int i = threadIdx.x; i < entry_size; i += blockDim.x) + _dst[i] = _src[i]; + }; + + for (int pid = split_start; pid < full_blocks_end; ++pid) { + auto block_id = batch_block_table[pid]; + auto block_start_ptr = src_cache + block_id * cache_block_stride; + auto block_dst_ptr = dst + pid * block_size * dst_entry_stride; + for (int eid = 0; eid < block_size; ++eid) { + copy_entry(block_start_ptr + eid * cache_entry_stride, + block_dst_ptr + eid * dst_entry_stride); + } + } + + if (partial_block_size) { + auto block_id = batch_block_table[full_blocks_end]; + auto block_start_ptr = src_cache + block_id * cache_block_stride; + auto block_dst_ptr = dst + full_blocks_end * block_size * dst_entry_stride; + for (int eid = 0; eid < partial_block_size; ++eid) { + copy_entry(block_start_ptr + eid * cache_entry_stride, + block_dst_ptr + eid * dst_entry_stride); + } + } +} + +} // namespace vllm + +// Macro to dispatch the kernel based on the data type. +#define CALL_GATHER_CACHE(CPY_DTYPE) \ + vllm::gather_cache<<>>( \ + reinterpret_cast(src_cache.data_ptr()), \ + reinterpret_cast(dst.data_ptr()), \ + block_table.data_ptr(), cu_seq_lens.data_ptr(), \ + block_size, entry_size, block_table_stride, cache_block_stride, \ + cache_entry_stride, dst_entry_stride, seq_starts_ptr); + +// Gather sequences from the cache into the destination tensor. 
+// - cu_seq_lens contains the cumulative sequence lengths for each batch +// - block_table contains the cache block indices for each sequence +// - Optionally, seq_starts (if provided) offsets the starting block index by +// (seq_starts[bid] / page_size) +void gather_cache( + torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...] + torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...] + torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] + torch::Tensor const& cu_seq_lens, // [BATCH+1] + int64_t batch_size, + std::optional seq_starts = std::nullopt) { + at::cuda::OptionalCUDAGuard device_guard(src_cache.device()); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + int32_t block_size = src_cache.size(1); + int32_t entry_size = src_cache.flatten(2, -1).size(2); + + TORCH_CHECK(block_table.dtype() == torch::kInt32, + "block_table must be int32"); + TORCH_CHECK(cu_seq_lens.dtype() == torch::kInt32, + "cu_seq_lens must be int32"); + if (seq_starts.has_value()) { + TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32, + "seq_starts must be int32"); + } + + TORCH_CHECK(src_cache.device() == dst.device(), + "src_cache and dst must be on the same device"); + TORCH_CHECK(src_cache.device() == block_table.device(), + "src_cache and block_table must be on the same device"); + TORCH_CHECK(src_cache.device() == cu_seq_lens.device(), + "src_cache and cu_seq_lens must be on the same device"); + if (seq_starts.has_value()) { + TORCH_CHECK(src_cache.device() == seq_starts.value().device(), + "src_cache and seq_starts must be on the same device"); + } + + int64_t block_table_stride = block_table.stride(0); + int64_t cache_block_stride = src_cache.stride(0); + int64_t cache_entry_stride = src_cache.stride(1); + int64_t dst_entry_stride = dst.stride(0); + + // Decide on the number of splits based on the batch size. + int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16; + dim3 grid(batch_size, num_splits); + dim3 block(1024); + + TORCH_CHECK(src_cache.dtype() == dst.dtype(), + "src_cache and dst must have the same dtype"); + + const int dtype_bits = src_cache.element_size() * 8; + const int32_t* seq_starts_ptr = + seq_starts.has_value() ? 
seq_starts.value().data_ptr() : nullptr; + + if (dtype_bits == 32) { + CALL_GATHER_CACHE(uint32_t); + } else if (dtype_bits == 16) { + CALL_GATHER_CACHE(uint16_t); + } else if (dtype_bits == 8) { + CALL_GATHER_CACHE(uint8_t); + } else { + TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits); + } +} diff --git a/csrc/core/math.hpp b/csrc/core/math.hpp index ba9f40a230c8e..b8171133f6aad 100644 --- a/csrc/core/math.hpp +++ b/csrc/core/math.hpp @@ -1,7 +1,9 @@ +#pragma once + #include #include -inline uint32_t next_pow_2(uint32_t const num) { +inline constexpr uint32_t next_pow_2(uint32_t const num) { if (num <= 1) return num; return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); -} \ No newline at end of file +} diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index b9764056e8a2d..0257d8ff16baf 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -24,8 +24,8 @@ struct KernelVecType { template <> struct KernelVecType { -#ifdef __powerpc64__ - // Power architecture-specific vector types +#if defined(__powerpc64__) || defined(__s390x__) + // Power and s390x architecture-specific vector types using q_load_vec_type = vec_op::FP32Vec8; using k_load_vec_type = vec_op::FP32Vec16; using v_load_vec_type = vec_op::FP32Vec16; diff --git a/csrc/cpu/cache.cpp b/csrc/cpu/cache.cpp index e3809acad7453..69f6d06e3c967 100644 --- a/csrc/cpu/cache.cpp +++ b/csrc/cpu/cache.cpp @@ -3,6 +3,12 @@ #include "cpu_types.hpp" +#if defined(__x86_64__) + #define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2 +#else + #define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES +#endif + namespace { template void copy_blocks_cpu_impl(std::vector const& key_caches, @@ -82,6 +88,48 @@ void reshape_and_cache_cpu_impl( } }; // namespace +template +void concat_and_cache_mla_cpu_impl( + const scalar_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank] + const scalar_t* __restrict__ k_pe, // [num_tokens, pe_dim] + scalar_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank + // + pe_dim)] + const int64_t* __restrict__ slot_mapping, // [num_tokens] + const int num_tokens, // + const int block_stride, // + const int entry_stride, // + const int kv_c_stride, // + const int k_pe_stride, // + const int kv_lora_rank, // + const int pe_dim, // + const int block_size // +) { +#pragma omp parallel for + for (int token_idx = 0; token_idx < num_tokens; ++token_idx) { + const int64_t slot_idx = slot_mapping[token_idx]; + // NOTE: slot_idx can be -1 if the token is padded + if (slot_idx < 0) { + continue; + } + const int64_t block_idx = slot_idx / block_size; + const int64_t block_offset = slot_idx % block_size; + + auto copy = [&](const scalar_t* __restrict__ src, + scalar_t* __restrict__ dst, int src_stride, int dst_stride, + int size, int offset) { + for (int i = 0; i < size; i++) { + const int64_t src_idx = token_idx * src_stride + i; + const int64_t dst_idx = + block_idx * block_stride + block_offset * entry_stride + i + offset; + dst[dst_idx] = src[src_idx]; + } + }; + + copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0); + copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank); + } +} + // Note: the key_caches and value_caches vectors are constant but // not the Tensors they contain. The vectors need to be const refs // in order to satisfy pytorch's C++ operator registration code. 
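As an aside, here is a minimal, dependency-free C++ sketch (not part of the patch) of the layout that both the CUDA and CPU `concat_and_cache_mla` paths write, assuming a fully contiguous, non-quantized cache (i.e. `block_stride == block_size * (kv_lora_rank + pe_dim)` and `entry_stride == kv_lora_rank + pe_dim`); it can serve as a reference when checking either kernel:

#include <cstdint>
#include <vector>

// Reference-only: mirrors the indexing of concat_and_cache_mla for a
// contiguous kAuto cache. All tensors are flattened row-major.
void concat_and_cache_mla_reference(
    const std::vector<float>& kv_c,            // [num_tokens, kv_lora_rank]
    const std::vector<float>& k_pe,            // [num_tokens, pe_dim]
    std::vector<float>& kv_cache,              // [num_blocks, block_size, kv_lora_rank + pe_dim]
    const std::vector<int64_t>& slot_mapping,  // [num_tokens]
    int kv_lora_rank, int pe_dim, int block_size) {
  const int entry_size = kv_lora_rank + pe_dim;
  for (size_t token_idx = 0; token_idx < slot_mapping.size(); ++token_idx) {
    const int64_t slot_idx = slot_mapping[token_idx];
    if (slot_idx < 0) continue;  // padded token, nothing to write
    const int64_t block_idx = slot_idx / block_size;
    const int64_t block_offset = slot_idx % block_size;
    float* dst =
        kv_cache.data() + (block_idx * block_size + block_offset) * entry_size;
    for (int i = 0; i < kv_lora_rank; ++i)
      dst[i] = kv_c[token_idx * kv_lora_rank + i];
    for (int i = 0; i < pe_dim; ++i)
      dst[kv_lora_rank + i] = k_pe[token_idx * pe_dim + i];
  }
}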
@@ -95,13 +143,12 @@ void copy_blocks(std::vector const& key_caches, } const int element_num_per_block = key_caches[0][0].numel(); - VLLM_DISPATCH_FLOATING_TYPES( - key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] { - CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl) - copy_blocks_cpu_impl(key_caches, value_caches, block_mapping, - element_num_per_block, num_layers); - CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl) - }); + DISPATCH_MACRO(key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] { + CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl) + copy_blocks_cpu_impl(key_caches, value_caches, block_mapping, + element_num_per_block, num_layers); + CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl) + }); } void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, @@ -118,15 +165,46 @@ void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, int key_stride = key.stride(0); int value_stride = value.stride(0); + DISPATCH_MACRO(key.scalar_type(), "reshape_and_cache_cpu_impl", [&] { + CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl) + reshape_and_cache_cpu_impl( + key.data_ptr(), value.data_ptr(), + key_cache.data_ptr(), value_cache.data_ptr(), + slot_mapping.data_ptr(), num_tokens, key_stride, value_stride, + num_heads, head_size, block_size, x); + CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl) + }); +} + +void concat_and_cache_mla( + torch::Tensor& kv_c, // [num_tokens, kv_lora_rank] + torch::Tensor& k_pe, // [num_tokens, pe_dim] + torch::Tensor& kv_cache, // [num_blocks, block_size, (kv_lora_rank + + // pe_dim)] + torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens] + const std::string& kv_cache_dtype, torch::Tensor& scale) { + int num_tokens = slot_mapping.size(0); + int kv_lora_rank = kv_c.size(1); + int pe_dim = k_pe.size(1); + int block_size = kv_cache.size(1); + + TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim); + TORCH_CHECK(kv_cache_dtype != "fp8"); + + int kv_c_stride = kv_c.stride(0); + int k_pe_stride = k_pe.stride(0); + int block_stride = kv_cache.stride(0); + int entry_stride = kv_cache.stride(1); + VLLM_DISPATCH_FLOATING_TYPES( - key.scalar_type(), "reshape_and_cache_cpu_impl", [&] { - CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl) - reshape_and_cache_cpu_impl( - key.data_ptr(), value.data_ptr(), - key_cache.data_ptr(), value_cache.data_ptr(), - slot_mapping.data_ptr(), num_tokens, key_stride, - value_stride, num_heads, head_size, block_size, x); - CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl) + kv_c.scalar_type(), "concat_and_cache_mla_cpu_impl", [&] { + CPU_KERNEL_GUARD_IN(concat_and_cache_mla_cpu_impl) + concat_and_cache_mla_cpu_impl( + kv_c.data_ptr(), k_pe.data_ptr(), + kv_cache.data_ptr(), slot_mapping.data_ptr(), + num_tokens, block_stride, entry_stride, kv_c_stride, k_pe_stride, + kv_lora_rank, pe_dim, block_size); + CPU_KERNEL_GUARD_OUT(concat_and_cache_mla_cpu_impl) }); } diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp index a71815106133a..17bbe04eef94a 100644 --- a/csrc/cpu/cpu_types.hpp +++ b/csrc/cpu/cpu_types.hpp @@ -7,6 +7,9 @@ #elif defined(__POWER9_VECTOR__) // ppc implementation #include "cpu_types_vsx.hpp" +#elif defined(__s390x__) + // s390 implementation + #include "cpu_types_vxe.hpp" #elif defined(__aarch64__) // arm implementation #include "cpu_types_arm.hpp" diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp index 990e99f2fc069..65ffe524af738 100644 --- a/csrc/cpu/cpu_types_arm.hpp +++ b/csrc/cpu/cpu_types_arm.hpp @@ -2,6 +2,10 @@ #include #include +#if defined(__APPLE__) + #include "omp.h" +#endif + 
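// Note (added for clarity, not in the original patch): Apple's system
// toolchain does not bundle an OpenMP runtime, so "omp.h" is included
// explicitly here; building the CPU backend with OpenMP on macOS assumes a
// separately installed libomp.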
namespace vec_op { #ifdef ARM_BF16_SUPPORT diff --git a/csrc/cpu/cpu_types_vxe.hpp b/csrc/cpu/cpu_types_vxe.hpp new file mode 100644 index 0000000000000..ab8cbbbf4ec4f --- /dev/null +++ b/csrc/cpu/cpu_types_vxe.hpp @@ -0,0 +1,480 @@ + +#ifndef CPU_TYPES_VXE_HPP +#define CPU_TYPES_VXE_HPP + +#include +#include +#include +namespace vec_op { + +#define vec_neg(a) (-(a)) +#define vec_add(a, b) ((a) + (b)) +#define vec_sub(a, b) ((a) - (b)) +#define vec_mul(a, b) ((a) * (b)) +#define vec_div(a, b) ((a) / (b)) +#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebaic +#define vec_sl(a, b) ((a) << (b)) // Vector Shift Left + +// FIXME: FP16 is not fully supported in Torch-CPU +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +#ifndef CPU_OP_GUARD + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) +#else + #define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; + #define CPU_KERNEL_GUARD_OUT(NAME) \ + std::cout << #NAME << " exit." << std::endl; +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline + +namespace { +template +constexpr void unroll_loop_item(std::integer_sequence, F&& f) { + (f(std::integral_constant{}), ...); +} +}; // namespace + +template >> +constexpr void unroll_loop(F&& f) { + unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); +} + +template +struct Vec { + constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } +}; + +typedef struct ss16x8x2_t { + __vector signed short val[2]; +} ss16x8x2_t; + +typedef struct ss16x8x4_t { + __vector signed short val[4]; +} ss16x8x4_t; + +typedef struct f32x4x2_t { + __vector float val[2]; +} f32x4x2_t; + +typedef struct f32x4x4_t { + __vector float val[4]; +} f32x4x4_t; + +struct FP32Vec8; +struct FP32Vec16; + +struct BF16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + __vector signed short reg; + + explicit BF16Vec8(const void* ptr) : reg(*(__vector signed short*)ptr) {} + explicit BF16Vec8(const FP32Vec8&); + + void save(void* ptr) const { + *reinterpret_cast<__vector signed short*>(ptr) = reg; + } +}; + +struct BF16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + ss16x8x2_t reg; + + explicit BF16Vec16(const void* ptr) { + // Load 256 bits in two parts + reg.val[0] = (__vector signed short)vec_xl(0, (signed short*)ptr); + reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)ptr); + } + + explicit BF16Vec16(const FP32Vec16&); + + void save(void* ptr) const { + // Save 256 bits in two parts + vec_xst(reg.val[0], 0, (signed short*)ptr); + vec_xst(reg.val[1], 16, (signed short*)ptr); + } +}; + +const static __vector signed short zero = vec_splats((signed short)0); + +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + ss16x8x4_t reg; + explicit BF16Vec32(const void* ptr) + : reg(*reinterpret_cast(ptr)) {} + + explicit BF16Vec32(ss16x8x4_t data) : reg(data) {} + + explicit BF16Vec32(const BF16Vec8& vec8_data) + : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {} + + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } +}; + +struct FP32Vec4 : public Vec { + constexpr static int VEC_ELEM_NUM = 4; + union AliasReg { + __vector float reg; + float values[VEC_ELEM_NUM]; + }; + + __vector float reg; + + 
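// Single 128-bit VXE register holding 4 floats; the wider FP32Vec8 and
// FP32Vec16 types below aggregate 2 and 4 such registers (f32x4x2_t and
// f32x4x4_t respectively).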
explicit FP32Vec4(float v) : reg(vec_splats(v)) {} + + explicit FP32Vec4() : reg(vec_splats(0.0f)) {} + + explicit FP32Vec4(const float* ptr) : reg(vec_xl(0, ptr)) {} + + explicit FP32Vec4(__vector float data) : reg(data) {} + + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {} +}; + +struct FP32Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + union AliasReg { + f32x4x2_t reg; + float values[VEC_ELEM_NUM]; + }; + + f32x4x2_t reg; + + explicit FP32Vec8(float v) { + reg.val[0] = vec_splats(v); + reg.val[1] = vec_splats(v); + } + + explicit FP32Vec8() { + reg.val[0] = vec_splats(0.0f); + reg.val[1] = vec_splats(0.0f); + } + + explicit FP32Vec8(const float* ptr) { + reg.val[0] = vec_xl(0, ptr); + reg.val[1] = vec_xl(16, ptr); + } + + explicit FP32Vec8(f32x4x2_t data) : reg(data) {} + + explicit FP32Vec8(const FP32Vec8& data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + } + + explicit FP32Vec8(const BF16Vec8& v) { + reg.val[0] = (__vector float)vec_mergeh(zero, v.reg); + reg.val[1] = (__vector float)vec_mergel(zero, v.reg); + } + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); + + return result; + } + + FP32Vec8 exp() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::exp(ar.values[0]); + ret.val[0][1] = std::exp(ar.values[1]); + ret.val[0][2] = std::exp(ar.values[2]); + ret.val[0][3] = std::exp(ar.values[3]); + ret.val[1][0] = std::exp(ar.values[4]); + ret.val[1][1] = std::exp(ar.values[5]); + ret.val[1][2] = std::exp(ar.values[6]); + ret.val[1][3] = std::exp(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 tanh() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::tanh(ar.values[0]); + ret.val[0][1] = std::tanh(ar.values[1]); + ret.val[0][2] = std::tanh(ar.values[2]); + ret.val[0][3] = std::tanh(ar.values[3]); + ret.val[1][0] = std::tanh(ar.values[4]); + ret.val[1][1] = std::tanh(ar.values[5]); + ret.val[1][2] = std::tanh(ar.values[6]); + ret.val[1][3] = std::tanh(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 er() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::erf(ar.values[0]); + ret.val[0][1] = std::erf(ar.values[1]); + ret.val[0][2] = std::erf(ar.values[2]); + ret.val[0][3] = std::erf(ar.values[3]); + ret.val[1][0] = std::erf(ar.values[4]); + ret.val[1][1] = std::erf(ar.values[5]); + ret.val[1][2] = std::erf(ar.values[6]); + ret.val[1][3] = std::erf(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 operator*(const FP32Vec8& b) const { + return FP32Vec8( + {vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator+(const FP32Vec8& b) const { + return FP32Vec8( + {vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator-(const FP32Vec8& b) const { + return FP32Vec8( + {vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator/(const FP32Vec8& b) const { + return FP32Vec8( + {vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); + } + + void save(float* ptr) const { + vec_xst(reg.val[0], 0, ptr); + vec_xst(reg.val[1], 16, ptr); + } +}; + +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + 
f32x4x4_t reg; + float values[VEC_ELEM_NUM]; + }; + + f32x4x4_t reg; + + explicit FP32Vec16(float v) { + reg.val[0] = vec_splats(v); + reg.val[1] = vec_splats(v); + reg.val[2] = vec_splats(v); + reg.val[3] = vec_splats(v); + } + + explicit FP32Vec16() { + reg.val[0] = vec_splats(0.0f); + reg.val[1] = vec_splats(0.0f); + reg.val[2] = vec_splats(0.0f); + reg.val[3] = vec_splats(0.0f); + } + + explicit FP32Vec16(const float* ptr) { + reg.val[0] = vec_xl(0, ptr); + reg.val[1] = vec_xl(16, ptr); + reg.val[2] = vec_xl(32, ptr); + reg.val[3] = vec_xl(48, ptr); + } + + explicit FP32Vec16(f32x4x4_t data) : reg(data) {} + + explicit FP32Vec16(const FP32Vec16& data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[2]; + reg.val[3] = data.reg.val[3]; + } + + explicit FP32Vec16(const FP32Vec4& data) { + reg.val[0] = data.reg; + reg.val[1] = data.reg; + reg.val[2] = data.reg; + reg.val[3] = data.reg; + } + + explicit FP32Vec16(const FP32Vec8& data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[0]; + reg.val[3] = data.reg.val[1]; + } + + explicit FP32Vec16(const BF16Vec16& v) { + reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]); + reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]); + reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]); + reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]); + } + + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} + + FP32Vec16 operator*(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_mul(reg.val[0], b.reg.val[0]), + vec_mul(reg.val[1], b.reg.val[1]), + vec_mul(reg.val[2], b.reg.val[2]), + vec_mul(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator+(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_add(reg.val[0], b.reg.val[0]), + vec_add(reg.val[1], b.reg.val[1]), + vec_add(reg.val[2], b.reg.val[2]), + vec_add(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator-(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_sub(reg.val[0], b.reg.val[0]), + vec_sub(reg.val[1], b.reg.val[1]), + vec_sub(reg.val[2], b.reg.val[2]), + vec_sub(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator/(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_div(reg.val[0], b.reg.val[0]), + vec_div(reg.val[1], b.reg.val[1]), + vec_div(reg.val[2], b.reg.val[2]), + vec_div(reg.val[3], b.reg.val[3])})); + } + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); + + return result; + } + + template + float reduce_sub_sum(int idx) { + static_assert(VEC_ELEM_NUM % group_size == 0); + + AliasReg ar; + ar.reg = reg; + float result = 0; + const int start = idx * group_size; + unroll_loop( + [&result, &start, ar](int i) { result += ar.values[start + i]; }); + + return result; + } + + void save(float* ptr) const { + vec_xst(reg.val[0], 0, ptr); + vec_xst(reg.val[1], 16, ptr); + vec_xst(reg.val[2], 32, ptr); + vec_xst(reg.val[3], 48, ptr); + } +}; + +template +struct VecType { + using vec_type = void; +}; + +template +using vec_t = typename VecType::vec_type; + +template <> +struct VecType { + using vec_type = FP32Vec8; +}; + +template <> +struct VecType { + using vec_type = BF16Vec8; +}; + +template +void storeFP32(float v, T* ptr) { + *ptr = v; +} + +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { + acc = acc + a * b; +} + +namespace c10 { +struct BFloat16 { + uint16_t value; // Assume BFloat16 is 
defined as a struct containing a 16-bit + // value. +}; +} // namespace c10 + +template <> +inline void storeFP32(float v, c10::BFloat16* ptr) { + c10::BFloat16 __attribute__((__may_alias__))* v_ptr = + reinterpret_cast(&v); + *ptr = *(v_ptr + 1); +} + +#ifndef __VEC_CLASS_FP_NAN + #define __VEC_CLASS_FP_NAN (1 << 6) +#endif + +const static __vector unsigned char omask = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; +const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff, + 0x00007fff}; +const static __vector unsigned int nan = {0x7fc00000, 0x7fc00000, 0x7fc00000, + 0x7fc00000}; +const static __vector unsigned int sh16 = {16, 16, 16, 16}; +const static __vector unsigned int one = {1, 1, 1, 1}; + +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) { + __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); + __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]); + int cc; + __vector __bool int sel0 = + vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc); + __vector __bool int sel1 = + vec_fp_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN, &cc); + inp0 = vec_sel(inp0, nan, sel0) >> sh16; + inp1 = vec_sel(inp1, nan, sel1) >> sh16; + reg = (__vector signed short)vec_perm(inp0, inp1, omask); +} + +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { + __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); + __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]); + __vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]); + __vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]); + int cc; + __vector __bool int sel0 = + vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc); + __vector __bool int sel1 = + vec_fp_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN, &cc); + __vector __bool int sel2 = + vec_fp_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN, &cc); + __vector __bool int sel3 = + vec_fp_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN, &cc); + inp0 = vec_sel(inp0, nan, sel0) >> sh16; + inp1 = vec_sel(inp1, nan, sel1) >> sh16; + inp2 = vec_sel(inp2, nan, sel2) >> sh16; + inp3 = vec_sel(inp3, nan, sel3) >> sh16; + reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask); + reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask); +} + +inline void prefetch(const void* addr) { void __dcbt(const void* addr); } + +}; // namespace vec_op + +#endif \ No newline at end of file diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index a4ef2be2a58ca..4568699b30773 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -16,9 +16,18 @@ namespace vec_op { AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) +#define VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) + #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) +#define VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2(TYPE, NAME, ...) 
\ + AT_DISPATCH_SWITCH(TYPE, NAME, \ + VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(__VA_ARGS__)) + #ifndef CPU_OP_GUARD #define CPU_KERNEL_GUARD_IN(NAME) #define CPU_KERNEL_GUARD_OUT(NAME) @@ -121,6 +130,8 @@ struct BF16Vec32 : public Vec { __m512i reg; + explicit BF16Vec32() : reg(_mm512_setzero_si512()) {} + explicit BF16Vec32(const void* ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} explicit BF16Vec32(__m512i data) : reg(data) {} diff --git a/csrc/cpu/mla_decode.cpp b/csrc/cpu/mla_decode.cpp new file mode 100644 index 0000000000000..37bd463bbc159 --- /dev/null +++ b/csrc/cpu/mla_decode.cpp @@ -0,0 +1,393 @@ +#include "cpu_types.hpp" +#include + +namespace { +template +struct KernelVecType { + using qk_load_vec_type = void; + using qk_vec_type = void; + using v_load_vec_type = void; +}; + +template <> +struct KernelVecType { + using qk_load_vec_type = vec_op::FP32Vec16; + using qk_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::FP32Vec16; +}; + +template <> +struct KernelVecType { +#if defined(__powerpc64__) || defined(__s390x__) + // Power and s390x architecture-specific vector types + using qk_load_vec_type = vec_op::FP32Vec16; + using qk_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::FP32Vec16; +#else + // Fallback for other architectures, including x86 + using qk_load_vec_type = vec_op::FP16Vec16; + using qk_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::FP16Vec16; +#endif +}; + +#ifdef __AVX512BF16__ +template <> +struct KernelVecType { + using qk_load_vec_type = vec_op::BF16Vec32; + using qk_vec_type = vec_op::BF16Vec32; + using v_load_vec_type = vec_op::BF16Vec16; +}; +#elif defined(__aarch64__) && !defined(ARM_BF16_SUPPORT) +// pass +#else +template <> +struct KernelVecType { + using qk_load_vec_type = vec_op::BF16Vec16; + using qk_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::BF16Vec16; +}; +#endif + +template +void mla_decode_block_head( + const qk_vec_type* __restrict__ q_vecs, // [HEAD_UNROLL, head_dim] + const qk_vec_type* __restrict__ k_vecs, // [block_size, head_dim] + const vec_op::FP32Vec16* __restrict v_vecs_f32, // [block_size, v_head_dim] + float* __restrict__ acc_out, // [HEAD_UNROLL, v_head_dim] + float* __restrict__ acc_lse, // [HEAD_UNROLL] + const float scale, const int num_tokens) { + using f32_vec_type = vec_op::FP32Vec16; + constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM; + constexpr int V_NUM_ELEM = f32_vec_type::VEC_ELEM_NUM; + + float logits[BLOCK_SIZE][HEAD_UNROLL] = {}; // initialize to zeros + float max_val[HEAD_UNROLL]; + std::fill(max_val, max_val + HEAD_UNROLL, -FLT_MAX); + + f32_vec_type acc_vec[BLOCK_SIZE][HEAD_UNROLL]; + for (int i = 0; i < HEAD_DIM; i += QK_NUM_ELEM) { + // load to registers + qk_vec_type q_vec[HEAD_UNROLL]; + +#pragma unroll + for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) + q_vec[unroll] = + qk_vec_type{q_vecs[(i + unroll * HEAD_DIM) / QK_NUM_ELEM]}; + + for (int block_offset = 0; block_offset < num_tokens; ++block_offset) { + qk_vec_type k_vec(k_vecs[(block_offset * HEAD_DIM + i) / QK_NUM_ELEM]); + +#pragma unroll + for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) + vec_op::fma(acc_vec[block_offset][unroll], q_vec[unroll], k_vec); + } + } + + for (int block_offset = 0; block_offset < num_tokens; ++block_offset) { +#pragma unroll + for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) { + const float acc = acc_vec[block_offset][unroll].reduce_sum() * scale; + logits[block_offset][unroll] = acc; + max_val[unroll] = std::max(max_val[unroll], acc); + } + 
} + + float sum_exp[HEAD_UNROLL] = {}; + for (int block_offset = 0; block_offset < num_tokens; ++block_offset) { +#pragma unroll + for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) { + const float val = + std::exp(logits[block_offset][unroll] - max_val[unroll]); + logits[block_offset][unroll] = val; + sum_exp[unroll] += val; + } + } + + f32_vec_type this_out[V_HEAD_DIM / V_NUM_ELEM][HEAD_UNROLL]; + + for (int block_offset = 0; block_offset < num_tokens; ++block_offset) { + // load to registers + f32_vec_type scale_[HEAD_UNROLL]; + +#pragma unroll + for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) + scale_[unroll] = + f32_vec_type{logits[block_offset][unroll] / sum_exp[unroll]}; + + for (int i = 0; i < V_HEAD_DIM; i += V_NUM_ELEM) { + f32_vec_type v_vec( + v_vecs_f32[(block_offset * HEAD_DIM + i) / V_NUM_ELEM]); + +#pragma unroll + for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) + vec_op::fma(this_out[i / V_NUM_ELEM][unroll], v_vec, scale_[unroll]); + } + } + + // merge attention state + // section 2.2 in https://arxiv.org/pdf/2501.01005 + f32_vec_type prev_scale[HEAD_UNROLL]; + f32_vec_type curr_scale[HEAD_UNROLL]; + +#pragma unroll + for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) { + const float prev_lse = acc_lse[unroll]; + const float curr_lse = std::log(sum_exp[unroll]) + + max_val[unroll]; // add back max_val to get true lse + // softmax trick + const float max_lse = std::max(prev_lse, curr_lse); + const float prev_sum_exp = std::exp(prev_lse - max_lse); + const float curr_sum_exp = std::exp(curr_lse - max_lse); + + const float new_sum_exp = prev_sum_exp + curr_sum_exp; + acc_lse[unroll] = std::log(new_sum_exp) + max_lse; + + prev_scale[unroll] = f32_vec_type{prev_sum_exp / new_sum_exp}; + curr_scale[unroll] = f32_vec_type{curr_sum_exp / new_sum_exp}; + } + + for (int i = 0; i < V_HEAD_DIM; i += V_NUM_ELEM) { +#pragma unroll + for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) { + f32_vec_type o_vec(acc_out + i + V_HEAD_DIM * unroll); + o_vec = o_vec * prev_scale[unroll] + + this_out[i / V_NUM_ELEM][unroll] * curr_scale[unroll]; + o_vec.save(acc_out + i + V_HEAD_DIM * unroll); + } + } + + q_vecs += HEAD_DIM / QK_NUM_ELEM * HEAD_UNROLL; + acc_out += V_HEAD_DIM * HEAD_UNROLL; +} + +template +void mla_decode_block( + const qk_vec_type* __restrict__ q_vecs, // [num_heads, head_dim] + const scalar_t* __restrict__ kv_cache, // [block_size, head_dim] + float* __restrict__ acc_out, // [num_heads, v_head_dim] + float* __restrict__ acc_lse, // [num_heads] + const int num_heads, const float scale, const int num_tokens) { + using qk_load_vec_type = typename KernelVecType::qk_load_vec_type; + static_assert( + std::is_same::qk_vec_type>::value); + using v_load_vec_type = typename KernelVecType::v_load_vec_type; + using f32_vec_type = vec_op::FP32Vec16; + static_assert(qk_load_vec_type::VEC_ELEM_NUM == qk_vec_type::VEC_ELEM_NUM); + static_assert(v_load_vec_type::VEC_ELEM_NUM == f32_vec_type::VEC_ELEM_NUM); + constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM; + constexpr int V_NUM_ELEM = v_load_vec_type::VEC_ELEM_NUM; + + const qk_vec_type* k_vecs; + const f32_vec_type* v_vecs_f32; + float* kv_cache_f32 = nullptr; + + if constexpr (!std::is_same::value) { + // convert KV cache block to FP32 to reuse it across query heads and + // attn @ V computation, since FP16/BF16->FP32 is expensive. + // TODO: move malloc outside of this fn to reuse across iterations. 
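// Note (added for clarity): std::aligned_alloc requires the size to be a
// multiple of the alignment; this holds for the supported configuration
// (e.g. BLOCK_SIZE = 16, HEAD_DIM = 576 gives 16 * 576 * 4 = 36864 bytes,
// a multiple of 64).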
+ const int nbytes = BLOCK_SIZE * HEAD_DIM * sizeof(float); + kv_cache_f32 = static_cast(std::aligned_alloc(64, nbytes)); + + for (int block_offset = 0; block_offset < num_tokens; ++block_offset) + for (int i = 0; i < HEAD_DIM; i += V_NUM_ELEM) { + v_load_vec_type kv_load_vec(kv_cache + block_offset * HEAD_DIM + i); + f32_vec_type kv_vec_f32(kv_load_vec); + kv_vec_f32.save(kv_cache_f32 + block_offset * HEAD_DIM + i); + } + + if constexpr (std::is_same::value) { + // for AVX512_BF16, Q @ K.T uses BF16 for K (no conversion) + // NOTE: in this case, we only need to convert the V section to FP32. + // But for simplicity, we will convert the whole KV block to FP32. + k_vecs = reinterpret_cast(kv_cache); + } else { + k_vecs = reinterpret_cast(kv_cache_f32); + } + + // attn @ V always use FP32 for V, since attn is FP32. + v_vecs_f32 = reinterpret_cast(kv_cache_f32); + + } else { + // KV cache is FP32. don't need to do anything. + k_vecs = reinterpret_cast(kv_cache); + v_vecs_f32 = reinterpret_cast(kv_cache); + } + + // compute 2 heads at the same time to improve ILP and + // take advantage of register cache for K and V. + constexpr int HEAD_UNROLL = 2; + for (int iter = 0; iter < num_heads / HEAD_UNROLL; ++iter) { + mla_decode_block_head( + q_vecs, k_vecs, v_vecs_f32, acc_out, acc_lse, scale, num_tokens); + + q_vecs += HEAD_UNROLL * HEAD_DIM / QK_NUM_ELEM; + acc_out += HEAD_UNROLL * V_HEAD_DIM; + acc_lse += HEAD_UNROLL; + } + + // take care of the remaining heads + for (int iter = 0; iter < num_heads % HEAD_UNROLL; ++iter) { + mla_decode_block_head( + q_vecs, k_vecs, v_vecs_f32, acc_out, acc_lse, scale, num_tokens); + + q_vecs += HEAD_DIM / QK_NUM_ELEM; + acc_out += V_HEAD_DIM; + acc_lse += 1; + } + + if (kv_cache_f32 != nullptr) { + std::free(kv_cache_f32); + } +} +} // namespace + +template +void mla_decode_kvcache_cpu_impl( + scalar_t* __restrict__ out, // [num_seqs, num_heads, v_head_dim] + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_dim] + const scalar_t* __restrict__ kv_cache, // [num_blocks, block_size, + // head_dim] + const int num_heads, const float scale, + const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] + const int* __restrict__ seq_lens, // [num_seqs] + const int max_num_blocks_per_seq, const int o_stride, const int q_stride, + const int kv_stride, const int num_seqs) { + using qk_load_vec_type = typename KernelVecType::qk_load_vec_type; + using qk_vec_type = typename KernelVecType::qk_vec_type; + constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM; + + // shared across threads + const int max_threads = omp_get_max_threads(); + const int acc_out_nbytes = + max_threads * num_heads * V_HEAD_DIM * sizeof(float); + float* acc_out = static_cast(std::aligned_alloc(64, acc_out_nbytes)); + std::vector acc_lse(max_threads * num_heads); + + // allocate memory to pre-convert query to FP32 later + float* q_f32; + constexpr bool PRE_CONVERT_QUERY = + !std::is_same::value && + std::is_same::value; + if constexpr (PRE_CONVERT_QUERY) { + const int q_f32_nbytes = num_heads * HEAD_DIM * sizeof(float); + q_f32 = static_cast(std::aligned_alloc(64, q_f32_nbytes)); + } + +#pragma omp parallel + { + const int num_threads = omp_get_num_threads(); + const int thread_id = omp_get_thread_num(); + float* __restrict__ acc_out_thread = + acc_out + thread_id * num_heads * V_HEAD_DIM; + float* __restrict__ acc_lse_thread = acc_lse.data() + thread_id * num_heads; + + for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) { + // reset accumulator + 
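// Note (added for clarity): acc_lse is reset to -FLT_MAX so that the first
// block merged for a sequence behaves as an empty attention state
// (std::exp(prev_lse - max_lse) evaluates to ~0 in mla_decode_block_head).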
std::fill(acc_out_thread, acc_out_thread + num_heads * V_HEAD_DIM, 0.0f); + std::fill(acc_lse_thread, acc_lse_thread + num_heads, -FLT_MAX); + + const int seq_len = seq_lens[seq_idx]; + const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE; + const int last_block_size = seq_len - (block_num - 1) * BLOCK_SIZE; + + const qk_vec_type* q_vecs; + if constexpr (PRE_CONVERT_QUERY) { +// pre-convert query to FP32 since FP16/BF16->FP32 is slow. +#pragma omp for + for (int i = 0; i < num_heads * HEAD_DIM; i += QK_NUM_ELEM) { + qk_load_vec_type q_load_vec(q + seq_idx * q_stride + i); + qk_vec_type q_vec(q_load_vec); + q_vec.save(q_f32 + i); + } + q_vecs = reinterpret_cast(q_f32); + } else { + q_vecs = reinterpret_cast(q + seq_idx * q_stride); + } + +#pragma omp for + for (int block_idx = 0; block_idx < block_num; ++block_idx) { + const int physical_block_idx = + block_tables[seq_idx * max_num_blocks_per_seq + block_idx]; + const int num_tokens = + block_idx < block_num - 1 ? BLOCK_SIZE : last_block_size; + + mla_decode_block( + q_vecs, kv_cache + physical_block_idx * kv_stride, acc_out_thread, + acc_lse_thread, num_heads, scale, num_tokens); + } + +// merge attention states across threads +// section 2.2 in https://arxiv.org/pdf/2501.01005 +// each thread is responsible for 1 head +#pragma omp for + for (int head_idx = 0; head_idx < num_heads; ++head_idx) { + float* acc_lse_head = acc_lse.data() + head_idx; + float* acc_out_head = acc_out + head_idx * V_HEAD_DIM; + + float max_val = -FLT_MAX; + for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) { + max_val = std::max(max_val, acc_lse_head[thread_id_ * num_heads]); + } + + float sum_exp = 0.0f; + for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) { + float val = std::exp(acc_lse_head[thread_id_ * num_heads] - max_val); + acc_lse_head[thread_id_ * num_heads] = val; + sum_exp += val; + } + + float inv_sum = 1.0f / sum_exp; + float out_head[V_HEAD_DIM] = {}; + for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) { + float scale_ = acc_lse_head[thread_id_ * num_heads] * inv_sum; + for (int i = 0; i < V_HEAD_DIM; ++i) { + out_head[i] += + acc_out_head[thread_id_ * num_heads * V_HEAD_DIM + i] * scale_; + } + } + + for (int i = 0; i < V_HEAD_DIM; ++i) { + vec_op::storeFP32(out_head[i], out + seq_idx * o_stride + + head_idx * V_HEAD_DIM + i); + } + } + } + } + if (PRE_CONVERT_QUERY) { + std::free(q_f32); + } + std::free(acc_out); +} + +void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query, + torch::Tensor& kv_cache, double scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens) { + const int num_seqs = query.size(0); + const int num_heads = query.size(1); + const int head_dim = query.size(2); + const int block_size = kv_cache.size(1); + const int v_head_dim = out.size(2); + + const int max_num_blocks_per_seq = block_tables.size(1); + const int o_stride = out.stride(0); + const int q_stride = query.stride(0); + const int kv_stride = kv_cache.stride(0); + + VLLM_DISPATCH_FLOATING_TYPES( + query.scalar_type(), "mla_decode_kvcache_cpu_impl", [&] { + CPU_KERNEL_GUARD_IN(mla_decode_kvcache_cpu_impl) + if (head_dim == 576 && v_head_dim == 512 && block_size == 16) + mla_decode_kvcache_cpu_impl( + out.data_ptr(), query.data_ptr(), + kv_cache.data_ptr(), num_heads, scale, + block_tables.data_ptr(), seq_lens.data_ptr(), + max_num_blocks_per_seq, o_stride, q_stride, kv_stride, num_seqs); + else + TORCH_CHECK(false, "Unsupported block size: ", block_size); + CPU_KERNEL_GUARD_OUT(mla_decode_kvcache_cpu_impl) 
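// Note (added for clarity): only the DeepSeek-style MLA layout is
// instantiated here (head_dim 576 = 512 latent dims + 64 rope dims,
// v_head_dim 512, block_size 16); any other shape falls into the
// TORCH_CHECK above.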
+ }); +} \ No newline at end of file diff --git a/csrc/cpu/pos_encoding.cpp b/csrc/cpu/pos_encoding.cpp index 96bce7dda0132..8a59e884d6c82 100644 --- a/csrc/cpu/pos_encoding.cpp +++ b/csrc/cpu/pos_encoding.cpp @@ -170,7 +170,7 @@ void rotary_embedding_gptj_impl( void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, torch::Tensor& key, int64_t head_size, torch::Tensor& cos_sin_cache, bool is_neox) { - int num_tokens = query.numel() / query.size(-1); + int num_tokens = positions.numel(); int rot_dim = cos_sin_cache.size(1); int num_heads = query.size(-1) / head_size; int num_kv_heads = key.size(-1) / head_size; diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp index 33b1637832888..6751e7e55fc51 100644 --- a/csrc/cpu/quant.cpp +++ b/csrc/cpu/quant.cpp @@ -25,7 +25,7 @@ struct KernelVecType { template <> struct KernelVecType { -#ifdef __powerpc64__ +#if defined(__powerpc64__) || defined(__s390x__) // Power architecture-specific vector type using load_vec_type = vec_op::FP32Vec16; #else diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index 5d1c5f4c83d3e..ef5a2fb5c4d22 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -18,6 +18,10 @@ void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a, const std::optional& azp, const std::optional& bias); +void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query, + torch::Tensor& kv_cache, double scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens); + TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops @@ -150,6 +154,14 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { " str kv_cache_dtype," " Tensor k_scale, Tensor v_scale) -> ()"); cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache); + + cache_ops.def( + "concat_and_cache_mla(Tensor kv_c, Tensor k_pe," + " Tensor! kv_cache," + " Tensor slot_mapping," + " str kv_cache_dtype," + " Tensor scale) -> ()"); + cache_ops.impl("concat_and_cache_mla", torch::kCPU, &concat_and_cache_mla); } TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) { @@ -157,4 +169,12 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) { utils.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env); } +TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cpu), cpu_ops) { + cpu_ops.def( + "mla_decode_kvcache(" + " Tensor! 
out, Tensor query, Tensor kv_cache," + " float scale, Tensor block_tables, Tensor seq_lens) -> ()"); + cpu_ops.impl("mla_decode_kvcache", torch::kCPU, &mla_decode_kvcache); +} + REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/csrc/cuda_utils.h b/csrc/cuda_utils.h index c35224218e91c..6e62ea208db88 100644 --- a/csrc/cuda_utils.h +++ b/csrc/cuda_utils.h @@ -1,15 +1,41 @@ #pragma once -#if defined(__CUDACC__) || defined(_NVHPC_CUDA) - #define HOST_DEVICE_INLINE __forceinline__ __host__ __device__ - #define DEVICE_INLINE __forceinline__ __device__ - #define HOST_INLINE __forceinline__ __host__ +#include + +#if defined(__HIPCC__) + #define HOST_DEVICE_INLINE __host__ __device__ + #define DEVICE_INLINE __device__ + #define HOST_INLINE __host__ +#elif defined(__CUDACC__) || defined(_NVHPC_CUDA) + #define HOST_DEVICE_INLINE __host__ __device__ __forceinline__ + #define DEVICE_INLINE __device__ __forceinline__ + #define HOST_INLINE __host__ __forceinline__ #else #define HOST_DEVICE_INLINE inline #define DEVICE_INLINE inline #define HOST_INLINE inline #endif +#define CUDA_CHECK(cmd) \ + do { \ + cudaError_t e = cmd; \ + if (e != cudaSuccess) { \ + printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ + cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + int64_t get_device_attribute(int64_t attribute, int64_t device_id); int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id); + +namespace cuda_utils { + +template +HOST_DEVICE_INLINE constexpr std::enable_if_t, T> +ceil_div(T a, T b) { + return (a + b - 1) / b; +} + +}; // namespace cuda_utils \ No newline at end of file diff --git a/csrc/cuda_utils_kernels.cu b/csrc/cuda_utils_kernels.cu index d6f9eb646fad5..0627a42675b52 100644 --- a/csrc/cuda_utils_kernels.cu +++ b/csrc/cuda_utils_kernels.cu @@ -1,16 +1,22 @@ +#include "cuda_utils.h" #ifdef USE_ROCM #include #include #endif + int64_t get_device_attribute(int64_t attribute, int64_t device_id) { - int device, value; - if (device_id < 0) { - cudaGetDevice(&device); - } else { - device = device_id; - } - cudaDeviceGetAttribute(&value, static_cast(attribute), - device); + // Return the cached value on subsequent calls + static int value = [=]() { + int device = static_cast(device_id); + if (device < 0) { + CUDA_CHECK(cudaGetDevice(&device)); + } + int value; + CUDA_CHECK(cudaDeviceGetAttribute( + &value, static_cast(attribute), device)); + return static_cast(value); + }(); + return value; } diff --git a/csrc/cumem_allocator.cpp b/csrc/cumem_allocator.cpp index e8555d853b7ac..fab6ca36d422e 100644 --- a/csrc/cumem_allocator.cpp +++ b/csrc/cumem_allocator.cpp @@ -12,15 +12,21 @@ extern "C" { #include #include -#define CUDA_CHECK(condition) \ - do { \ - CUresult error = condition; \ - if (error != 0) { \ - char* error_string; \ - cuGetErrorString(error, (const char**)&error_string); \ - std::cerr << "CUDA Error: " << error_string << " at " << __FILE__ << ":" \ - << __LINE__ << std::endl; \ - } \ +char error_msg[10240]; // 10KB buffer to store error messages +CUresult no_error = CUresult(0); +CUresult error_code = no_error; // store error code + +#define CUDA_CHECK(condition) \ + do { \ + CUresult error = condition; \ + if (error != 0) { \ + error_code = error; \ + char* error_string; \ + cuGetErrorString(error, (const char**)&error_string); \ + snprintf(error_msg, sizeof(error_msg), "CUDA Error: %s at %s:%d", \ + error_string, __FILE__, __LINE__); \ + std::cerr << error_msg << std::endl; \ + } \ } while (0) // Global references to Python callables @@ 
-54,14 +60,22 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem, // Allocate memory using cuMemCreate CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0)); + if (error_code != 0) { + return; + } CUDA_CHECK(cuMemMap(d_mem, size, 0, *p_memHandle, 0)); - + if (error_code != 0) { + return; + } CUmemAccessDesc accessDesc = {}; accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; accessDesc.location.id = device; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; CUDA_CHECK(cuMemSetAccess(d_mem, size, &accessDesc, 1)); + if (error_code != 0) { + return; + } // std::cout << "create_and_map: device=" << device << ", size=" << size << ", // d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl; } @@ -73,7 +87,13 @@ void unmap_and_release(unsigned long long device, ssize_t size, // ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl; ensure_context(device); CUDA_CHECK(cuMemUnmap(d_mem, size)); + if (error_code != 0) { + return; + } CUDA_CHECK(cuMemRelease(*p_memHandle)); + if (error_code != 0) { + return; + } } PyObject* create_tuple_from_c_integers(unsigned long long a, @@ -121,12 +141,16 @@ void* my_malloc(ssize_t size, int device, CUstream stream) { size_t granularity; CUDA_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); - + if (error_code != 0) { + return nullptr; + } size_t alignedSize = ((size + granularity - 1) / granularity) * granularity; CUdeviceptr d_mem; CUDA_CHECK(cuMemAddressReserve(&d_mem, alignedSize, 0, 0, 0)); - + if (error_code != 0) { + return nullptr; + } // allocate the CUmemGenericAllocationHandle CUmemGenericAllocationHandle* p_memHandle = (CUmemGenericAllocationHandle*)malloc( @@ -208,6 +232,9 @@ void my_free(void* ptr, ssize_t size, int device, CUstream stream) { // free address and the handle CUDA_CHECK(cuMemAddressFree(d_mem, size)); + if (error_code != 0) { + return; + } free(p_memHandle); } @@ -258,6 +285,12 @@ static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) { unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle); + if (error_code != 0) { + error_code = no_error; + PyErr_SetString(PyExc_RuntimeError, error_msg); + return nullptr; + } + Py_RETURN_NONE; } @@ -282,6 +315,12 @@ static PyObject* python_create_and_map(PyObject* self, PyObject* args) { create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle); + if (error_code != 0) { + error_code = no_error; + PyErr_SetString(PyExc_RuntimeError, error_msg); + return nullptr; + } + Py_RETURN_NONE; } diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh index 6be4d4f2b2eb8..b9df4ed160b03 100644 --- a/csrc/custom_all_reduce.cuh +++ b/csrc/custom_all_reduce.cuh @@ -38,9 +38,13 @@ struct Signal { alignas(128) FlagType peer_counter[2][kMaxBlocks][8]; }; -struct __align__(16) RankData { const void* __restrict__ ptrs[8]; }; +struct __align__(16) RankData { + const void* __restrict__ ptrs[8]; +}; -struct __align__(16) RankSignals { Signal* signals[8]; }; +struct __align__(16) RankSignals { + Signal* signals[8]; +}; // like std::array, but aligned template diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp index 07c9e46c27b06..febc4eccd9561 100644 --- a/csrc/cutlass_extensions/common.hpp +++ b/csrc/cutlass_extensions/common.hpp @@ -32,3 +32,20 @@ inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { } int32_t get_sm_version_num(); + +/** + * A wrapper for a kernel that is used to guard against compilation on + * 
architectures that will never use the kernel. The purpose of this is to + * reduce the size of the compiled binary. + * __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef + * into code that will be executed on the device where it is defined. + */ +template +struct enable_sm90_or_later : Kernel { + template + CUTLASS_DEVICE void operator()(Args&&... args) { +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900 + Kernel::operator()(std::forward(args)...); +#endif + } +}; \ No newline at end of file diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp index ef413e6dd75c5..64b7ddae3d2d7 100644 --- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp +++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp @@ -122,8 +122,8 @@ struct ScaledEpilogue auto a_args = SUPER::template args_from_tensor(a_scales); auto b_args = SUPER::template args_from_tensor(b_scales); - typename EVTCompute0::Arguments evt0_args{b_args}; - return ArgumentType{a_args, evt0_args}; + typename EVTCompute0::Arguments evt0_args{b_args, {}, {}}; + return ArgumentType{a_args, evt0_args, {}}; } }; @@ -167,8 +167,8 @@ struct ScaledEpilogueBias auto b_args = SUPER::template args_from_tensor(b_scales); auto bias_args = SUPER::template args_from_tensor(bias); - typename EVTCompute0::Arguments evt0_args{b_args}; - return ArgumentType{a_args, evt0_args, bias_args}; + typename EVTCompute0::Arguments evt0_args{b_args, {}, {}}; + return ArgumentType{a_args, evt0_args, bias_args, {}}; } }; @@ -230,9 +230,10 @@ struct ScaledEpilogueBiasAzp auto azp_adj_args = SUPER::template args_from_tensor(azp_adj); - typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args}; - typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args}; - return ArgumentType{a_args, evt_scale_b_args, bias_args}; + typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args, {}}; + typename EVTComputeScaleB::Arguments evt_scale_b_args{ + b_args, evt_azp_args, {}}; + return ArgumentType{a_args, evt_scale_b_args, bias_args, {}}; } }; @@ -309,11 +310,12 @@ struct ScaledEpilogueBiasAzpToken auto azp_adj_args = SUPER::template args_from_tensor(azp_adj); - typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args}; - typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args}; - typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args}; - return ArgumentType{a_args, evt_scale_b_args, bias_args}; + typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args, {}}; + typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args, {}}; + typename EVTComputeScaleB::Arguments evt_scale_b_args{ + b_args, evt_acc_args, {}}; + return ArgumentType{a_args, evt_scale_b_args, bias_args, {}}; } }; -}; // namespace vllm::c2x \ No newline at end of file +}; // namespace vllm::c2x diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp index c590c66a66652..0a812dc56a994 100644 --- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp +++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp @@ -16,36 +16,58 @@ namespace vllm::c3x { using namespace cute; +template +struct identity { + CUTLASS_HOST_DEVICE + T operator()(T lhs) const { return lhs; } +}; + +template +struct TrivialEpilogue { + private: + using Accum = cutlass::epilogue::fusion::Sm90AccFetch; + using Compute = 
cutlass::epilogue::fusion::Sm90Compute< + cutlass::epilogue::thread::Identity, ElementD, ElementAcc, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + template + static ArgumentType prepare_args(Args... args) { + return {}; + } +}; + /* * This class provides the common load descriptors for the * ScaledEpilogue[...] classes */ -template +template struct ScaledEpilogueBase { protected: using Accum = cutlass::epilogue::fusion::Sm90AccFetch; template using ColOrScalarLoad = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, - Stride, Int<0>, Int<0>>>; + 0 /*Stages*/, TileShape, T, Stride, Int<0>, Int<0>>>; template using RowOrScalarLoad = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, - Stride, Int<1>, Int<0>>>; + 0 /*Stages*/, TileShape, T, Stride, Int<1>, Int<0>>>; // Don't want to support nullptr by default template using ColLoad = cutlass::epilogue::fusion::Sm90ColBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T, - Stride, Int<0>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; + 0 /*Stages*/, TileShape, T, T, Stride, Int<0>, Int<0>>, + 128 / sizeof_bits_v, EnableNullPtr>; // Don't want to support nullptr by default template using RowLoad = cutlass::epilogue::fusion::Sm90RowBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T, - Stride, Int<1>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; + 0 /*Stages*/, TileShape, T, T, Stride, Int<1>, Int<0>>, + 128 / sizeof_bits_v, EnableNullPtr>; // This utility function constructs the arguments for the load descriptors // from a tensor. It can handle both row and column, as well as row/column or @@ -92,11 +114,11 @@ struct ScaledEpilogueBase { the A and B operands respectively. These scales may be either per-tensor or per row or column. */ -template +template struct ScaledEpilogue - : private ScaledEpilogueBase { + : private ScaledEpilogueBase { private: - using SUPER = ScaledEpilogueBase; + using SUPER = ScaledEpilogueBase; using Accum = typename SUPER::Accum; using ScaleA = typename SUPER::template ColOrScalarLoad; using ScaleB = typename SUPER::template RowOrScalarLoad; @@ -122,8 +144,8 @@ struct ScaledEpilogue auto a_args = SUPER::template args_from_tensor(a_scales); auto b_args = SUPER::template args_from_tensor(b_scales); - typename EVTCompute0::Arguments evt0_args{b_args}; - return ArgumentType{a_args, evt0_args}; + typename EVTCompute0::Arguments evt0_args{b_args, {}, {}}; + return ArgumentType{a_args, evt0_args, {}}; } }; @@ -136,11 +158,11 @@ struct ScaledEpilogue * The bias tensor must be per-output channel. * ScaleA and ScaleB can be per-tensor or per-token/per-channel. 
*/ -template +template struct ScaledEpilogueBias - : private ScaledEpilogueBase { + : private ScaledEpilogueBase { private: - using SUPER = ScaledEpilogueBase; + using SUPER = ScaledEpilogueBase; using Accum = typename SUPER::Accum; using ScaleA = typename SUPER::template ColOrScalarLoad; using ScaleB = typename SUPER::template RowOrScalarLoad; @@ -169,8 +191,51 @@ struct ScaledEpilogueBias auto b_args = SUPER::template args_from_tensor(b_scales); auto bias_args = SUPER::template args_from_tensor(bias); - typename EVTCompute0::Arguments evt0_args{b_args}; - return ArgumentType{a_args, evt0_args, bias_args}; + typename EVTCompute0::Arguments evt0_args{b_args, {}, {}}; + return ArgumentType{a_args, evt0_args, bias_args, {}}; + } +}; + +/* + * This epilogue performs the same operation as ScaledEpilogueBias, but the + * bias is a column vector instead of a row vector. Useful e.g. if we are + * computing a GEMM via C^T += B^T A^T. This happens in the 2:4 sparse kernels. + */ +template +struct ScaledEpilogueColumnBias + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template ColLoad; + + using Compute0 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::fusion::Sm90EVT; + + using Compute1 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + + using ArgumentType = typename EVTCompute::Arguments; + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + + typename EVTCompute0::Arguments evt0_args{b_args, {}, {}}; + return ArgumentType{a_args, evt0_args, bias_args, {}}; } }; @@ -182,11 +247,11 @@ struct ScaledEpilogueBias * * This epilogue also supports bias, which remains per-channel. */ -template +template struct ScaledEpilogueBiasAzp - : private ScaledEpilogueBase { + : private ScaledEpilogueBase { private: - using SUPER = ScaledEpilogueBase; + using SUPER = ScaledEpilogueBase; using Accum = typename SUPER::Accum; using ScaleA = typename SUPER::template ColOrScalarLoad; using ScaleB = typename SUPER::template RowOrScalarLoad; @@ -230,9 +295,10 @@ struct ScaledEpilogueBiasAzp auto azp_adj_args = SUPER::template args_from_tensor(azp_adj); - typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args}; - typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args}; - return ArgumentType{a_args, evt_scale_b_args, bias_args}; + typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args, {}}; + typename EVTComputeScaleB::Arguments evt_scale_b_args{ + b_args, evt_azp_args, {}}; + return ArgumentType{a_args, evt_scale_b_args, bias_args, {}}; } }; @@ -246,11 +312,11 @@ struct ScaledEpilogueBiasAzp * * This epilogue also supports bias, which remains per-channel. 
*/ -template +template struct ScaledEpilogueBiasAzpToken - : private ScaledEpilogueBase { + : private ScaledEpilogueBase { private: - using SUPER = ScaledEpilogueBase; + using SUPER = ScaledEpilogueBase; using Accum = typename SUPER::Accum; using ScaleA = typename SUPER::template ColOrScalarLoad; using ScaleB = typename SUPER::template RowOrScalarLoad; @@ -307,11 +373,12 @@ struct ScaledEpilogueBiasAzpToken auto azp_adj_args = SUPER::template args_from_tensor(azp_adj); - typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args}; - typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args}; - typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args}; - return ArgumentType{a_args, evt_scale_b_args, bias_args}; + typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args, {}}; + typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args, {}}; + typename EVTComputeScaleB::Arguments evt_scale_b_args{ + b_args, evt_acc_args, {}}; + return ArgumentType{a_args, evt_scale_b_args, bias_args, {}}; } }; -}; // namespace vllm::c3x \ No newline at end of file +}; // namespace vllm::c3x diff --git a/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp b/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp new file mode 100644 index 0000000000000..ec75c29e54f4d --- /dev/null +++ b/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp @@ -0,0 +1,123 @@ +// Modified from: cutlass/gemm/collective/builders/sm90_gmma_builder.inl +// clang-format off +#pragma once + +#include "cutlass/gemm/collective/builders/sm90_gmma_builder.inl" + +#include "cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp" + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// GMMA_TMA_WS_SS (BlockScaled Builders) +template < + class ElementA, + class GmemLayoutATag, + int AlignmentA, + class ElementB, + class GmemLayoutBTag, + int AlignmentB, + class ElementAccumulator, + class TileShape_MNK, + class ClusterShape_MNK, + class StageCountType, + int ScaleGranularityM +> +struct CollectiveBuilder< + arch::Sm90, + arch::OpClassTensorOp, + ElementA, + GmemLayoutATag, + AlignmentA, + ElementB, + GmemLayoutBTag, + AlignmentB, + ElementAccumulator, + TileShape_MNK, + ClusterShape_MNK, + StageCountType, + KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum, + cute::enable_if_t< + not detail::is_use_rmem_A()> +> { + using KernelScheduleType = KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum; + + static_assert(is_static::value); + static_assert(is_static::value); +#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED + static_assert(cutlass::detail::dependent_false, "Unsupported Toolkit for SM90 Collective Builder\n"); +#endif + static_assert(detail::is_aligned(), + "Should meet TMA alignment requirement\n"); + + static constexpr bool IsArrayOfPointersGemm = (cute::is_any_of_v); + static constexpr bool IsFP8Input = detail::is_input_fp8(); + static_assert((!IsFP8Input || !IsArrayOfPointersGemm), + "KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum is only compatible with FP8 Blocked Scaled version right now."); + + // For fp32 types, map to tf32 MMA value type + using ElementAMma = cute::conditional_t, tfloat32_t, ElementA>; + using ElementBMma = cute::conditional_t, tfloat32_t, ElementB>; + + static 
constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A(); + static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B(); + + static constexpr bool IsCooperative = cute::is_any_of_v>; + using AtomLayoutMNK = cute::conditional_t>, Layout>>; + + using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector< + ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{})); + + using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{}))); + using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{}))); + + using SmemLayoutAtomA = decltype(detail::ss_smem_selector< + GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutAtomB = decltype(detail::ss_smem_selector< + GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + + static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0; + static constexpr int KernelSmemCarveout = static_cast(TensorMapStorage); + + static constexpr int PipelineStages = detail::compute_stage_count_or_override(StageCountType{}); + using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8; + + using SmemCopyAtomA = void; + using SmemCopyAtomB = void; + + using CollectiveOp = CollectiveMma< + DispatchPolicy, + TileShape_MNK, + ElementA, + TagToStrideA_t, + ElementB, + TagToStrideB_t, + TiledMma, + GmemTiledCopyA, + SmemLayoutAtomA, + SmemCopyAtomA, + cute::identity, + GmemTiledCopyB, + SmemLayoutAtomB, + SmemCopyAtomB, + cute::identity + >; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp b/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp new file mode 100644 index 0000000000000..13b90e998625e --- /dev/null +++ b/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp @@ -0,0 +1,183 @@ +// clang-format off +// adapted from: https://github.com/soundOfDestiny/cutlass/blob/a4208aa6958864923505cade9c63eb2a6daf16e5/include/cutlass/gemm/collective/fp8_accumulation.hpp + +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "cute/algorithm/clear.hpp" +#include "cute/tensor.hpp" + +////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////FP8 Accumulation/////////////////////////// +////////////////////////////////////////////////////////////////////////////// +/// This class provides API to promote (add) or scale (multiply_add) the results +/// from the tensor core accumulators to the main accumulators when the number +/// of MMAs reaches the max number of MMA interval specified by user, after that +/// the tensor core accumulators are zeroed. +////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { + +template < + class EngineAccum, + class LayoutAccum> +struct GmmaFP8AccumulationWithScale { + using TensorAccum = cute::Tensor; + using ElementAccumulator = typename EngineAccum::value_type; + + static_assert(is_static::value, "Accumulator Layout should be static"); + static_assert(is_rmem::value , "Accumulator tensor must be rmem resident."); + +private: + TensorAccum& accum_; + TensorAccum accum_temp_; + + uint32_t accum_promotion_interval_; // defines the max num of executed MMAs after which accum should be promoted. + uint32_t mma_count_per_mainloop_iteration_; // num of MMAs per k_tile of mainloop + uint32_t mma_count_; // current executed MMAs + uint32_t reset_accum_flag_; // accum needs to be zeroed or not. + + // promote or `add` the partial accumulators to main accumulator (FADD). + CUTLASS_DEVICE + void promote_core() { + warpgroup_wait<0>(); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accum_); ++i) { + accum_(i) += accum_temp_(i); + } + } + + // `multiply` scale the partial accumulators and `add` to main accumulator (FFMA). 
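The promote/scale policy described in the comments above can be modelled outside of CUTLASS: partial sums from the low-precision accumulator are folded into a main accumulator every `accum_promotion_interval` MMAs (a plain add on the FADD path, a multiply-add with per-element scales on the FFMA path), and any residue is folded in once after the main loop. A minimal host-side sketch with illustrative names, not the device code itself:

```cpp
// Host-side model of the promotion policy above (illustrative only, no CUTLASS
// types): fold the partial accumulator into the main one every `interval` MMAs.
#include <algorithm>
#include <cstddef>
#include <vector>

struct PromotingAccumulator {
  std::vector<float> main_acc;  // promoted results
  std::vector<float> partial;   // stands in for the tensor-core accumulator
  int interval;                 // accum_promotion_interval
  int count = 0;                // MMAs since the last promotion

  PromotingAccumulator(std::size_t n, int interval)
      : main_acc(n, 0.f), partial(n, 0.f), interval(interval) {}

  // FFMA path: main += partial * scale, then reset the partial accumulator.
  void scale_if_needed(const std::vector<float>& scale, int mmas_this_iter) {
    count += mmas_this_iter;
    if (count == interval) {
      for (std::size_t i = 0; i < main_acc.size(); ++i)
        main_acc[i] += partial[i] * scale[i];
      std::fill(partial.begin(), partial.end(), 0.f);
      count = 0;
    }
  }

  // Residue: fold whatever accumulated after the last full interval.
  void scale_residue_if_needed(const std::vector<float>& scale) {
    if (count > 0)
      for (std::size_t i = 0; i < main_acc.size(); ++i)
        main_acc[i] += partial[i] * scale[i];
  }
};
```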
+ template < + class EngineScale, + class LayoutScale> + CUTLASS_DEVICE + void scale_core(const cute::Tensor &scale) { + using TensorScale = cute::Tensor; + + static_assert(is_static::value, "Scale Layout should be static"); + static_assert(is_rmem::value , "Scale tensor must be rmem resident."); + + static_assert(LayoutAccum{}.shape() == LayoutScale{}.shape(), "Accumulator and scale must have same shape."); + + warpgroup_wait<0>(); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accum_); ++i) { + accum_(i) += accum_temp_(i) * scale(i); + } + } + +public: + CUTLASS_DEVICE + GmmaFP8AccumulationWithScale( + TensorAccum &accum, + uint32_t accum_promotion_interval, + uint32_t mma_count_per_mainloop_iteration) + : accum_(accum), + accum_promotion_interval_(accum_promotion_interval), + mma_count_per_mainloop_iteration_(mma_count_per_mainloop_iteration), + mma_count_(0), + reset_accum_flag_(0) + { + accum_temp_ = cute::make_fragment_like(accum); + } + + // + // Methods (Common) + // + + CUTLASS_DEVICE + TensorAccum& operator()() { + return accum_temp_; + } + + /// prepare the MMA accumulators when initialization or zeroing is required. + CUTLASS_DEVICE + bool prepare_if_needed() { + return reset_accum_flag_; + } + + // + // Methods (for FADD version) + // + + /// promote (add) the results from the MMA accumulators to main accumulator if needed. + CUTLASS_DEVICE + void promote_if_needed() { + mma_count_ += mma_count_per_mainloop_iteration_; + reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0); + if (reset_accum_flag_) { + promote_core(); + mma_count_ = 0; + } + } + + /// promote (add) the residue results from the MMA accumulators to main accumulator if needed. + CUTLASS_DEVICE + void promote_residue_if_needed() { + if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) { + promote_core(); + } + } + + // + // Methods (for FFMA version) + // + + /// scale (multiply_add) the results from the MMA accumulators to main accumulator if needed. + template < + class EngineScale, + class LayoutScale> + CUTLASS_DEVICE + void scale_if_needed(const cute::Tensor &scale) { + mma_count_ += mma_count_per_mainloop_iteration_; + reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0); + if (reset_accum_flag_) { + scale_core(scale); + mma_count_ = 0; + } + } + + /// scale (multiply_add) the residue results from the MMA accumulators to main accumulator if needed. + template < + class EngineScale, + class LayoutScale> + CUTLASS_DEVICE + void scale_residue_if_needed(const cute::Tensor &scale) { + if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) { + scale_core(scale); + } + } +}; + +} // namespace cutlass::gemm::collective diff --git a/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp new file mode 100644 index 0000000000000..d922a3349e1e1 --- /dev/null +++ b/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp @@ -0,0 +1,730 @@ +// clang-format off +// Adapted (Heavily) from: https://github.com/soundOfDestiny/cutlass/blob/9d997ce0dea4c5fa1a617db6b7ff29aa9235822c/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp + +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/trace.h" +#include "cutlass/numeric_types.h" + +#include "cute/arch/cluster_sm90.hpp" +#include "cute/arch/copy_sm80.hpp" +#include "cute/arch/copy_sm90.hpp" +#include "cute/algorithm/functional.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cute/algorithm/gemm.hpp" +#include "cute/tensor_predicate.hpp" +#include "cute/numeric/arithmetic_tuple.hpp" + +#include "cutlass_extensions/gemm/dispatch_policy.hpp" +#include "cutlass_extensions/gemm/collective/fp8_accumulation.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { +using namespace cute; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// WarpSpecialized Mainloop +template < + int Stages, + class ClusterShape, + class KernelSchedule, + int ScaleGranularityM_, + class TileShape_, + class ElementA_, + class StrideA_, + class ElementB_, + class StrideB_, + class TiledMma_, + class GmemTiledCopyA_, + class SmemLayoutAtomA_, + class SmemCopyAtomA_, + class TransformA_, + class GmemTiledCopyB_, + class SmemLayoutAtomB_, + class SmemCopyAtomB_, + class TransformB_> +struct CollectiveMma< + MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8, + TileShape_, + ElementA_, + StrideA_, + ElementB_, + StrideB_, + TiledMma_, + GmemTiledCopyA_, + SmemLayoutAtomA_, + SmemCopyAtomA_, + TransformA_, + GmemTiledCopyB_, + SmemLayoutAtomB_, + SmemCopyAtomB_, + TransformB_> +{ + // + // Type Aliases + // + using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8; + using TileShape = TileShape_; + using ElementA = ElementA_; + using StrideA = StrideA_; + using ElementB = ElementB_; + using StrideB = StrideB_; + using TiledMma = 
TiledMma_; + using ElementAccumulator = typename TiledMma::ValTypeC; + using ElementBlockScale = ElementAccumulator; + using GmemTiledCopyA = GmemTiledCopyA_; + using GmemTiledCopyB = GmemTiledCopyB_; + using SmemLayoutAtomA = SmemLayoutAtomA_; + using SmemLayoutAtomB = SmemLayoutAtomB_; + using SmemCopyAtomA = SmemCopyAtomA_; + using SmemCopyAtomB = SmemCopyAtomB_; + using TransformA = TransformA_; + using TransformB = TransformB_; + using ArchTag = typename DispatchPolicy::ArchTag; + + using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{})); + using MainloopPipeline = cutlass::PipelineTmaAsync; + using PipelineState = cutlass::PipelineState; + using PipelineParams = typename MainloopPipeline::Params; + + // Two threads per CTA are producers (1 for operand tile and 32 for scales) + static constexpr int NumProducerThreadEvents = 33; + + static constexpr int ScaleGranularityM = ScaleGranularityM_ == 0 ? size<0>(TileShape{}) : ScaleGranularityM_; + static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM; + + static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + + static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); + + static_assert((size<0>(TileShape{}) % ScaleGranularityM) == 0, "FP8 scaling granularity must evenly divide tile shape along M."); + + // Tile along modes in a way that maximizes the TMA box size. + using SmemLayoutA = decltype(tile_to_shape( + SmemLayoutAtomA{}, + make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int{}), + cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{})); + using SmemLayoutB = decltype(tile_to_shape( + SmemLayoutAtomB{}, + make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int{}), + cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{})); + + // Block scaling gmem-to-smem copy atom + using SmemBlockScalingCopyAtomA = Copy_Atom, ElementBlockScale>; + using SmemBlockScalingCopyAtomB = Copy_Atom, ElementBlockScale>; + + // Block scaling smem layout + using SmemLayoutScaleA = Layout, Int>>; + using SmemLayoutScaleB = Layout>, Stride<_1>>; // `ScaleNsPerTile` is always 1. 
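For orientation, `ScaleGranularityM` above follows a simple rule: a value of 0 means one scale per CTA tile along M, while any other value must evenly divide the tile's M extent and yields `ScaleMsPerTile = TileM / ScaleGranularityM` scales per tile. A small standalone check of that arithmetic (tile sizes are illustrative, not tied to a particular instantiation):

```cpp
#include <cassert>
#include <cstdio>

// ScaleGranularityM == 0 means "one scale per CTA tile along M".
constexpr int scale_ms_per_tile(int tile_m, int scale_granularity_m) {
  int g = (scale_granularity_m == 0) ? tile_m : scale_granularity_m;
  return tile_m / g;
}

int main() {
  constexpr int TileM = 128;  // illustrative CTA tile extent along M
  static_assert(scale_ms_per_tile(TileM, 0) == 1);    // per-tile scaling
  static_assert(scale_ms_per_tile(TileM, 128) == 1);  // same thing, explicit
  static_assert(scale_ms_per_tile(TileM, 1) == 128);  // per-row scaling
  assert(TileM % 64 == 0);  // the granularity must evenly divide the tile
  std::printf("ScaleMsPerTile(128, 64) = %d\n", scale_ms_per_tile(TileM, 64));
}
```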
+ + static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more."); + static_assert(cute::is_base_of::value && + cute::is_base_of::value, + "MMA atom must source both A and B operand from smem_desc for this mainloop."); + static_assert(cute::is_same_v || cute::is_same_v, + "GmemTiledCopy - invalid SM90 TMA copy atom specified."); + static_assert(cute::is_same_v || cute::is_same_v, + "GmemTiledCopy - invalid SM90 TMA copy atom specified."); + static_assert(cute::is_same_v, + "ElementAccumulator and ElementBlockScale should be same datatype"); + + struct SharedStorage + { + struct TensorStorage : cute::aligned_struct<128> { + cute::array_aligned> smem_A; // mxk + cute::array_aligned> smem_B; // nxk + cute::array_aligned> smem_scale_A; // ScaleMsPerTile x k + cute::array_aligned> smem_scale_B; // 1xk + } tensors; + + using PipelineStorage = typename MainloopPipeline::SharedStorage; + PipelineStorage pipeline; + }; + using TensorStorage = typename SharedStorage::TensorStorage; + using PipelineStorage = typename SharedStorage::PipelineStorage; + + // Host side kernel arguments + struct Arguments { + ElementA const* ptr_A; + StrideA dA; + ElementB const* ptr_B; + StrideB dB; + ElementBlockScale const* ptr_scale_A; + ElementBlockScale const* ptr_scale_B; + }; + + // Device side kernel params + struct Params { + // Assumption: StrideA is congruent with Problem_MK + using TMA_A = decltype(make_tma_copy_A_sm90( + GmemTiledCopyA{}, + make_tensor(static_cast(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}), + SmemLayoutA{}(_,_,0), + TileShape{}, + ClusterShape{})); + // Assumption: StrideB is congruent with Problem_NK + using TMA_B = decltype(make_tma_copy_B_sm90( + GmemTiledCopyB{}, + make_tensor(static_cast(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}), + SmemLayoutB{}(_,_,0), + TileShape{}, + ClusterShape{})); + TMA_A tma_load_a; + TMA_B tma_load_b; + uint32_t tma_transaction_bytes = TmaTransactionBytes; + uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK; + uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK; + // Block scaling factors for A and B + ElementBlockScale const* ptr_scale_A; + ElementBlockScale const* ptr_scale_B; + }; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + (void) workspace; + + // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK) + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M,N,K,L] = problem_shape_MNKL; + + auto ptr_A = reinterpret_cast(args.ptr_A); + auto ptr_B = reinterpret_cast(args.ptr_B); + + Tensor tensor_a = make_tensor(ptr_A, make_layout(make_shape(M,K,L), args.dA)); + Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), args.dB)); + typename Params::TMA_A tma_load_a = make_tma_copy_A_sm90( + GmemTiledCopyA{}, + tensor_a, + SmemLayoutA{}(_,_,cute::Int<0>{}), + TileShape{}, + ClusterShape{}); + typename Params::TMA_B tma_load_b = make_tma_copy_B_sm90( + GmemTiledCopyB{}, + tensor_b, + SmemLayoutB{}(_,_,cute::Int<0>{}), + TileShape{}, + ClusterShape{}); + uint32_t transaction_bytes_mk = TmaTransactionBytesMK; + uint32_t transaction_bytes_nk = TmaTransactionBytesNK; + uint32_t transaction_bytes = transaction_bytes_mk + transaction_bytes_nk; + + return { + tma_load_a, + tma_load_b, + transaction_bytes, + transaction_bytes_mk, + transaction_bytes_nk, + args.ptr_scale_A, + args.ptr_scale_B + }; + } + + 
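The `can_implement` check that follows boils down to a TMA alignment rule: the contiguous extent of each operand must be a multiple of `128 / sizeof_bits(element)` elements, i.e. 16 elements for FP8. A rough standalone version of that arithmetic (a simplification of `cutlass::detail::check_alignment`, not a drop-in replacement):

```cpp
#include <cstdint>
#include <cstdio>

// TMA needs 128-bit aligned accesses: the contiguous extent of an operand must
// be a multiple of 128 / sizeof_bits(element) elements (16 for 8-bit FP8).
constexpr int min_tma_aligned_elements(int element_bits) {
  return 128 / element_bits;
}

bool tma_alignment_ok(std::int64_t contiguous_extent, int element_bits) {
  return contiguous_extent % min_tma_aligned_elements(element_bits) == 0;
}

int main() {
  std::printf("K=4096 ok: %d\n", tma_alignment_ok(4096, 8));  // prints 1
  std::printf("K=100  ok: %d\n", tma_alignment_ok(100, 8));   // prints 0
}
```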
template + static bool + can_implement( + ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + constexpr int tma_alignment_bits = 128; + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M,N,K,L] = problem_shape_MNKL; + + bool implementable = true; + constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits::value; + implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(M,K,L), StrideA{}); + constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits::value; + implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(N,K,L), StrideB{}); + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); + } + return implementable; + } + + static constexpr int K_PIPE_MAX = DispatchPolicy::Stages; + static constexpr int K_PIPE_MMAS = 1; + static constexpr uint32_t TmaTransactionBytesMK = + cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast(sizeof_bits::value)); + static constexpr uint32_t TmaTransactionBytesNK = + cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast(sizeof_bits::value)); + static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK; + + /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance + CUTLASS_DEVICE + static void prefetch_tma_descriptors(Params const& mainloop_params) + { + cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor()); + cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor()); + } + + /// Set up the data needed by this collective for load and mma. + /// Returns a tuple of tensors. 
The collective and the kernel layer have the contract + /// Returned tuple must contain at least two elements, with the first two elements being: + /// gA_mkl - The tma tensor, A after a local tile so it has shape (BLK_M,BLK_K,m,k,l) + /// gB_nkl - The tma tensor, B after a local tile so it has shape (BLK_N,BLK_K,n,k,l) + template + CUTLASS_DEVICE auto + load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const { + using X = Underscore; + // Separate out problem shape for convenience + auto [M,N,K,L] = problem_shape_MNKL; + + // TMA requires special handling of strides to deal with coord codomain mapping + // Represent the full tensors -- get these from TMA + Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,L)); // (m,k,l) + Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L)); // (n,k,l) + + // Make tiled views, defer the slice + Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{}); // (BLK_M,BLK_K,m,k,l) + Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{}); // (BLK_N,BLK_K,n,k,l) + + constexpr auto scales_m = Int{}; + auto tM = get<2>(gA_mkl.shape()); + auto tN = get<2>(gB_nkl.shape()); + auto tK = get<3>(gA_mkl.shape()); + + // Make the tiled views of scale tensors + auto scaleA_shape = make_shape(M / ScaleGranularityM, tK, L); // (scale_m,k,l) + auto scaleA_layout = make_ordered_layout(scaleA_shape, Step<_0, _1, _2>{}); + auto scaleB_shape = make_shape(tN, tK, L); // (n,k,l) + auto scaleB_layout = make_ordered_layout(scaleB_shape, Step<_1, _0, _2>{}); + + // Note that mScaleA_mkl and mScaleB_nkl are already blocked tiled in the `m` host and + // gScaleA_mkl and gScaleB_nkl in `g` global memory are same as mScaleA_mkl and mScaleB_nkl. 
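The scale tensor shapes set up above stay small relative to the operands: A carries `M / ScaleGranularityM` scales per K tile, and B carries one scale per (N tile, K tile). A quick sketch of that bookkeeping, assuming all extents divide evenly (sizes are illustrative):

```cpp
#include <cstdio>

// Element counts of the per-block scale tensors consumed here, assuming every
// extent divides evenly (illustrative bookkeeping only).
struct ScaleSizes {
  long scale_a;  // (M / ScaleGranularityM) x (K / TileK) x L
  long scale_b;  // (N / TileN) x (K / TileK) x L
};

ScaleSizes scale_tensor_sizes(long M, long N, long K, long L, long TileN,
                              long TileK, long ScaleGranularityM) {
  long k_tiles = K / TileK;
  return {(M / ScaleGranularityM) * k_tiles * L, (N / TileN) * k_tiles * L};
}

int main() {
  // e.g. per-row A scales (granularity 1) and one B scale per 128x128 tile.
  ScaleSizes s = scale_tensor_sizes(/*M=*/4096, /*N=*/4096, /*K=*/7168,
                                    /*L=*/1, /*TileN=*/128, /*TileK=*/128,
                                    /*ScaleGranularityM=*/1);
  std::printf("scale_A: %ld elements, scale_B: %ld elements\n", s.scale_a,
              s.scale_b);
}
```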
+ Tensor mScaleA_mkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_A), scaleA_layout); // (scale_m,k,l) + Tensor mScaleB_nkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_B), scaleB_layout); // (n,k,l) + + return cute::make_tuple(gA_mkl, gB_nkl, mScaleA_mkl, mScaleB_nkl); + } + + /// Perform a collective-scoped matrix multiply-accumulate + /// Producer Perspective + template < + class TensorA, class TensorB, + class TensorScaleA, class TensorScaleB, + class KTileIterator, class BlockCoord + > + CUTLASS_DEVICE void + load( + Params const& mainloop_params, + MainloopPipeline pipeline, + PipelineState smem_pipe_write, + cute::tuple const& load_inputs, + BlockCoord const& blk_coord, + KTileIterator k_tile_iter, int k_tile_count, + int thread_idx, + uint32_t block_rank_in_cluster, + TensorStorage& shared_tensors) { + int lane_predicate = cute::elect_one_sync(); + + // Blockscaling: Tma loads for load_input and CpAsync for load_scale + Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) + Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) + Tensor sScaleA = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), SmemLayoutScaleA{}); // (ScaleMsPerTile,k) + Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k) + + // + // Prepare the TMA loads for A and B + // + + constexpr uint32_t cluster_shape_x = get<0>(ClusterShape()); + uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x}; + + Tensor gA_mkl = get<0>(load_inputs); + Tensor gB_nkl = get<1>(load_inputs); + + auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y); + auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x); + + // Partition the inputs based on the current block coordinates. 
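The cluster bookkeeping above, together with the multicast masks built a little further down, follows a plain row/column decomposition of the block rank inside the cluster. A sketch of that arithmetic, assuming blocks are numbered `x + y * cluster_x` consistently with the rank decomposition (an assumption of this sketch, not taken from the kernel):

```cpp
#include <cstdint>
#include <cstdio>

// Decompose a block's rank inside a (cluster_x x cluster_y) cluster and build
// the multicast mask for an A tile shared across the cluster; assumes block
// id = x + y * cluster_x, matching the rank decomposition used above.
struct ClusterId { unsigned x, y; };

ClusterId cluster_local_block_id(unsigned rank, unsigned cluster_x) {
  return {rank % cluster_x, rank / cluster_x};
}

std::uint16_t mcast_mask_for_a(unsigned my_x, unsigned cluster_x,
                               unsigned cluster_y) {
  std::uint16_t mask = 0;
  for (unsigned n = 0; n < cluster_y; ++n)
    mask |= std::uint16_t(1) << (my_x + n * cluster_x);
  return mask;
}

int main() {
  ClusterId id = cluster_local_block_id(/*rank=*/5, /*cluster_x=*/2);  // x=1, y=2
  std::printf("x=%u y=%u mask=0x%x\n", id.x, id.y,
              unsigned(mcast_mask_for_a(id.x, 2, 4)));
}
```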
+ auto [m_coord, n_coord, k_coord, l_coord] = blk_coord; + Tensor gA = gA_mkl(_,_,m_coord,_,l_coord); // (BLK_M,BLK_K,k) + Tensor gB = gB_nkl(_,_,n_coord,_,l_coord); // (BLK_N,BLK_K,k) + + + // Block scaling: load_scale has scaling tensors in global memory which are not tiled + Tensor mScaleA_mkl = get<2>(load_inputs); + Tensor mScaleB_nkl = get<3>(load_inputs); + auto scales_m = get<0>(mScaleA_mkl.shape()); + + Tensor cScaleA_mkl = make_identity_tensor(mScaleA_mkl.shape()); + + Tensor gScaleA = local_tile( + mScaleA_mkl, make_tile(Int{}), + make_coord(m_coord,_,l_coord)); // (ScaleMsPerTile,k,1) + Tensor cScaleA = local_tile( + cScaleA_mkl, make_tile(Int{}), + make_coord(m_coord,_,l_coord)); + Tensor gScaleB = mScaleB_nkl(n_coord,_,l_coord); // (1,k,1) + + // TODO: test `scale_copy_a` with `ScaleMsPerTile` < 128 + TiledCopy scale_copy_a = make_tiled_copy(SmemBlockScalingCopyAtomA{}, + Layout>{}, Layout>{}); // (1,1,1) + TiledCopy scale_copy_b = make_tiled_copy(SmemBlockScalingCopyAtomB{}, + Layout>{}, Layout>{}); // (1,1,1) + ThrCopy thr_scale_copy_a = scale_copy_a.get_slice(threadIdx.x); + ThrCopy thr_scale_copy_b = scale_copy_b.get_slice(threadIdx.x); + + Tensor tAgA_ScaleA = thr_scale_copy_a.partition_S(gScaleA); + Tensor tAcA_ScaleA = thr_scale_copy_a.partition_S(cScaleA); + Tensor tAsA_ScaleA = thr_scale_copy_a.partition_D(sScaleA); + + Tensor tBgB_ScaleB = thr_scale_copy_b.partition_S(gScaleB); + Tensor tBsB_ScaleB = thr_scale_copy_b.partition_D(sScaleB); + + // Applies the mapping from block_tma_a + Tensor tAgA = block_tma_a.partition_S(gA); // (TMA,TMA_M,TMA_K,k) + Tensor tAsA = block_tma_a.partition_D(sA); // (TMA,TMA_M,TMA_K,PIPE) + + Tensor tBgB = block_tma_b.partition_S(gB); // (TMA,TMA_N,TMA_K,k) + Tensor tBsB = block_tma_b.partition_D(sB); // (TMA,TMA_N,TMA_K,PIPE) + + uint16_t mcast_mask_a = 0; + uint16_t mcast_mask_b = 0; + + // Issue TmaLoads for GEMM operands A/B and CpAsync for scale tensors + // Maps the tile -> block, value + if constexpr (cute::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int n = 0; n < size<1>(block_layout); ++n) { + mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{})); + } + } + + if constexpr (cute::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int m = 0; m < size<0>(block_layout); ++m) { + mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{})); + } + } + + // Allocate predicate tensors for a_scales (since we can't guarantee that + // all scales are valid, since we could have a partial tiles along M) + Tensor tApA_ScaleA = make_tensor(shape(tAsA_ScaleA(_,_,0))); + #pragma unroll + for (int i = 0; i < size(tApA_ScaleA); ++i) { + tApA_ScaleA(i) = get<0>(tAcA_ScaleA(i)) < scales_m; + } + + // Mainloop + CUTLASS_PRAGMA_NO_UNROLL + for ( ; k_tile_count > 0; --k_tile_count) { + // LOCK smem_pipe_write for _writing_ + pipeline.producer_acquire(smem_pipe_write); + + // + // Copy gmem to smem for *k_tile_iter + // + int write_stage = smem_pipe_write.index(); + using BarrierType = typename MainloopPipeline::ProducerBarrierType; + BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write); + + // Copy operands A and B from global memory to shared memory + if (lane_predicate) copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage)); + if (lane_predicate) copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage)); + + 
// Copy scale tensors from global memory to shared memory + copy_if(scale_copy_a, tApA_ScaleA, tAgA_ScaleA(_,_,*k_tile_iter), tAsA_ScaleA(_,_,write_stage)); + copy(scale_copy_b, tBgB_ScaleB(_,*k_tile_iter), tBsB_ScaleB(_,write_stage)); + pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive_noinc); + + ++k_tile_iter; + + // Advance smem_pipe_write + ++smem_pipe_write; + } + } + + /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster + CUTLASS_DEVICE void + load_tail( + MainloopPipeline pipeline, + PipelineState smem_pipe_write) { + int lane_predicate = cute::elect_one_sync(); + + // Issue the epilogue waits + if (lane_predicate) { + /* This helps avoid early exit of blocks in Cluster + * Waits for all stages to either be released (all + * Consumer UNLOCKs), or if the stage was never used + * then would just be acquired since the phase was + * still inverted from make_producer_start_state + */ + pipeline.producer_tail(smem_pipe_write); + } + } + + /// Perform a collective-scoped matrix multiply-accumulate + /// Consumer Perspective + template < + class FrgTensorC + > + CUTLASS_DEVICE void + mma(MainloopPipeline pipeline, + PipelineState smem_pipe_read, + FrgTensorC& accum, + int k_tile_count, + int thread_idx, + TensorStorage& shared_tensors, + Params const& mainloop_params) { + + + static_assert(is_rmem::value, "C tensor must be rmem resident."); + static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3."); + static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3."); + static_assert(cute::is_void_v, + "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions."); + static_assert(cute::is_void_v, + "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions."); + + Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) + Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) + + // Block scaling + Tensor sScaleAViewAsC = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), + Layout< + Shape, Int>, cute::tuple_element_t<1, TileShape>, Int>, + Stride, _0, Int> + >{}); // ((ScaleGranularityM,ScaleMsPerTile),n,k) + Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k) + + // + // Define C accumulators and A/B partitioning + // + + // Layout of warp group to thread mapping + + static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and + stride<0>(typename TiledMma::BLayout{}) == 0 and + size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and + size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, + "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup"); + + constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup; + Layout warp_group_thread_layout = make_layout(Int{}, + Int{}); + + int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0); + + TiledMma tiled_mma; + auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx)); + + Tensor tCsScaleAViewAsC = tiled_mma.get_slice(thread_idx).partition_C(sScaleAViewAsC); // (MMA,MMA_M,MMA_N,PIPE), `thread_mma` above is correct when partitioning A and B, but it is not correct when partitioning C. 
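`sScaleAViewAsC` above relies on a stride-0 broadcast: the vector of per-row A scales is viewed as a full (M, N) tile by giving the N mode a stride of 0, so every column aliases the same scale and the view can be partitioned exactly like the accumulator. A toy version of that indexing (plain C++, not the cute layout machinery):

```cpp
#include <cstdio>
#include <vector>

// View a vector of per-row scales as an (m x n) matrix without copying:
// a stride of 0 along n makes every column alias the same element.
struct StrideView {
  const float* data;
  long stride_m, stride_n;  // stride_n == 0 gives the broadcast
  float operator()(long m, long n) const {
    return data[m * stride_m + n * stride_n];
  }
};

int main() {
  std::vector<float> row_scales = {0.5f, 2.0f, 1.0f, 4.0f};  // ScaleMsPerTile = 4
  StrideView view{row_scales.data(), /*stride_m=*/1, /*stride_n=*/0};
  for (long m = 0; m < 4; ++m)
    std::printf("row %ld: %.1f %.1f %.1f\n", m, view(m, 0), view(m, 1),
                view(m, 2));
}
```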
+ + Tensor tCsA = thread_mma.partition_A(sA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCsB = thread_mma.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) + + // Allocate "fragments/descriptors" + Tensor tCrA = thread_mma.make_fragment_A(tCsA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCrB = thread_mma.make_fragment_B(tCsB); // (MMA,MMA_N,MMA_K,PIPE) + + CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum)); // M + CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum)); // N + CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB)); // K + CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<2>(sA)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<2>(sB)); // PIPE + + // + // PIPELINED MAIN LOOP + // + static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS < K_PIPE_MAX), + "ERROR : Incorrect number of MMAs in flight"); + + // We release buffers to producer warps(dma load) with some mmas in flight + PipelineState smem_pipe_release = smem_pipe_read; + + // Per block scale values for operand A and B + + using RegLayoutScaleAViewAsC = decltype(make_layout_like(tCsScaleAViewAsC(_, _, _, 0).layout())); // `make_layout_like` makes a compact layout. + using RegLayoutScaleAEssential = decltype(filter_zeros(RegLayoutScaleAViewAsC{}.stride(), RegLayoutScaleAViewAsC{}.shape())); // an interface to traverse the underlying storage for the compact layout mentioned above + + Tensor tCrScaleAViewAsC = make_tensor(RegLayoutScaleAViewAsC{}); // (MMA,MMA_M,MMA_N) + ElementBlockScale scale_b; + + // Prologue GMMAs + int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); + + tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + + GmmaFP8AccumulationWithScale accumulation(accum, size<2>(TileShape{}) / size<2>(typename TiledMma::AtomShape_MNK{}), size<2>(tCrA)); + warpgroup_fence_operand(accumulation()); + CUTLASS_PRAGMA_UNROLL + for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue) + { + // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) + auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); + pipeline.consumer_wait(smem_pipe_read, barrier_token); + + if (accumulation.prepare_if_needed()) { + tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + } + + int read_stage = smem_pipe_read.index(); + + // Load per block scale values from shared memory to registers. + scale_b = sScaleB[read_stage]; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { + tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{})); + } + if constexpr (ScaleMsPerTile == 1) { + static_assert(size(RegLayoutScaleAEssential{}) == 1); + tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`. 
+ } else { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { + tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b; + } + } + + warpgroup_arrive(); + // Unroll the K mode manually to set scale D to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + // (V,M,K) x (V,N,K) => (V,M,N) + cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation()); + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + } + warpgroup_commit_batch(); + + // Block scale the accumulators with reg tensor `tCrScaleAViewAsC` + accumulation.scale_if_needed(tCrScaleAViewAsC); + + ++smem_pipe_read; + } + + warpgroup_fence_operand(accumulation()); + // Mainloop GMMAs + k_tile_count -= prologue_mma_count; + + CUTLASS_PRAGMA_NO_UNROLL + for ( ; k_tile_count > 0; --k_tile_count) + { + // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) + auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); + pipeline.consumer_wait(smem_pipe_read, barrier_token); + + // + // Compute on k_tile + // + + int read_stage = smem_pipe_read.index(); + + // Load per block scale values from shared memory to registers (at most twice per block along M and exactly once per block along N) + scale_b = sScaleB[read_stage]; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { + tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{})); + } + if constexpr (ScaleMsPerTile == 1) { + static_assert(size(RegLayoutScaleAEssential{}) == 1); + tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`. 
+ } else { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { + tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b; + } + } + + if (accumulation.prepare_if_needed()) { + tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + } + + warpgroup_fence_operand(accumulation()); + warpgroup_arrive(); + // Unroll the K mode manually to set scale D to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + // (V,M,K) x (V,N,K) => (V,M,N) + cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation()); + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + } + warpgroup_commit_batch(); + + /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed + warpgroup_wait(); + warpgroup_fence_operand(accumulation()); + + // Block scale the accumulators with reg tensor `tCrScaleAViewAsC` + accumulation.scale_if_needed(tCrScaleAViewAsC); + + pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it + + // Advance smem_pipe_read and smem_pipe_release + ++smem_pipe_read; + ++smem_pipe_release; + } + + accumulation.scale_residue_if_needed(tCrScaleAViewAsC); + + warpgroup_fence_operand(accumulation()); + } + + /// Perform a Consumer Epilogue to release all buffers + CUTLASS_DEVICE void + mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) { + // Prologue GMMAs + int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); + k_tile_count -= prologue_mma_count; + + smem_pipe_release.advance(k_tile_count); + + // Wait on all GMMAs to complete + warpgroup_wait<0>(); + + for (int count = 0; count < prologue_mma_count; ++count) { + pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it + ++smem_pipe_release; + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm/dispatch_policy.hpp b/csrc/cutlass_extensions/gemm/dispatch_policy.hpp new file mode 100644 index 0000000000000..df809e27a3efe --- /dev/null +++ b/csrc/cutlass_extensions/gemm/dispatch_policy.hpp @@ -0,0 +1,39 @@ +#pragma once + +#include "cutlass/gemm/dispatch_policy.hpp" + +namespace cutlass::gemm { + +////////////////////////////////////////////////////////////////////////////// + +// FP8 related policies (including Blocked Scaled Accumulation) +// `ScaleGranularityM` specifies scaling granularity along M, while zero-value +// `ScaleGranularityM` indicates that scaling granularity is +// `size<0>(TileShape_MNK{})` along M. +template +struct KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum + : KernelTmaWarpSpecializedCooperative {}; + +// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp +// specialized dynamic schedule For FP8 kernels with Block Scaling +template , + class KernelSchedule = KernelTmaWarpSpecialized, + int ScaleGranularityM = + 0 // `ScaleGranularityM` specifies scaling granularity along M, + // while zero-value `ScaleGranularityM` indicates that scaling + // granularity is `size<0>(TileShape_MNK{})` along M. 
+ > +struct MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8 + : MainloopSm90TmaGmmaWarpSpecialized { + static_assert( + cute::is_same_v< + KernelSchedule, + KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum< + ScaleGranularityM>>, + "KernelSchedule must be one of the warp specialized policies"); +}; + +////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm \ No newline at end of file diff --git a/csrc/cutlass_extensions/vllm_collective_builder.cuh b/csrc/cutlass_extensions/vllm_collective_builder.cuh index 085ee1290031f..e7fbba4cd4b0d 100644 --- a/csrc/cutlass_extensions/vllm_collective_builder.cuh +++ b/csrc/cutlass_extensions/vllm_collective_builder.cuh @@ -1,6 +1,6 @@ #pragma once -#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass_extensions/gemm/collective/collective_builder.hpp" namespace cutlass::gemm::collective { using namespace cute; diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py index b401736c9824b..d64f0d0a5c2a0 100644 --- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py +++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py @@ -1,5 +1,7 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum -from typing import Dict, Union +from typing import Union from cutlass_library import * @@ -19,7 +21,7 @@ class MixedInputKernelScheduleType(enum.Enum): TmaWarpSpecializedCooperative = enum_auto() -VLLMDataTypeNames: Dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = { **DataTypeNames, # type: ignore **{ VLLMDataType.u4b8: "u4b8", @@ -27,7 +29,7 @@ VLLMDataTypeNames: Dict[Union[VLLMDataType, DataType], str] = { } } -VLLMDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { **DataTypeTag, # type: ignore **{ VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t", @@ -35,7 +37,7 @@ VLLMDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = { } } -VLLMDataTypeSize: Dict[Union[VLLMDataType, DataType], int] = { +VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = { **DataTypeSize, # type: ignore **{ VLLMDataType.u4b8: 4, @@ -43,7 +45,7 @@ VLLMDataTypeSize: Dict[Union[VLLMDataType, DataType], int] = { } } -VLLMDataTypeVLLMScalarTypeTag: Dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = { VLLMDataType.u4b8: "vllm::kU4B8", VLLMDataType.u8b128: "vllm::kU8B128", DataType.u4: "vllm::kU4", @@ -54,7 +56,7 @@ VLLMDataTypeVLLMScalarTypeTag: Dict[Union[VLLMDataType, DataType], str] = { DataType.bf16: "vllm::kBfloat16", } -VLLMDataTypeTorchDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { DataType.u8: "at::ScalarType::Byte", DataType.s8: "at::ScalarType::Char", DataType.e4m3: "at::ScalarType::Float8_e4m3fn", @@ -64,7 +66,7 @@ VLLMDataTypeTorchDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = { DataType.f32: "at::ScalarType::Float", } -VLLMKernelScheduleTag: Dict[Union[ +VLLMKernelScheduleTag: dict[Union[ MixedInputKernelScheduleType, KernelScheduleType], str] = { **KernelScheduleTag, # type: ignore **{ diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h index 03414b7e1ae93..dc6e0769b8780 100644 --- a/csrc/dispatch_utils.h +++ b/csrc/dispatch_utils.h @@ -6,6 +6,11 @@ #include +// Need a special dispatch case macro 
since we will nest the FP8 dispatch. +// Instead of the usual 'scalar_t', this names the dispatched type 'fp8_t'. +#define AT_DISPATCH_FP8_CASE(enum_type, ...) \ + AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, fp8_t, __VA_ARGS__) + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ @@ -14,17 +19,32 @@ #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) -// TODO(luka/varun): use FP8_TYPE macro after refactoring -#ifndef USE_ROCM +// ROCm devices might use either fn or fnuz, so set up dispatch table for both. +// A host-based check at runtime will create a preferred FP8 type for ROCm +// such that the correct kernel is dispatched. +#ifdef USE_ROCM + #define VLLM_DISPATCH_CASE_FP8_TYPES(...) \ + AT_DISPATCH_FP8_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \ + AT_DISPATCH_FP8_CASE(at::ScalarType::Float8_e4m3fnuz, __VA_ARGS__) + + #define VLLM_DISPATCH_CASE_QUANT_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fnuz, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) +#else + #define VLLM_DISPATCH_CASE_FP8_TYPES(...) \ + AT_DISPATCH_FP8_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) + #define VLLM_DISPATCH_CASE_QUANT_TYPES(...) \ AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) -#else - #define VLLM_DISPATCH_CASE_QUANT_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fnuz, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) #endif +// When using this dispatch macro, the type is 'fp8_t' not 'scalar_t'. +// See AT_DISPATCH_FP8_CASE above. +#define VLLM_DISPATCH_FP8_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FP8_TYPES(__VA_ARGS__)) + #define VLLM_DISPATCH_QUANT_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_QUANT_TYPES(__VA_ARGS__)) diff --git a/csrc/layernorm_quant_kernels.cu b/csrc/layernorm_quant_kernels.cu index c18e2a4e4abe0..d595b9e889c84 100644 --- a/csrc/layernorm_quant_kernels.cu +++ b/csrc/layernorm_quant_kernels.cu @@ -21,9 +21,9 @@ namespace vllm { // TODO(woosuk): Further optimize this kernel. -template +template __global__ void rms_norm_static_fp8_quant_kernel( - FP8_TYPE* __restrict__ out, // [..., hidden_size] + fp8_type* __restrict__ out, // [..., hidden_size] const scalar_t* __restrict__ input, // [..., hidden_size] const scalar_t* __restrict__ weight, // [hidden_size] const float* __restrict__ scale, // [1] @@ -52,7 +52,7 @@ __global__ void rms_norm_static_fp8_quant_kernel( float x = (float)input[blockIdx.x * hidden_size + idx]; float const out_norm = ((scalar_t)(x * s_variance)) * weight[idx]; out[blockIdx.x * hidden_size + idx] = - scaled_fp8_conversion(out_norm, scale_inv); + scaled_fp8_conversion(out_norm, scale_inv); } } @@ -60,10 +60,10 @@ __global__ void rms_norm_static_fp8_quant_kernel( Additional optimizations we can make in this case are packed and vectorized operations, which help with the memory latency bottleneck. 
*/ -template +template __global__ std::enable_if_t<(width > 0) && _typeConvert::exists> fused_add_rms_norm_static_fp8_quant_kernel( - FP8_TYPE* __restrict__ out, // [..., hidden_size] + fp8_type* __restrict__ out, // [..., hidden_size] scalar_t* __restrict__ input, // [..., hidden_size] scalar_t* __restrict__ residual, // [..., hidden_size] const scalar_t* __restrict__ weight, // [hidden_size] @@ -114,7 +114,7 @@ fused_add_rms_norm_static_fp8_quant_kernel( #pragma unroll for (int i = 0; i < width; ++i) { out[id * width + i] = - scaled_fp8_conversion(float(temp.data[i]), scale_inv); + scaled_fp8_conversion(float(temp.data[i]), scale_inv); } } } @@ -122,10 +122,10 @@ fused_add_rms_norm_static_fp8_quant_kernel( /* Generic fused_add_rms_norm_kernel The width field is not used here but necessary for other specializations. */ -template +template __global__ std::enable_if_t<(width == 0) || !_typeConvert::exists> fused_add_rms_norm_static_fp8_quant_kernel( - FP8_TYPE* __restrict__ out, // [..., hidden_size] + fp8_type* __restrict__ out, // [..., hidden_size] scalar_t* __restrict__ input, // [..., hidden_size] scalar_t* __restrict__ residual, // [..., hidden_size] const scalar_t* __restrict__ weight, // [hidden_size] @@ -158,7 +158,7 @@ fused_add_rms_norm_static_fp8_quant_kernel( float x = (float)residual[blockIdx.x * hidden_size + idx]; float const out_norm = ((scalar_t)(x * s_variance)) * weight[idx]; out[blockIdx.x * hidden_size + idx] = - scaled_fp8_conversion(out_norm, scale_inv); + scaled_fp8_conversion(out_norm, scale_inv); } } @@ -176,25 +176,33 @@ void rms_norm_static_fp8_quant(torch::Tensor& out, // [..., hidden_size] dim3 block(std::min(hidden_size, 1024)); const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] { - vllm::rms_norm_static_fp8_quant_kernel - <<>>( - out.data_ptr(), input.data_ptr(), - weight.data_ptr(), scale.data_ptr(), epsilon, - num_tokens, hidden_size); - }); + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "rms_norm_kernel_scalar_type", [&] { + VLLM_DISPATCH_FP8_TYPES( + out.scalar_type(), "rms_norm_kernel_fp8_type", [&] { + vllm::rms_norm_static_fp8_quant_kernel + <<>>( + out.data_ptr(), input.data_ptr(), + weight.data_ptr(), scale.data_ptr(), + epsilon, num_tokens, hidden_size); + }); + }); } -#define LAUNCH_FUSED_ADD_RMS_NORM(width) \ - VLLM_DISPATCH_FLOATING_TYPES( \ - input.scalar_type(), "fused_add_rms_norm_kernel", [&] { \ - vllm::fused_add_rms_norm_static_fp8_quant_kernel \ - <<>>( \ - out.data_ptr(), input.data_ptr(), \ - residual.data_ptr(), weight.data_ptr(), \ - scale.data_ptr(), epsilon, num_tokens, hidden_size); \ +#define LAUNCH_FUSED_ADD_RMS_NORM(width) \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "fused_add_rms_norm_kernel_scalar_type", [&] { \ + VLLM_DISPATCH_FP8_TYPES( \ + out.scalar_type(), "fused_add_rms_norm_kernel_fp8_type", [&] { \ + vllm::fused_add_rms_norm_static_fp8_quant_kernel \ + <<>>( \ + out.data_ptr(), input.data_ptr(), \ + residual.data_ptr(), \ + weight.data_ptr(), scale.data_ptr(), \ + epsilon, num_tokens, hidden_size); \ + }); \ }); - void fused_add_rms_norm_static_fp8_quant( torch::Tensor& out, // [..., hidden_size], torch::Tensor& input, // [..., hidden_size] diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel.h b/csrc/moe/marlin_kernels/marlin_moe_kernel.h index a217401b3d7c2..47ecf109d0f53 100644 --- a/csrc/moe/marlin_kernels/marlin_moe_kernel.h +++ 
b/csrc/moe/marlin_kernels/marlin_moe_kernel.h @@ -138,8 +138,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; @@ -182,8 +182,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); const int SUB = 0x64006400; const int MUL = 0x2c002c00; diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index d609ce1697df3..d7be769458e35 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -3,7 +3,7 @@ #include #include -#include +#include #include "../cuda_compat.h" #include "../dispatch_utils.h" @@ -33,7 +33,9 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, extern __shared__ int32_t shared_mem[]; int32_t* cumsum = shared_mem; // 1d tensor with shape (num_experts + 1) - token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + blockDim.x + 1); + token_cnts_t* tokens_cnts = + (token_cnts_t*)(shared_mem + num_experts + + 1); // 2d tensor with shape (blockDim.x + 1, num_experts) for (int i = 0; i < num_experts; ++i) { tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0; @@ -195,6 +197,83 @@ __global__ void moe_align_block_size_global_mem_kernel( } } +// taken from +// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957 +template +__global__ void sgl_moe_align_block_size_kernel( + scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids, + int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts, + int32_t block_size, size_t numel, int32_t* cumsum) { + __shared__ int32_t shared_counts[32][8]; + + const int warp_id = threadIdx.x / 32; + const int experts_per_warp = 8; + const int my_expert_start = warp_id * experts_per_warp; + + // Initialize shared_counts for this warp's experts + for (int i = 0; i < experts_per_warp; ++i) { + if (my_expert_start + i < num_experts) { + shared_counts[warp_id][i] = 0; + } + } + + __syncthreads(); + + const size_t tokens_per_thread = CEILDIV(numel, blockDim.x); + const size_t start_idx = threadIdx.x * tokens_per_thread; + + for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { + int expert_id = topk_ids[i]; + int warp_idx = expert_id / experts_per_warp; + int expert_offset = expert_id % experts_per_warp; + atomicAdd(&shared_counts[warp_idx][expert_offset], 1); + } + + __syncthreads(); + + // Single thread computes cumulative sum and total tokens + if (threadIdx.x == 0) { + cumsum[0] = 0; + for (int i = 1; i <= num_experts; ++i) { + int expert_count = 0; + int warp_idx = (i - 1) / experts_per_warp; + int expert_offset = (i - 1) % experts_per_warp; + expert_count = shared_counts[warp_idx][expert_offset]; + + cumsum[i] = + cumsum[i - 1] + CEILDIV(expert_count, block_size) * block_size; + } + *total_tokens_post_pad = cumsum[num_experts]; + } + + 
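+  // Worked example (hypothetical numbers): with block_size = 16 and
+  // per-expert token counts {5, 0, 33}, the padded extents are
+  // CEILDIV(5,16)*16 = 16, CEILDIV(0,16)*16 = 0 and CEILDIV(33,16)*16 = 48,
+  // so cumsum = {0, 16, 16, 64} and *total_tokens_post_pad = 64.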
__syncthreads(); + + // Assign expert IDs to blocks + if (threadIdx.x < num_experts) { + for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; + i += block_size) { + expert_ids[i / block_size] = threadIdx.x; + } + } +} + +// taken from +// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957 +template +__global__ void sgl_moe_token_sort_kernel(scalar_t* __restrict__ topk_ids, + int32_t* sorted_token_ids, + int32_t* cumsum_buffer, + size_t numel) { + const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const size_t stride = blockDim.x * gridDim.x; + + for (size_t i = tid; i < numel; i += stride) { + int32_t expert_id = topk_ids[i]; + int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1); + sorted_token_ids[rank_post_pad] = i; + } +} + template __global__ void moe_sum_kernel( scalar_t* __restrict__ out, // [..., d] @@ -303,6 +382,43 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, } } +void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, + int64_t block_size, + torch::Tensor sorted_token_ids, + torch::Tensor experts_ids, + torch::Tensor num_tokens_post_pad) { + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + TORCH_CHECK(num_experts == 256, + "sgl_moe_align_block_size kernel only supports deepseek v3."); + + VLLM_DISPATCH_INTEGRAL_TYPES( + topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] { + // calc needed amount of shared mem for `cumsum` tensors + auto options_int = + torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device()); + torch::Tensor cumsum_buffer = + torch::zeros({num_experts + 1}, options_int); + + auto align_kernel = + vllm::moe::sgl_moe_align_block_size_kernel; + align_kernel<<<1, 1024, 0, stream>>>( + topk_ids.data_ptr(), sorted_token_ids.data_ptr(), + experts_ids.data_ptr(), + num_tokens_post_pad.data_ptr(), num_experts, block_size, + topk_ids.numel(), cumsum_buffer.data_ptr()); + + const int block_threads = 256; + const int num_blocks = + (topk_ids.numel() + block_threads - 1) / block_threads; + const int max_blocks = 65535; + const int actual_blocks = std::min(num_blocks, max_blocks); + auto sort_kernel = vllm::moe::sgl_moe_token_sort_kernel; + sort_kernel<<>>( + topk_ids.data_ptr(), sorted_token_ids.data_ptr(), + cumsum_buffer.data_ptr(), topk_ids.numel()); + }); +} + void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size] torch::Tensor& output) // [num_tokens, hidden_size] { diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h index 596cc0aa6c855..0bae119a7c460 100644 --- a/csrc/moe/moe_ops.h +++ b/csrc/moe/moe_ops.h @@ -12,3 +12,20 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, int64_t block_size, torch::Tensor sorted_token_ids, torch::Tensor experts_ids, torch::Tensor num_tokens_post_pad); + +void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, + int64_t block_size, + torch::Tensor sorted_token_ids, + torch::Tensor experts_ids, + torch::Tensor num_tokens_post_pad); +#ifndef USE_ROCM +torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, + torch::Tensor b_qweight, torch::Tensor b_scales, + std::optional b_qzeros, + std::optional topk_weights, + torch::Tensor sorted_token_ids, + torch::Tensor expert_ids, + torch::Tensor num_tokens_post_pad, int64_t top_k, + int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N, + int64_t BLOCK_SIZE_K, int64_t bit); +#endif \ No newline at end of file diff --git a/csrc/moe/moe_wna16.cu b/csrc/moe/moe_wna16.cu new file mode 
100644 index 0000000000000..51ae76c1ec882 --- /dev/null +++ b/csrc/moe/moe_wna16.cu @@ -0,0 +1,346 @@ + +#include +#include +#include +#include + +#include +#include +#include "moe_wna16_utils.h" + +#define DIVIDE(x, size) (((x) + (size) - 1) / (size)) + +template +__global__ void moe_wna16_gemm_kernel( + const scalar_t* __restrict__ input, scalar_t* __restrict__ output, + + const uint32_t* __restrict__ qweight, const scalar_t* __restrict__ scales, + const uint32_t* __restrict__ qzeros, + + const float* __restrict__ topk_weights, + const int32_t* __restrict__ sorted_token_ids, + const int32_t* __restrict__ expert_ids, + const int32_t* __restrict__ num_tokens_post_pad, + + uint16_t num_experts, uint16_t group_size, uint16_t top_k, uint32_t size_m, + uint32_t size_n, uint32_t size_k, uint16_t BLOCK_SIZE_M, + uint16_t BLOCK_SIZE_N, uint16_t BLOCK_SIZE_K, bool has_zp, + bool mul_topk_weight) { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800 + if constexpr (std::is_same::value) { + return; + } else { +#endif + + using Dtype = ScalarType; + using scalar_t2 = typename ScalarType::scalar_t2; + + if (blockIdx.x * BLOCK_SIZE_M >= num_tokens_post_pad[0]) return; + + const int32_t offset_n = blockIdx.y * BLOCK_SIZE_N + threadIdx.x; + const int32_t offset_k = blockIdx.z * BLOCK_SIZE_K; + + const int32_t expert_id = expert_ids[blockIdx.x]; + + int32_t num_valid_tokens = 0; + extern __shared__ uint16_t block_input_tmp[]; + scalar_t* block_input = reinterpret_cast(block_input_tmp); + scalar_t2* block_input_half2 = reinterpret_cast(block_input); + + // load BLOCK_SIZE_M * BLOCK_SIZE_K into shared memory + for (int m = 0; m < BLOCK_SIZE_M; m++) { + const int32_t offset_m = blockIdx.x * BLOCK_SIZE_M + m; + const int32_t token_index = sorted_token_ids[offset_m]; + if (token_index / top_k >= size_m) break; + + num_valid_tokens = m + 1; + if (blockIdx.z == 0 && offset_n < size_n) + output[token_index * size_n + offset_n] = Dtype::int2num(0); + + if (expert_id != -1) { + int k_per_thread = DIVIDE(BLOCK_SIZE_K, BLOCK_SIZE_N); + for (int i = 0; i < k_per_thread; i++) { + int k = BLOCK_SIZE_N * i + threadIdx.x; + if (k >= BLOCK_SIZE_K) break; + if (offset_k + k >= size_k) break; + + // load input to shared memory + // use a special layout to fit the layout of dequanted-weight + int origin_k; + if constexpr (bit == 4) { + // [0, 4, 1, 5, 2, 6, 3, 7] + int8_t order = (threadIdx.x % 2) * 4 + ((threadIdx.x % 8) / 2); + origin_k = BLOCK_SIZE_N * i + threadIdx.x / 8 * 8 + order; + } else { + // [0, 2, 1, 3] + int8_t order = (threadIdx.x % 2) * 2 + ((threadIdx.x % 4) / 2); + origin_k = BLOCK_SIZE_N * i + threadIdx.x / 4 * 4 + order; + } + + origin_k += token_index / top_k * size_k + blockIdx.z * BLOCK_SIZE_K; + block_input[m * BLOCK_SIZE_K + k] = input[origin_k]; + } + } + } + + if (expert_id == -1) return; + __syncthreads(); + if (threadIdx.x >= BLOCK_SIZE_N || offset_n >= size_n) return; + + float res[64]; // assume BLOCK_SIZE_M <= 64 + scalar_t2 res2; + scalar_t2 scale_f2; + scalar_t2 qzero_f2; + + // note that (size_n * size_k * expert_id) may greater than 2 ** 31 + constexpr int8_t pack_factor = 32 / bit; + const uint64_t expert_offset = ((uint64_t)size_n) * size_k * expert_id; + const uint32_t* expert_qweight = qweight + expert_offset / pack_factor; + const scalar_t* expert_scales = scales + expert_offset / group_size; + const uint32_t* expert_qzeros = + qzeros + expert_offset / group_size / pack_factor; + + // load 4*int32 one time: 4 int32 = 128 bit = 1 float4 + // weight would be loaded in loop + uint32_t 
expert_qweight_tmp[4]; + float4* expert_qweight_tmp_float4 = + reinterpret_cast(expert_qweight_tmp); + + // load all required scales one time + scalar_t expert_scales_groups[GROUPS]; + int scales_offset_tmp = + (offset_n * size_k + offset_k) / group_size / GROUPS; + if constexpr (GROUPS == 1) { + *expert_scales_groups = expert_scales[scales_offset_tmp]; + } else if constexpr (GROUPS == 2) { + float* expert_scales_groups_tmp = + reinterpret_cast(expert_scales_groups); + *expert_scales_groups_tmp = + reinterpret_cast(expert_scales)[scales_offset_tmp]; + } else if constexpr (GROUPS == 4) { + float2* expert_scales_groups_tmp = + reinterpret_cast(expert_scales_groups); + *expert_scales_groups_tmp = + reinterpret_cast(expert_scales)[scales_offset_tmp]; + } else if constexpr (GROUPS == 8) { + float4* expert_scales_groups_tmp = + reinterpret_cast(expert_scales_groups); + *expert_scales_groups_tmp = + reinterpret_cast(expert_scales)[scales_offset_tmp]; + } + + // load all required qzeros one time + uint8_t expert_qzeros_groups[GROUPS]; + if (!has_zp) { + if constexpr (bit == 4) { + qzero_f2 = Dtype::num2num2(Dtype::int2num(8)); + } else { + qzero_f2 = Dtype::num2num2(Dtype::int2num(128)); + } + } else { + int qzeros_offset_tmp = + (offset_n / (8 / bit)) * (size_k / group_size / GROUPS) + + offset_k / group_size / GROUPS; + if constexpr (GROUPS == 1) { + uint8_t* expert_qzeros_groups_tmp = + reinterpret_cast(expert_qzeros_groups); + *expert_qzeros_groups_tmp = + reinterpret_cast(expert_qzeros)[qzeros_offset_tmp]; + } else if constexpr (GROUPS == 2) { + uint16_t* expert_qzeros_groups_tmp = + reinterpret_cast(expert_qzeros_groups); + *expert_qzeros_groups_tmp = + reinterpret_cast(expert_qzeros)[qzeros_offset_tmp]; + } else if constexpr (GROUPS == 4) { + uint32_t* expert_qzeros_groups_tmp = + reinterpret_cast(expert_qzeros_groups); + *expert_qzeros_groups_tmp = + reinterpret_cast(expert_qzeros)[qzeros_offset_tmp]; + } else if constexpr (GROUPS == 8) { + uint64_t* expert_qzeros_groups_tmp = + reinterpret_cast(expert_qzeros_groups); + *expert_qzeros_groups_tmp = + reinterpret_cast(expert_qzeros)[qzeros_offset_tmp]; + } + } + + for (int tmp_k = 0; tmp_k < BLOCK_SIZE_K / pack_factor; tmp_k++) { + int k = offset_k + tmp_k * pack_factor; + if (k >= size_k) break; + const int32_t weight_offset = offset_n * size_k + k; + + if (tmp_k % 4 == 0) { + *expert_qweight_tmp_float4 = reinterpret_cast( + expert_qweight)[weight_offset / pack_factor / 4]; + } + + if (tmp_k % (group_size / pack_factor) == 0) { + scalar_t scale_f = + expert_scales_groups[tmp_k / (group_size / pack_factor)]; + scale_f2 = Dtype::num2num2(scale_f); + + if (has_zp) { + uint8_t qzero = + expert_qzeros_groups[tmp_k / (group_size / pack_factor)]; + if constexpr (bit == 4) { + qzero = (qzero >> ((threadIdx.x % 2) * 4)) & 0xF; + } + qzero_f2 = Dtype::num2num2(Dtype::int2num(qzero)); + } + } + + scalar_t2 weight_half2[16 / bit]; + dequant(expert_qweight_tmp[tmp_k % 4], weight_half2); + + for (int m = 0; m < num_valid_tokens; m++) { + res2 = {}; + +#pragma unroll + for (int i = 0; i < 16 / bit; i++) { + int32_t offset_input = m * BLOCK_SIZE_K / 2 + tmp_k * (16 / bit) + i; + res2 = __hfma2(__hmul2(__hsub2(weight_half2[i], qzero_f2), scale_f2), + block_input_half2[offset_input], res2); + } + + if (tmp_k == 0) { + res[m] = Dtype::num2float(res2.x) + Dtype::num2float(res2.y); + } else { + res[m] += Dtype::num2float(res2.x) + Dtype::num2float(res2.y); + } + } + } + + for (int m = 0; m < num_valid_tokens; ++m) { + const int32_t token_index = + 
sorted_token_ids[blockIdx.x * BLOCK_SIZE_M + m]; + if (mul_topk_weight) { + res[m] *= topk_weights[token_index]; + } + atomicAdd(&output[token_index * size_n + offset_n], + Dtype::float2num(res[m])); + } + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800 + } +#endif +} + +template +void run_moe_wna16_gemm(const scalar_t* input, scalar_t* output, + const uint32_t* b_qweight, const scalar_t* b_scales, + const uint32_t* b_qzeros, const float* topk_weights, + const int32_t* sorted_token_ids, + const int32_t* expert_ids, + const int32_t* num_tokens_post_pad, int num_experts, + int group_size, int num_token_blocks, int top_k, + int size_m, int size_n, int size_k, int BLOCK_SIZE_M, + int BLOCK_SIZE_N, int BLOCK_SIZE_K, int bit, + bool has_zp, bool mul_topk_weight) { + dim3 blockDim, gridDim; + blockDim.x = BLOCK_SIZE_N; + blockDim.y = 1; + blockDim.z = 1; + gridDim.x = num_token_blocks; + gridDim.y = DIVIDE(size_n, BLOCK_SIZE_N); + gridDim.z = DIVIDE(size_k, BLOCK_SIZE_K); + + auto kernel = moe_wna16_gemm_kernel; + if (bit == 4) { + if (BLOCK_SIZE_K / group_size == 2) { + kernel = moe_wna16_gemm_kernel; + } else if (BLOCK_SIZE_K / group_size == 4) { + kernel = moe_wna16_gemm_kernel; + } else if (BLOCK_SIZE_K / group_size == 8) { + kernel = moe_wna16_gemm_kernel; + } + } else { + if (BLOCK_SIZE_K / group_size == 1) { + kernel = moe_wna16_gemm_kernel; + } else if (BLOCK_SIZE_K / group_size == 2) { + kernel = moe_wna16_gemm_kernel; + } else if (BLOCK_SIZE_K / group_size == 4) { + kernel = moe_wna16_gemm_kernel; + } else if (BLOCK_SIZE_K / group_size == 8) { + kernel = moe_wna16_gemm_kernel; + } + } + + const int shared_mem_size = BLOCK_SIZE_M * BLOCK_SIZE_K * 2; + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + kernel<<>>( + input, output, b_qweight, b_scales, b_qzeros, topk_weights, + sorted_token_ids, expert_ids, num_tokens_post_pad, num_experts, + group_size, top_k, size_m, size_n, size_k, BLOCK_SIZE_M, BLOCK_SIZE_N, + BLOCK_SIZE_K, has_zp, mul_topk_weight); +} + +torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, + torch::Tensor b_qweight, torch::Tensor b_scales, + std::optional b_qzeros, + std::optional topk_weights, + torch::Tensor sorted_token_ids, + torch::Tensor expert_ids, + torch::Tensor num_tokens_post_pad, int64_t top_k, + int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N, + int64_t BLOCK_SIZE_K, int64_t bit) { + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + auto options = + torch::TensorOptions().dtype(input.dtype()).device(input.device()); + + const int num_experts = b_qweight.size(0); + const int size_m = input.size(0); + const int size_n = b_qweight.size(1); + const int size_k = input.size(1); + const int group_size = size_k / b_scales.size(2); + + int64_t EM = sorted_token_ids.size(0); + if (size_m <= BLOCK_SIZE_M) { + EM = min(EM, size_m * BLOCK_SIZE_M * top_k); + } + const int num_token_blocks = (EM + BLOCK_SIZE_M - 1) / BLOCK_SIZE_M; + + const uint32_t* b_qzeros_ptr; + if (b_qzeros.has_value()) + b_qzeros_ptr = (const uint32_t*)b_qzeros.value().data_ptr(); + const float* topk_weights_ptr; + if (topk_weights.has_value()) + topk_weights_ptr = (const float*)topk_weights.value().data_ptr(); + + int groups_per_block_row = BLOCK_SIZE_K / group_size; + TORCH_CHECK(bit == 4 || bit == 8, "bit must be 4 or 8"); + TORCH_CHECK(size_k % BLOCK_SIZE_K == 0, + "size_k must divisible by BLOCK_SIZE_K"); + TORCH_CHECK(BLOCK_SIZE_K % group_size == 0, + "BLOCK_SIZE_K must divisible by group_size"); + TORCH_CHECK(BLOCK_SIZE_M <= 64, "BLOCK_SIZE_M 
must less or equal to 64"); + TORCH_CHECK(groups_per_block_row == 1 || groups_per_block_row == 2 || + groups_per_block_row == 4 || groups_per_block_row == 8, + "BLOCK_SIZE_K // group_size must be one of [1, 2, 4, 8]"); + + if (input.scalar_type() == at::ScalarType::Half) { + run_moe_wna16_gemm( + (const half*)input.data_ptr(), + (half*)output.data_ptr(), + (const uint32_t*)b_qweight.data_ptr(), + (const half*)b_scales.data_ptr(), b_qzeros_ptr, + topk_weights_ptr, sorted_token_ids.data_ptr(), + expert_ids.data_ptr(), num_tokens_post_pad.data_ptr(), + num_experts, group_size, num_token_blocks, top_k, size_m, size_n, + size_k, BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, bit, + b_qzeros.has_value(), topk_weights.has_value()); + } else if (input.scalar_type() == at::ScalarType::BFloat16) { + run_moe_wna16_gemm( + (const nv_bfloat16*)input.data_ptr(), + (nv_bfloat16*)output.data_ptr(), + (const uint32_t*)b_qweight.data_ptr(), + (const nv_bfloat16*)b_scales.data_ptr(), b_qzeros_ptr, + topk_weights_ptr, sorted_token_ids.data_ptr(), + expert_ids.data_ptr(), num_tokens_post_pad.data_ptr(), + num_experts, group_size, num_token_blocks, top_k, size_m, size_n, + size_k, BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, bit, + b_qzeros.has_value(), topk_weights.has_value()); + } else { + TORCH_CHECK(false, "moe_wna16_gemm only supports bfloat16 and float16"); + } + return output; +} diff --git a/csrc/moe/moe_wna16_utils.h b/csrc/moe/moe_wna16_utils.h new file mode 100644 index 0000000000000..4396b80240efe --- /dev/null +++ b/csrc/moe/moe_wna16_utils.h @@ -0,0 +1,200 @@ + +#include +#include + +template +class ScalarType {}; + +template <> +class ScalarType { + public: + using scalar_t = half; + using scalar_t2 = half2; + + static __device__ float inline num2float(const half x) { + return __half2float(x); + } + + static __device__ half2 inline num2num2(const half x) { + return __half2half2(x); + } + + static __device__ half2 inline nums2num2(const half x1, const half x2) { + return __halves2half2(x1, x2); + } + + static __host__ __device__ half inline float2num(const float x) { + return __float2half(x); + } + + static __host__ __device__ half inline int2num(const float x) { + return __int2half_rn(x); + } + + static __host__ __device__ float2 inline num22float2(const half2 x) { + return __half22float2(x); + } + + static __host__ __device__ half2 inline float22num2(const float2 x) { + return __float22half2_rn(x); + } +}; + +template <> +class ScalarType { + public: + using scalar_t = nv_bfloat16; + using scalar_t2 = nv_bfloat162; + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + static __device__ float inline num2float(const nv_bfloat16 x) { + return __bfloat162float(x); + } + + static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) { + return __bfloat162bfloat162(x); + } + + static __device__ nv_bfloat162 inline nums2num2(const nv_bfloat16 x1, + const nv_bfloat16 x2) { + return __halves2bfloat162(x1, x2); + } + + static __host__ __device__ nv_bfloat16 inline float2num(const float x) { + return __float2bfloat16(x); + } + + static __host__ __device__ nv_bfloat16 inline int2num(const float x) { + return __int2bfloat16_rn(x); + } + + static __host__ __device__ float2 inline num22float2(const nv_bfloat162 x) { + return __bfloat1622float2(x); + } + + static __host__ __device__ nv_bfloat162 inline float22num2(const float2 x) { + return __float22bfloat162_rn(x); + } +#endif +}; + +template +__device__ inline int lop3(int a, int b, int c) { + int res; + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : 
"=r"(res) + : "r"(a), "r"(b), "r"(c), "n"(lut)); + return res; +} + +template +__device__ inline uint32_t prmt(uint32_t a) { + uint32_t res; + asm volatile("prmt.b32 %0, %1, %2, %3;\n" + : "=r"(res) + : "r"(a), "n"(start_byte), "n"(mask)); + return res; +} + +template +__device__ inline void dequant(int q, scalar_t2* res) {} + +template <> +__device__ inline void dequant(int q, half2* res) { + const int LO = 0x000f000f; + const int HI = 0x00f000f0; + const int EX = 0x64006400; + const int SUB = 0x64006400; + const int MUL = 0x2c002c00; + const int ADD = 0xd400d400; + + int lo0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + q >>= 8; + int lo1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + + res[0] = __hsub2(*reinterpret_cast(&lo0), + *reinterpret_cast(&SUB)); + res[1] = __hfma2(*reinterpret_cast(&hi0), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + res[2] = __hsub2(*reinterpret_cast(&lo1), + *reinterpret_cast(&SUB)); + res[3] = __hfma2(*reinterpret_cast(&hi1), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); +} + +template <> +__device__ inline void dequant(int q, half2* res) { + static constexpr uint32_t mask_for_elt_01 = 0x5250; + static constexpr uint32_t mask_for_elt_23 = 0x5351; + static constexpr uint32_t start_byte_for_fp16 = 0x64646464; + + uint32_t lo = prmt(q); + uint32_t hi = prmt(q); + + static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64006400; + + res[0] = __hsub2(*reinterpret_cast(&lo), + *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); + res[1] = __hsub2(*reinterpret_cast(&hi), + *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); +} + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +template <> +__device__ inline void dequant(int q, nv_bfloat162* res) { + static constexpr uint32_t MASK = 0x000f000f; + static constexpr uint32_t EX = 0x43004300; + + int lo0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + q >>= 4; + int hi0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + q >>= 4; + int lo1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + q >>= 4; + int hi1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + + static constexpr uint32_t MUL = 0x3F803F80; + static constexpr uint32_t ADD = 0xC300C300; + + res[0] = __hfma2(*reinterpret_cast(&lo0), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + res[1] = __hfma2(*reinterpret_cast(&hi0), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + res[2] = __hfma2(*reinterpret_cast(&lo1), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + res[3] = __hfma2(*reinterpret_cast(&hi1), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); +} + +template <> +__device__ inline void dequant(int q, nv_bfloat162* res) { + float fp32_intermediates[4]; + uint32_t* fp32_intermediates_casted = + reinterpret_cast(fp32_intermediates); + + static constexpr uint32_t fp32_base = 0x4B000000; + fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650); + fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652); + fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651); + fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653); + + fp32_intermediates[0] -= 8388608.f; + fp32_intermediates[1] -= 8388608.f; + fp32_intermediates[2] -= 8388608.f; + fp32_intermediates[3] -= 8388608.f; + + uint32_t* bf16_result_ptr = reinterpret_cast(res); + bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0], + fp32_intermediates_casted[1], 0x7632); + bf16_result_ptr[1] = 
__byte_perm(fp32_intermediates_casted[2], + fp32_intermediates_casted[3], 0x7632); +} +#endif diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index f3a558c14ab93..718418e6cd497 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -22,7 +22,26 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { " Tensor! num_tokens_post_pad) -> ()"); m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size); + // temporarily adapted from + // https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a + m.def( + "sgl_moe_align_block_size(Tensor topk_ids, int num_experts," + " int block_size, Tensor! sorted_token_ids," + " Tensor! experts_ids," + " Tensor! num_tokens_post_pad) -> ()"); + m.impl("sgl_moe_align_block_size", torch::kCUDA, &sgl_moe_align_block_size); + #ifndef USE_ROCM + m.def( + "moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, " + "Tensor b_scales, Tensor? b_qzeros, " + "Tensor? topk_weights, Tensor sorted_token_ids, " + "Tensor expert_ids, Tensor num_tokens_post_pad, " + "int top_k, int BLOCK_SIZE_M, int BLOCK_SIZE_N, int BLOCK_SIZE_K, " + "int bit) -> Tensor"); + + m.impl("moe_wna16_gemm", torch::kCUDA, &moe_wna16_gemm); + m.def( "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, " "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! " @@ -33,6 +52,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { "int moe_block_size, bool replicate_input, bool apply_weights)" " -> Tensor"); // conditionally compiled so impl registration is in source file + #endif } diff --git a/csrc/ops.h b/csrc/ops.h index 6d494fee681d2..dc7216319a54a 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -163,8 +163,24 @@ torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X, torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int64_t type, int64_t row); +torch::Tensor ggml_moe_a8(torch::Tensor X, torch::Tensor W, + torch::Tensor sorted_token_ids, + torch::Tensor expert_ids, + torch::Tensor num_tokens_post_padded, int64_t type, + int64_t row, int64_t top_k, int64_t tokens); + +int64_t ggml_moe_get_block_size(int64_t type); + #ifndef USE_ROCM + +bool cutlass_scaled_mm_supports_fp4(int64_t cuda_device_capability); bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability); +bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability); + +void cutlass_scaled_fp4_mm(torch::Tensor& D, torch::Tensor const& A, + torch::Tensor const& B, torch::Tensor const& A_sf, + torch::Tensor const& B_sf, + torch::Tensor const& alpha); void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, @@ -187,8 +203,11 @@ void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b_scales, std::optional const& bias); -bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed, - torch::Tensor& e, torch::Tensor const& a); +std::vector cutlass_sparse_compress(torch::Tensor const& a); + +void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input, + torch::Tensor& output_scale, + torch::Tensor const& input_scale); #endif void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, diff --git a/csrc/pos_encoding_kernels.cu b/csrc/pos_encoding_kernels.cu index 97184a8735593..c085d31a3e9b1 100644 --- a/csrc/pos_encoding_kernels.cu +++ b/csrc/pos_encoding_kernels.cu @@ -124,18 +124,54 @@ __global__ void batched_rotary_embedding_kernel( void rotary_embedding( 
torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or - // [num_tokens, num_heads * head_size] + // [num_tokens, num_heads * head_size] or + // [batch_size, seq_len, num_heads, head_size] or + // [num_tokens, num_heads, head_size] torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or - // [num_tokens, num_kv_heads * head_size] + // [num_tokens, num_kv_heads * head_size] or + // [batch_size, seq_len, num_heads, head_size] or + // [num_tokens, num_heads, head_size] int64_t head_size, torch::Tensor& cos_sin_cache, // [max_position, rot_dim] bool is_neox) { - int64_t num_tokens = query.numel() / query.size(-1); + // num_tokens = batch_size * seq_len + int64_t num_tokens = positions.numel(); + int positions_ndim = positions.dim(); + + // Make sure num_tokens dim is consistent across positions, query, and key. + TORCH_CHECK( + positions_ndim == 1 || positions_ndim == 2, + "positions must have shape [num_tokens] or [batch_size, seq_len]"); + if (positions_ndim == 1) { + TORCH_CHECK( + query.size(0) == positions.size(0) && key.size(0) == positions.size(0), + "query, key and positions must have the same number of tokens"); + } + if (positions_ndim == 2) { + TORCH_CHECK( + query.size(0) == positions.size(0) && + key.size(0) == positions.size(0) && + query.size(1) == positions.size(1) && + key.size(1) == positions.size(1), + "query, key and positions must have the same batch_size and seq_len"); + } + + // Make sure head_size is valid for query and key + // hidden_size = num_heads * head_size + int query_hidden_size = query.numel() / num_tokens; + int key_hidden_size = key.numel() / num_tokens; + TORCH_CHECK(query_hidden_size % head_size == 0); + TORCH_CHECK(key_hidden_size % head_size == 0); + + // Make sure query and key have consistent number of heads + int num_heads = query_hidden_size / head_size; + int num_kv_heads = key_hidden_size / head_size; + TORCH_CHECK(num_heads % num_kv_heads == 0); + int rot_dim = cos_sin_cache.size(1); - int num_heads = query.size(-1) / head_size; - int num_kv_heads = key.size(-1) / head_size; - int64_t query_stride = query.stride(-2); - int64_t key_stride = key.stride(-2); + int seq_dim_idx = positions_ndim - 1; + int64_t query_stride = query.stride(seq_dim_idx); + int64_t key_stride = key.stride(seq_dim_idx); dim3 grid(num_tokens); dim3 block(std::min(num_heads * rot_dim / 2, 512)); @@ -165,19 +201,58 @@ and process in batched manner. 
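With these checks, `rotary_embedding` accepts both the flattened `[num_tokens, num_heads * head_size]` layout and the unflattened `[..., num_heads, head_size]` layout, since `num_tokens` is now derived from `positions` rather than from `query.size(-1)`. A minimal sketch with purely illustrative shapes:

```cpp
#include <torch/torch.h>

int main() {
  // Illustrative shapes only; num_tokens = positions.numel() = 8.
  auto positions = torch::arange(8, torch::kLong).view({2, 4});  // [batch=2, seq_len=4]
  auto query = torch::randn({2, 4, 32, 128}, torch::kHalf);      // [b, s, num_heads, head_size]
  auto key   = torch::randn({2, 4, 8, 128}, torch::kHalf);       // [b, s, num_kv_heads, head_size]
  // query_hidden_size = 32 * 128 and key_hidden_size = 8 * 128 are both
  // divisible by head_size = 128, and 32 heads % 8 kv-heads == 0, so the
  // shape checks above pass for this layout as well as the flattened one.
  return 0;
}
```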
void batched_rotary_embedding( torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or - // [num_tokens, num_heads * head_size] + // [num_tokens, num_heads * head_size] or + // [batch_size, seq_len, num_heads, head_size] or + // [num_tokens, num_heads, head_size] torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or - // [num_tokens, num_kv_heads * head_size] + // [num_tokens, num_kv_heads * head_size] or + // [batch_size, seq_len, num_heads, head_size] or + // [num_tokens, num_heads, head_size] int64_t head_size, torch::Tensor& cos_sin_cache, // [max_position, rot_dim] bool is_neox, int64_t rot_dim, - torch::Tensor& cos_sin_cache_offsets // [num_tokens] + torch::Tensor& cos_sin_cache_offsets // [num_tokens] or [batch_size] ) { + // num_tokens = batch_size * seq_len int64_t num_tokens = cos_sin_cache_offsets.size(0); - int num_heads = query.size(-1) / head_size; - int num_kv_heads = key.size(-1) / head_size; - int64_t query_stride = query.stride(-2); - int64_t key_stride = key.stride(-2); + TORCH_CHECK( + positions.size(0) == num_tokens || positions.numel() == num_tokens, + "positions must have the same num_tokens or batch_size as " + "cos_sin_cache_offsets"); + + int positions_ndim = positions.dim(); + // Make sure num_tokens dim is consistent across positions, query, and key. + TORCH_CHECK( + positions_ndim == 1 || positions_ndim == 2, + "positions must have shape [num_tokens] or [batch_size, seq_len]"); + if (positions_ndim == 1) { + TORCH_CHECK( + query.size(0) == positions.size(0) && key.size(0) == positions.size(0), + "query, key and positions must have the same number of tokens"); + } + if (positions_ndim == 2) { + TORCH_CHECK( + query.size(0) == positions.size(0) && + key.size(0) == positions.size(0) && + query.size(1) == positions.size(1) && + key.size(1) == positions.size(1), + "query, key and positions must have the same batch_size and seq_len"); + } + + // Make sure head_size is valid for query and key + int query_hidden_size = query.numel() / num_tokens; + int key_hidden_size = key.numel() / num_tokens; + TORCH_CHECK(query_hidden_size % head_size == 0); + TORCH_CHECK(key_hidden_size % head_size == 0); + + // Make sure query and key have concistent number of heads + int num_heads = query_hidden_size / head_size; + int num_kv_heads = key_hidden_size / head_size; + TORCH_CHECK(num_heads % num_kv_heads == 0); + + int seq_dim_idx = positions_ndim - 1; + int64_t query_stride = query.stride(seq_dim_idx); + int64_t key_stride = key.stride(seq_dim_idx); dim3 grid(num_tokens); dim3 block(std::min(num_heads * rot_dim / 2, 512)); diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu index c3902f4c2a163..fea4bc2ca0d8f 100644 --- a/csrc/prepare_inputs/advance_step.cu +++ b/csrc/prepare_inputs/advance_step.cu @@ -274,7 +274,7 @@ void advance_step_flashinfer( cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev); - int block_tables_stride = block_tables.stride(0); + [[maybe_unused]] int block_tables_stride = block_tables.stride(0); TORCH_CHECK((blocks * threads > num_queries), "multi-step: not enough threads to map to num_queries = ", num_queries, " block_tables.stride(0) = ", block_tables.stride(0), diff --git a/csrc/quantization/awq/gemm_kernels.cu b/csrc/quantization/awq/gemm_kernels.cu index 9da724a1b43c3..53c47679cdd72 100644 --- a/csrc/quantization/awq/gemm_kernels.cu 
+++ b/csrc/quantization/awq/gemm_kernels.cu @@ -334,7 +334,7 @@ __global__ void __launch_bounds__(64) } // TODO: Shang: Hoist loop invariance. - for (int ax1_0_1 = 0; ax1_0_1 < 4; ++ax1_0_1) { + for (int ax1_0_1 = 0; ax1_0_1 < (N / 32); ++ax1_0_1) { for (int local_id = 0; local_id < 8; ++local_id) { int row_offset = (((int)blockIdx_y) / j_factors1) * 16 + ((int)threadIdx.x) / 4 + (local_id % 4) / 2 * 8; diff --git a/csrc/quantization/cutlass_w8a8/Epilogues.md b/csrc/quantization/cutlass_w8a8/Epilogues.md index aae04157b10de..a30e1fdf3ac77 100644 --- a/csrc/quantization/cutlass_w8a8/Epilogues.md +++ b/csrc/quantization/cutlass_w8a8/Epilogues.md @@ -1,17 +1,19 @@ # CUTLASS Epilogues ## Introduction -This document describes the various CUTLASS epilogues implemented for fusing de-quantization operations onto GEMMs. + +This document describes the various CUTLASS epilogues implemented for fusing de-quantization operations onto GEMMs. Currently, we only support symmetric quantization for weights, and symmetric and asymmetric quantization for activations. Both can be quantized per-tensor or per-channel (weights) / per-token (activations). There are 4 epilogues: -1. ScaledEpilogue: symmetric quantization for activations, no bias. -1. ScaledEpilogueBias: symmetric quantization for activations, supports bias. -1. ScaledEpilogueAzp: asymmetric per-tensor quantization for activations, supports bias. -1. ScaledEpilogueAzpPerToken: asymmetric per-token quantization for activations, supports bias. + +1. `ScaledEpilogue`: symmetric quantization for activations, no bias. +1. `ScaledEpilogueBias`: symmetric quantization for activations, supports bias. +1. `ScaledEpilogueAzp`: asymmetric per-tensor quantization for activations, supports bias. +1. `ScaledEpilogueAzpPerToken`: asymmetric per-token quantization for activations, supports bias. We do not have epilogues for asymmetric quantization of activations without bias in order to reduce final binary size. Instead, if no bias is passed, the epilogue will use 0 as the bias. @@ -26,12 +28,15 @@ If $` \widehat X `$ is the quantized $` X `$, our matrices become the following ```math A = s_a (\widehat A - J_a z_a) ``` + ```math B = s_b \widehat B ``` + ```math D = A B + C ``` + ```math D = s_a s_b \widehat D + C ``` @@ -48,9 +53,11 @@ Expanding further, we can calculate $` \widehat D `$ as follows: ```math A B = s_a ( \widehat A - J_a z_a ) s_b \widehat B ``` + ```math A B = s_a s_b \left( \widehat A \widehat B - J_a z_a \widehat B \right) ``` + ```math \widehat D = \widehat A \widehat B - z_a J_a \widehat B ``` @@ -61,16 +68,19 @@ Each row of it is equal to $` \mathbf 1 \widehat B `$, which is a row-vector of ## Epilogues -### ScaledEpilogue +### `ScaledEpilogue` + This epilogue computes the symmetric quantization for activations without bias, meaning $` C = 0 `$ and $` z_a = 0 `$. The output of the GEMM is: ```math \widehat D = \widehat A \widehat B ``` + ```math D = s_a s_b \widehat D ``` + ```math D = s_a s_b \widehat A \widehat B ``` @@ -79,44 +89,51 @@ Epilogue parameters: - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). -### ScaledEpilogueBias +### `ScaledEpilogueBias` + This epilogue computes the symmetric quantization for activations with bias, meaning $` z_a = 0 `$. 
The output of the GEMM is: ```math \widehat D = \widehat A \widehat B ``` + ```math D = s_a s_b \widehat D + C ``` + ```math D = s_a s_b \widehat A \widehat B + C ``` - Epilogue parameters: + - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). - `bias` is the bias, is always per-channel (row-vector). -### ScaledEpilogueAzp +### `ScaledEpilogueAzp` + This epilogue computes the asymmetric per-tensor quantization for activations with bias. The output of the GEMM is: ```math \widehat D = \widehat A \widehat B - z_a J_a \widehat B ``` + ```math D = s_a s_b \widehat D + C ``` + ```math D = s_a s_b \left( \widehat A \widehat B - z_a J_a \widehat B \right) + C ``` -Because $` z_a `$ is a scalar, the zero-point term $` z_a J_a \widehat B `$ has every row equal to $` z_a \mathbf 1 B `$. +Because $` z_a `$ is a scalar, the zero-point term $` z_a J_a \widehat B `$ has every row equal to $` z_a \mathbf 1 B `$. That is precomputed and stored in `azp_with_adj` as a row-vector. Epilogue parameters: + - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). - Generally this will be per-tensor as the zero-points are per-tensor. - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). @@ -125,13 +142,15 @@ Epilogue parameters: To use these kernels efficiently, users must precompute the `azp_with_adj` term offline and pass it to the kernel. -### ScaledEpilogueAzpPerToken +### `ScaledEpilogueAzpPerToken` + This epilogue computes the asymmetric per-token quantization for activations with bias. The output of the GEMM is the same as above, but the $` z_a `$ is a column-vector. That means the zero-point term $` z_a J_a \widehat B `$ becomes an outer product of $` z_a `$ and $` \mathbf 1 \widehat B `$. Epilogue parameters: + - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). - Generally this will be per-token as the zero-points are per-token. - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). @@ -142,6 +161,7 @@ Epilogue parameters: To use these kernels efficiently, users must precompute the `azp_adj` term offline and pass it to the kernel. 
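Since every row of $` z_a J_a \widehat B `$ is the row-vector $` \mathbf 1 \widehat B `$, the `azp_adj` term is simply the per-column sum of the quantized weight over the reduction dimension. A minimal offline sketch (the tensor name `b_q` and its `[k, n]` int8 layout are assumptions for illustration):

```cpp
#include <torch/torch.h>

// Illustrative only: azp_adj = 1^T * B_hat, i.e. column sums of the quantized
// weight accumulated in int32, computed once offline and passed to the kernel.
torch::Tensor compute_azp_adj(const torch::Tensor& b_q /* [k, n], int8 */) {
  return b_q.sum(/*dim=*/{0}, /*keepdim=*/false, torch::kInt32);  // shape [n]
}
```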
The epilogue performs the following computation (where `Dq` is the raw quantized output of the GEMM): -``` + +```math out = scale_a * scale_b * (Dq - azp_adj * azp) + bias ``` diff --git a/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh b/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh new file mode 100644 index 0000000000000..26de32ce2b16a --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh @@ -0,0 +1,107 @@ +#pragma once + +// clang-format will break include orders +// clang-format off +#include + +#include + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/util/packed_stride.hpp" + +#include "core/math.hpp" +#include "cutlass_extensions/common.hpp" +// clang-format on + +namespace vllm::c3x { + +static inline cute::Shape get_problem_shape( + torch::Tensor const& a, torch::Tensor const& b) { + int32_t m = a.size(0), n = b.size(1), k = a.size(1); + return {m, n, k, 1}; +} + +template +void cutlass_gemm_caller( + torch::Device device, cute::Shape prob_shape, + typename GemmKernel::MainloopArguments mainloop_args, + typename GemmKernel::EpilogueArguments epilogue_args, + typename GemmKernel::TileSchedulerArguments scheduler = {}) { + cutlass::KernelHardwareInfo hw_info; + typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm, + prob_shape, + mainloop_args, + epilogue_args, + hw_info, + scheduler}; + + // Launch the CUTLASS GEMM kernel. + using GemmOp = cutlass::gemm::device::GemmUniversalAdapter; + GemmOp gemm_op; + CUTLASS_CHECK(gemm_op.can_implement(args)); + + size_t workspace_size = gemm_op.get_workspace_size(args); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(device); + auto workspace = torch::empty(workspace_size, workspace_options); + + auto stream = at::cuda::getCurrentCUDAStream(device.index()); + + cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream); + CUTLASS_CHECK(status); +} + +template +void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... 
epilogue_params) { + using ElementAB = typename Gemm::ElementAB; + using ElementC = typename Gemm::ElementC; + using ElementD = typename Gemm::ElementD; + using GemmKernel = typename Gemm::GemmKernel; + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = StrideC; + using StrideAux = StrideC; + + typename GemmKernel::ProblemShape prob_shape = get_problem_shape(a, b); + auto [M, N, K, L] = prob_shape; + + StrideA a_stride = + cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + StrideB b_stride = + cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + StrideC c_stride = + cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + StrideD d_stride = + cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + StrideAux aux_stride = d_stride; + + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); + typename GemmKernel::MainloopArguments mainloop_args{a_ptr, a_stride, b_ptr, + b_stride}; + + auto c_ptr = static_cast(out.data_ptr()); + // auto d_ptr = static_cast(out.data_ptr()); + typename GemmKernel::EpilogueArguments epilogue_args{ + Gemm::Epilogue::prepare_args( + std::forward(epilogue_params)...), + c_ptr, c_stride, c_ptr, d_stride}; + + cutlass_gemm_caller(a.device(), prob_shape, mainloop_args, + epilogue_args); +} + +} // namespace vllm::c3x \ No newline at end of file diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh new file mode 100644 index 0000000000000..8f4df836bcc8d --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh @@ -0,0 +1,147 @@ +#pragma once + +// clang-format will break include orders +// clang-format off + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "core/math.hpp" +#include "cutlass_extensions/common.hpp" +// clang-format on + +/* + Epilogues defined in, + csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp, + must contain a public type named EVTCompute of type Sm90EVT, as well as a + static prepare_args function that constructs an EVTCompute::Arguments struct. 
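+  A minimal outline of such an epilogue (names and parameters here are
+  illustrative only, not the actual vLLM epilogues):
+
+    struct ExampleEpilogue {
+      using EVTCompute = cutlass::epilogue::fusion::Sm90EVT</* compute node,
+          scale/bias broadcast nodes, accumulator fetch */>;
+      static typename EVTCompute::Arguments prepare_args(
+          torch::Tensor const& a_scales, torch::Tensor const& b_scales);
+    };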
+*/ + +using namespace cute; + +namespace vllm { + +template typename Epilogue_, + typename TileShape, typename ClusterShape, typename KernelSchedule, + typename EpilogueSchedule> +struct cutlass_3x_gemm { + using ElementAB = ElementAB_; + using ElementD = ElementD_; + using ElementAcc = + typename std::conditional, int32_t, + float>::type; + + using Epilogue = Epilogue_; + + using StrideD = Stride, Int<0>>; + using ElementC = void; + using StrideC = StrideD; + + using EVTCompute = typename Epilogue::EVTCompute; + + // These are the minimum alignments needed for the kernels to compile + static constexpr int AlignmentAB = + 128 / cutlass::sizeof_bits::value; + static constexpr int AlignmentCD = 4; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape, + ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, + ElementAcc, float, ElementC, StrideC, AlignmentCD, ElementD, StrideD, + AlignmentCD, EpilogueSchedule, EVTCompute>::CollectiveOp; + + static constexpr size_t CEStorageSize = + sizeof(typename CollectiveEpilogue::SharedStorage); + using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(CEStorageSize)>; + + // clang-format off + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + ElementAB, cutlass::layout::RowMajor, AlignmentAB, + ElementAB, cutlass::layout::ColumnMajor, AlignmentAB, + ElementAcc, TileShape, ClusterShape, + Stages, + KernelSchedule>::CollectiveOp; + // clang-format on + + using KernelType = enable_sm90_or_later, CollectiveMainloop, CollectiveEpilogue, + cutlass::gemm::PersistentScheduler>>; + + struct GemmKernel : public KernelType {}; +}; + +template typename Epilogue_, + typename TileShape, typename ClusterShape, typename KernelSchedule, + typename EpilogueSchedule> +struct cutlass_3x_gemm_sm100 { + using ElementAB = ElementAB_; + using LayoutA = cutlass::layout::RowMajor; + static constexpr int AlignmentA = + 128 / cutlass::sizeof_bits::value; + + using LayoutB = cutlass::layout::ColumnMajor; + static constexpr int AlignmentB = + 128 / cutlass::sizeof_bits::value; + + using ElementC = void; + using LayoutC = cutlass::layout::RowMajor; + static constexpr int AlignmentC = + 128 / cutlass::sizeof_bits::value; + + using ElementD = ElementD_; + using LayoutD = cutlass::layout::RowMajor; + static constexpr int AlignmentD = AlignmentC; + + using ElementAcc = + typename std::conditional, int32_t, + float>::type; + using Epilogue = Epilogue_; + + // MMA type + using ElementAccumulator = float; + + // Epilogue types + using ElementBias = cutlass::half_t; + using ElementCompute = float; + using ElementAux = ElementD; + using LayoutAux = LayoutD; + using ElementAmax = float; + + using EVTCompute = typename Epilogue::EVTCompute; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, TileShape, + ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, ElementCompute, ElementC, LayoutC, AlignmentC, + ElementD, LayoutD, AlignmentD, EpilogueSchedule, + EVTCompute>::CollectiveOp; + + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, ElementAB, + LayoutA, AlignmentA, ElementAB, LayoutB, AlignmentB, + ElementAccumulator, TileShape, ClusterShape, + 
cutlass::gemm::collective::StageCountAutoCarveout( + sizeof(typename CollectiveEpilogue::SharedStorage))>, + KernelSchedule>::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, CollectiveMainloop, CollectiveEpilogue, void>; +}; + +} // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu new file mode 100644 index 0000000000000..4cd38f4975df7 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu @@ -0,0 +1,24 @@ +#include "scaled_mm_kernels.hpp" +#include "scaled_mm_sm90_int8_dispatch.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" + +namespace vllm { + +void cutlass_scaled_mm_azp_sm90_int8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + std::optional const& azp, + std::optional const& bias) { + if (azp) { + return cutlass_scaled_mm_sm90_int8_epilogue< + c3x::ScaledEpilogueBiasAzpToken>(out, a, b, a_scales, b_scales, azp_adj, + *azp, bias); + } else { + return cutlass_scaled_mm_sm90_int8_epilogue( + out, a, b, a_scales, b_scales, azp_adj, bias); + } +} + +} // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu new file mode 100644 index 0000000000000..0501e6da160e2 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu @@ -0,0 +1,24 @@ + +#include "scaled_mm_kernels.hpp" +#include "scaled_mm_blockwise_sm90_fp8_dispatch.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" + +namespace vllm { + +void cutlass_scaled_mm_blockwise_sm90_fp8(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + if (out.dtype() == torch::kBFloat16) { + cutlass_gemm_blockwise_sm90_fp8_dispatch( + out, a, b, a_scales, b_scales); + + } else { + TORCH_CHECK(out.dtype() == torch::kFloat16); + cutlass_gemm_blockwise_sm90_fp8_dispatch( + out, a, b, a_scales, b_scales); + } +} + +} // namespace vllm \ No newline at end of file diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh new file mode 100644 index 0000000000000..e089c3d4be2cc --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh @@ -0,0 +1,194 @@ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" + +#include "cute/tensor.hpp" +#include "cutlass/tensor_ref.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/gemm/kernel/tile_scheduler_params.h" +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" + +#include "cutlass_extensions/gemm/dispatch_policy.hpp" +#include "cutlass_extensions/gemm/collective/collective_builder.hpp" + +#include "cutlass_gemm_caller.cuh" + +namespace vllm { + +using namespace cute; + +template > +struct cutlass_3x_gemm_fp8_blockwise { + using GroupSizeM = Int; + using GroupSizeN = Int; + using GroupSizeK = Int; + using TileSizeM = Int; + + static_assert(TileSizeM_ % 
GroupSizeM_ == 0, + "TileSizeM must be a multiple of GroupSizeM"); + + using ElementAB = cutlass::float_e4m3_t; + + using ElementA = ElementAB; + using LayoutA = cutlass::layout::RowMajor; + static constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; + + using ElementB = ElementAB; + using LayoutB = cutlass::layout::ColumnMajor; + static constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; + + using ElementD = OutType; + using StrideD = Stride, Int<0>>; + static constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; + + using ElementC = void; + using StrideC = StrideD; + static constexpr int AlignmentC = AlignmentD; + + using ElementAccumulator = float; + using ElementBlockScale = float; + using ElementCompute = float; + using ArchTag = cutlass::arch::Sm90; + using OperatorClass = cutlass::arch::OpClassTensorOp; + using TileShape = Shape; + + using KernelSchedule = cutlass::gemm:: + KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum< + GroupSizeM_>; + using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative; + using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; + + using StoreEpilogueCompute = typename cutlass::epilogue::fusion::Sm90EVT< + cutlass::epilogue::fusion::Sm90AccFetch>; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, OperatorClass, TileShape, ClusterShape, EpilogueTileType, + ElementAccumulator, ElementCompute, ElementC, StrideC, AlignmentC, + ElementD, StrideD, AlignmentD, EpilogueSchedule, + StoreEpilogueCompute>::CollectiveOp; + + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, ElementA, LayoutA, AlignmentA, ElementB, + LayoutB, AlignmentB, ElementAccumulator, TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout( + sizeof(typename CollectiveEpilogue::SharedStorage))>, + KernelSchedule>::CollectiveOp; + + using KernelType = enable_sm90_or_later, CollectiveMainloop, CollectiveEpilogue, + SchedulerType>>; + + struct GemmKernel : public KernelType {}; + + using StrideA = typename GemmKernel::StrideA; + using StrideB = typename GemmKernel::StrideB; +}; + +template +void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + using GemmKernel = typename Gemm::GemmKernel; + + using ElementAB = typename Gemm::ElementAB; + using ElementD = typename Gemm::ElementD; + + auto prob_shape = c3x::get_problem_shape(a, b); + int32_t m = get<0>(prob_shape), n = get<1>(prob_shape), + k = get<2>(prob_shape); + + int64_t lda = a.stride(0); + int64_t ldb = b.stride(1); + int64_t ldc = out.stride(0); + + using StrideA = Stride, int64_t>; + using StrideB = Stride, int64_t>; + using StrideC = typename Gemm::StrideC; + + StrideA a_stride{lda, Int<1>{}, 0}; + StrideB b_stride{ldb, Int<1>{}, 0}; + StrideC c_stride{ldc, Int<1>{}, Int<0>{}}; + + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); + auto a_scales_ptr = static_cast(a_scales.data_ptr()); + auto b_scales_ptr = static_cast(b_scales.data_ptr()); + + // Check is the t is contiguous and is 1D or 2D with one of the dimensions + // being 1 (i.e. 
a row or column vector) + auto is_contiguous_vector = [](const torch::Tensor& t) { + auto t_sizes = t.sizes(); + return t.is_contiguous() && + (t.dim() == 1 || + (t.dim() == 2 && + *std::min_element(t_sizes.begin(), t_sizes.end()) == 1)); + }; + + // TODO(lucas): lets clean-up the kernel so that we pass in Strides so + // we don't have to deal with enforcing implicit layouts + TORCH_CHECK(a_scales.size(0) == m / Gemm::GroupSizeM::value); + TORCH_CHECK(a_scales.size(1) == k / Gemm::GroupSizeK::value); + TORCH_CHECK(a_scales.stride(0) == 1 || is_contiguous_vector(a_scales), + "a_scales must be M major"); + TORCH_CHECK(b_scales.size(0) == k / Gemm::GroupSizeK::value); + TORCH_CHECK(b_scales.size(1) == n / Gemm::GroupSizeN::value); + TORCH_CHECK(b_scales.stride(0) == 1 || is_contiguous_vector(b_scales), + "b_scales must be K major"); + typename GemmKernel::MainloopArguments mainloop_args{ + a_ptr, a_stride, b_ptr, b_stride, a_scales_ptr, b_scales_ptr}; + + auto c_ptr = static_cast(out.data_ptr()); + typename GemmKernel::EpilogueArguments epilogue_args{ + {}, c_ptr, c_stride, c_ptr, c_stride}; + + typename GemmKernel::TileSchedulerArguments scheduler; + + static constexpr bool UsesStreamKScheduler = + cute::is_same_v; + + if constexpr (UsesStreamKScheduler) { + using DecompositionMode = typename cutlass::gemm::kernel::detail:: + PersistentTileSchedulerSm90StreamKParams::DecompositionMode; + using ReductionMode = typename cutlass::gemm::kernel::detail:: + PersistentTileSchedulerSm90StreamKParams::ReductionMode; + + scheduler.decomposition_mode = DecompositionMode::StreamK; + scheduler.reduction_mode = ReductionMode::Nondeterministic; + } + + c3x::cutlass_gemm_caller(a.device(), prob_shape, mainloop_args, + epilogue_args, scheduler); +} + +template +void cutlass_gemm_blockwise_sm90_fp8_dispatch(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + auto k = a.size(1); + auto n = b.size(1); + + if (k > 3 * n) { + cutlass_gemm_caller_blockwise>( + out, a, b, a_scales, b_scales); + } else { + cutlass_gemm_caller_blockwise>( + out, a, b, a_scales, b_scales); + } +} + +} // namespace vllm \ No newline at end of file diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp new file mode 100644 index 0000000000000..85272804774db --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp @@ -0,0 +1,39 @@ +#pragma once + +#include + +namespace vllm { + +void cutlass_scaled_mm_sm90_fp8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + std::optional const& bias); + +void cutlass_scaled_mm_sm90_int8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + std::optional const& bias); + +void cutlass_scaled_mm_azp_sm90_int8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + std::optional const& azp, + std::optional const& bias); + +void cutlass_scaled_mm_blockwise_sm90_fp8(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales); + +void cutlass_scaled_mm_sm100_fp8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + 
torch::Tensor const& b_scales, + std::optional const& bias); + +} // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu new file mode 100644 index 0000000000000..cf2cccc913f62 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu @@ -0,0 +1,24 @@ +#include "scaled_mm_kernels.hpp" +#include "scaled_mm_sm100_fp8_dispatch.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" + +namespace vllm { + +void cutlass_scaled_mm_sm100_fp8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + std::optional const& bias) { + TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); + if (bias) { + TORCH_CHECK(bias->dtype() == out.dtype(), + "currently bias dtype must match output dtype ", out.dtype()); + return cutlass_scaled_mm_sm100_fp8_epilogue( + out, a, b, a_scales, b_scales, *bias); + } else { + return cutlass_scaled_mm_sm100_fp8_epilogue( + out, a, b, a_scales, b_scales); + } +} + +} // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh new file mode 100644 index 0000000000000..468b77d9593bc --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh @@ -0,0 +1,67 @@ +#pragma once + +#include "scaled_mm.cuh" +#include "cutlass_gemm_caller.cuh" + +/** + * This file defines Gemm kernel configurations for SM100 (fp8) based on the + * Gemm shape. + */ + +namespace vllm { + +using c3x::cutlass_gemm_caller; + +template typename Epilogue> +struct sm100_fp8_config_default { + static_assert(std::is_same()); + using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; + using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; + using TileShape = Shape<_256, _128, _64>; + using ClusterShape = Shape<_2, _2, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm_sm100; +}; + +template typename Epilogue, + typename... EpilogueArgs> +inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... args) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn); + + using Cutlass3xGemmDefault = + typename sm100_fp8_config_default::Cutlass3xGemm; + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); +} + +template