Merge remote-tracking branch 'origin/main' into lwilkinson/attn-slicing

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
This commit is contained in:
Lucas Wilkinson 2025-07-01 03:00:07 +00:00 committed by Sage Moore
commit f7a3ee0ea1
493 changed files with 27317 additions and 7922 deletions

View File

@ -16,7 +16,7 @@ Please download the visualization scripts in the post
- Download `nightly-benchmarks.zip`. - Download `nightly-benchmarks.zip`.
- In the same folder, run the following code: - In the same folder, run the following code:
```console ```bash
export HF_TOKEN=<your HF token> export HF_TOKEN=<your HF token>
apt update apt update
apt install -y git apt install -y git

View File

@ -102,6 +102,7 @@ steps:
commands: commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
@ -117,6 +118,7 @@ steps:
commands: commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)" - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"

View File

@ -54,10 +54,11 @@ docker run --rm -it --device=/dev/neuron0 --network bridge \
--name "${container_name}" \ --name "${container_name}" \
${image_name} \ ${image_name} \
/bin/bash -c " /bin/bash -c "
set -e; # Exit on first error
python3 /workspace/vllm/examples/offline_inference/neuron.py; python3 /workspace/vllm/examples/offline_inference/neuron.py;
python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys; python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
for f in /workspace/vllm/tests/neuron/2_core/*.py; do for f in /workspace/vllm/tests/neuron/2_core/*.py; do
echo 'Running test file: '$f; echo \"Running test file: \$f\";
python3 -m pytest \$f -v --capture=tee-sys; python3 -m pytest \$f -v --capture=tee-sys;
done done
" "

View File

@ -159,6 +159,8 @@ run_and_track_test 14 "test_tpu_qkv_linear.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py" "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
run_and_track_test 15 "test_spmd_model_weight_loading.py" \ run_and_track_test 15 "test_spmd_model_weight_loading.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py" "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
run_and_track_test 16 "test_kv_cache_update_kernel.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
# After all tests have been attempted, exit with the overall status. # After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then if [ "$overall_script_exit_code" -ne 0 ]; then

View File

@ -28,4 +28,5 @@ docker run \
sh -c ' sh -c '
VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2 VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
' '

View File

@ -4,8 +4,8 @@ CONTAINER_NAME=vllm-tpu
# vllm config # vllm config
MODEL=meta-llama/Llama-3.1-8B-Instruct MODEL=meta-llama/Llama-3.1-8B-Instruct
MAX_NUM_SEQS=512 MAX_NUM_SEQS=256
MAX_NUM_BATCHED_TOKENS=512 MAX_NUM_BATCHED_TOKENS=1024
TENSOR_PARALLEL_SIZE=1 TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048 MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist DOWNLOAD_DIR=/mnt/disks/persist

View File

@ -68,7 +68,7 @@ docker run \
echo "run script..." echo "run script..."
echo echo
docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh" docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/tpu/run_bm.sh"
echo "copy result back..." echo "copy result back..."
VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt

View File

@ -41,6 +41,16 @@ steps:
# TODO: add `--strict` once warnings in docstrings are fixed # TODO: add `--strict` once warnings in docstrings are fixed
- mkdocs build - mkdocs build
- label: Pytorch Nightly Dependency Override Check # 2min
# if this test fails, it means the nightly torch version is not compatible with some
# of the dependencies. Please check the error message and add the package to the whitelist
# in /vllm/tools/generate_nightly_torch_test.py
soft_fail: true
source_file_dependencies:
- requirements/nightly_torch_test.txt
commands:
- bash standalone_tests/pytorch_nightly_dependency.sh
- label: Async Engine, Inputs, Utils, Worker Test # 24min - label: Async Engine, Inputs, Utils, Worker Test # 24min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
@ -89,7 +99,7 @@ steps:
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Chunked Prefill Test - label: Chunked Prefill Test
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/basic_correctness/test_chunked_prefill - tests/basic_correctness/test_chunked_prefill
@ -168,6 +178,23 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd - popd
- label: EPLB Algorithm Test
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/distributed/eplb
- tests/distributed/test_eplb_algo.py
commands:
- pytest -v -s distributed/test_eplb_algo.py
- label: EPLB Execution Test # 5min
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/eplb
- tests/distributed/test_eplb_execute.py
commands:
- pytest -v -s distributed/test_eplb_execute.py
- label: Metrics, Tracing Test # 10min - label: Metrics, Tracing Test # 10min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction]
num_gpus: 2 num_gpus: 2
@ -271,6 +298,15 @@ steps:
commands: commands:
- pytest -v -s prefix_caching - pytest -v -s prefix_caching
- label: Platform Tests (CUDA)
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/cuda
commands:
- pytest -v -s cuda/test_cuda_context.py
- label: Samplers Test # 36min - label: Samplers Test # 36min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
@ -500,6 +536,17 @@ steps:
- pip freeze | grep -E 'torch' - pip freeze | grep -E 'torch'
- pytest -v -s models/language -m core_model - pytest -v -s models/language -m core_model
- label: Language Models Test (Hybrid) # 35 min
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/language/generation
commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pytest -v -s models/language/generation -m hybrid_model
- label: Language Models Test (Extended Generation) # 1hr20min - label: Language Models Test (Extended Generation) # 1hr20min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
optional: true optional: true
@ -509,7 +556,7 @@ steps:
commands: commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pytest -v -s models/language/generation -m 'not core_model' - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
- label: Language Models Test (Extended Pooling) # 36min - label: Language Models Test (Extended Pooling) # 36min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
@ -606,13 +653,18 @@ steps:
- vllm/executor/ - vllm/executor/
- vllm/model_executor/models/ - vllm/model_executor/models/
- tests/distributed/ - tests/distributed/
- tests/examples/offline_inference/data_parallel.py
commands: commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 40min - label: Distributed Tests (2 GPUs) # 40min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
@ -736,7 +788,7 @@ steps:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
- label: Weight Loading Multiple GPU Test - Large Models # optional - label: Weight Loading Multiple GPU Test - Large Models # optional
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
gpu: a100 gpu: a100

4
.github/CODEOWNERS vendored
View File

@ -18,6 +18,10 @@
/vllm/entrypoints @aarnphm /vllm/entrypoints @aarnphm
CMakeLists.txt @tlrmchlsmth CMakeLists.txt @tlrmchlsmth
# Any change to the VllmConfig can have a large user-facing impact,
# so spam a lot of people
/vllm/config.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor
# vLLM V1 # vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/structured_output @mgoin @russellb @aarnphm /vllm/v1/structured_output @mgoin @russellb @aarnphm

28
.github/mergify.yml vendored
View File

@ -45,6 +45,7 @@ pull_request_rules:
- files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
- files~=^vllm/model_executor/models/.*llama.*\.py - files~=^vllm/model_executor/models/.*llama.*\.py
- files~=^vllm/transformers_utils/configs/.*llama.*\.py - files~=^vllm/transformers_utils/configs/.*llama.*\.py
- title~=(?i)llama
actions: actions:
label: label:
add: add:
@ -65,6 +66,33 @@ pull_request_rules:
add: add:
- multi-modality - multi-modality
- name: label-performance
description: Automatically apply performance label
conditions:
- or:
- files~=^benchmarks/
- files~=^vllm/benchmarks/
- files~=^tests/benchmarks/
- files~=^\.buildkite/nightly-benchmarks/
actions:
label:
add:
- performance
- name: label-qwen
description: Automatically apply qwen label
conditions:
- or:
- files~=^examples/.*qwen.*\.py
- files~=^tests/.*qwen.*\.py
- files~=^vllm/model_executor/models/.*qwen.*\.py
- files~=^vllm/reasoning/.*qwen.*\.py
- title~=(?i)Qwen
actions:
label:
add:
- qwen
- name: label-rocm - name: label-rocm
description: Automatically apply rocm label description: Automatically apply rocm label
conditions: conditions:

View File

@ -53,6 +53,11 @@ repos:
files: ^requirements/test\.(in|txt)$ files: ^requirements/test\.(in|txt)$
- repo: local - repo: local
hooks: hooks:
- id: format-torch-nightly-test
name: reformat nightly_torch_test.txt to be in sync with test.in
language: python
entry: python tools/generate_nightly_torch_test.py
files: ^requirements/test\.(in|txt)$
- id: mypy-local - id: mypy-local
name: Run mypy for local Python installation name: Run mypy for local Python installation
entry: tools/mypy.sh 0 "local" entry: tools/mypy.sh 0 "local"
@ -115,6 +120,11 @@ repos:
entry: python tools/check_spdx_header.py entry: python tools/check_spdx_header.py
language: python language: python
types: [python] types: [python]
- id: check-root-lazy-imports
name: Check root lazy imports
entry: python tools/check_init_lazy_imports.py
language: python
types: [python]
- id: check-filenames - id: check-filenames
name: Check for spaces in all filenames name: Check for spaces in all filenames
entry: bash entry: bash

View File

@ -420,9 +420,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
endif() endif()
# The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
# CUDA 12.8 or later # require CUDA 12.8 or later
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}") cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
set(SRCS set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
@ -513,6 +513,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
CUDA_ARCHS "${FP4_ARCHS}") CUDA_ARCHS "${FP4_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}") list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1") list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}") message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
else() else()
message(STATUS "Not building NVFP4 as no compatible archs were found.") message(STATUS "Not building NVFP4 as no compatible archs were found.")
@ -547,8 +548,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# if it's possible to compile MoE kernels that use its output. # if it's possible to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}") cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu" set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu")
"csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}") CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -562,7 +562,27 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"if you intend on running FP8 quantized MoE models on Hopper.") "if you intend on running FP8 quantized MoE models on Hopper.")
else() else()
message(STATUS "Not building grouped_mm_c3x as no compatible archs found " message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
"in CUDA target architectures") "in CUDA target architectures.")
endif()
endif()
# moe_data.cu is used by all CUTLASS MoE kernels.
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
message(STATUS "Building moe_data for archs: ${CUTLASS_MOE_DATA_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
message(STATUS "Not building moe_data as CUDA Compiler version is "
"not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
"if you intend on running FP8 quantized MoE models on Hopper or Blackwell.")
else()
message(STATUS "Not building moe_data as no compatible archs found "
"in CUDA target architectures.")
endif() endif()
endif() endif()
@ -638,6 +658,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# if CUDA endif # if CUDA endif
endif() endif()
if (VLLM_GPU_LANG STREQUAL "HIP")
# Add QuickReduce kernels
list(APPEND VLLM_EXT_SRC
"csrc/custom_quickreduce.cu"
)
# if ROCM endif
endif()
message(STATUS "Enabling C extension.") message(STATUS "Enabling C extension.")
define_gpu_extension_target( define_gpu_extension_target(
_C _C

View File

@ -154,11 +154,13 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
## Contact Us ## Contact Us
<!-- --8<-- [start:contact-us] -->
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions) - For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai) - For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
- coordinating contributions and development, please use [Slack](https://slack.vllm.ai) - For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature - For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu) - For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
<!-- --8<-- [end:contact-us] -->
## Media Kit ## Media Kit

View File

@ -4,7 +4,7 @@ This README guides you through running benchmark tests with the extensive
datasets supported on vLLM. It's a living document, updated as new features and datasets datasets supported on vLLM. It's a living document, updated as new features and datasets
become available. become available.
## Dataset Overview **Dataset Overview**
<table style="width:100%; border-collapse: collapse;"> <table style="width:100%; border-collapse: collapse;">
<thead> <thead>
@ -82,7 +82,10 @@ become available.
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf` **Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
--- ---
## Example - Online Benchmark <details>
<summary><b>🚀 Example - Online Benchmark</b></summary>
<br/>
First start serving your model First start serving your model
@ -130,7 +133,8 @@ P99 ITL (ms): 8.39
================================================== ==================================================
``` ```
### Custom Dataset **Custom Dataset**
If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
``` ```
@ -162,7 +166,7 @@ python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detaile
You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`. You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
### VisionArena Benchmark for Vision Language Models **VisionArena Benchmark for Vision Language Models**
```bash ```bash
# need a model with vision capability here # need a model with vision capability here
@ -180,7 +184,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
--num-prompts 1000 --num-prompts 1000
``` ```
### InstructCoder Benchmark with Speculative Decoding **InstructCoder Benchmark with Speculative Decoding**
``` bash ``` bash
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
@ -197,7 +201,7 @@ python3 benchmarks/benchmark_serving.py \
--num-prompts 2048 --num-prompts 2048
``` ```
### Other HuggingFaceDataset Examples **Other HuggingFaceDataset Examples**
```bash ```bash
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
@ -251,7 +255,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
--num-prompts 80 --num-prompts 80
``` ```
### Running With Sampling Parameters **Running With Sampling Parameters**
When using OpenAI-compatible backends such as `vllm`, optional sampling When using OpenAI-compatible backends such as `vllm`, optional sampling
parameters can be specified. Example client command: parameters can be specified. Example client command:
@ -269,8 +273,27 @@ python3 vllm/benchmarks/benchmark_serving.py \
--num-prompts 10 --num-prompts 10
``` ```
--- **Running With Ramp-Up Request Rate**
## Example - Offline Throughput Benchmark
The benchmark tool also supports ramping up the request rate over the
duration of the benchmark run. This can be useful for stress testing the
server or finding the maximum throughput that it can handle, given some latency budget.
Two ramp-up strategies are supported:
- `linear`: Increases the request rate linearly from a start value to an end value.
- `exponential`: Increases the request rate exponentially.
The following arguments can be used to control the ramp-up:
- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
</details>
<details>
<summary><b>📈 Example - Offline Throughput Benchmark</b></summary>
<br/>
```bash ```bash
python3 vllm/benchmarks/benchmark_throughput.py \ python3 vllm/benchmarks/benchmark_throughput.py \
@ -288,7 +311,7 @@ Total num prompt tokens: 5014
Total num output tokens: 1500 Total num output tokens: 1500
``` ```
### VisionArena Benchmark for Vision Language Models **VisionArena Benchmark for Vision Language Models**
``` bash ``` bash
python3 vllm/benchmarks/benchmark_throughput.py \ python3 vllm/benchmarks/benchmark_throughput.py \
@ -308,7 +331,7 @@ Total num prompt tokens: 14527
Total num output tokens: 1280 Total num output tokens: 1280
``` ```
### InstructCoder Benchmark with Speculative Decoding **InstructCoder Benchmark with Speculative Decoding**
``` bash ``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_WORKER_MULTIPROC_METHOD=spawn \
@ -332,7 +355,7 @@ Total num prompt tokens: 261136
Total num output tokens: 204800 Total num output tokens: 204800
``` ```
### Other HuggingFaceDataset Examples **Other HuggingFaceDataset Examples**
**`lmms-lab/LLaVA-OneVision-Data`** **`lmms-lab/LLaVA-OneVision-Data`**
@ -371,7 +394,7 @@ python3 benchmarks/benchmark_throughput.py \
--num-prompts 10 --num-prompts 10
``` ```
### Benchmark with LoRA Adapters **Benchmark with LoRA Adapters**
``` bash ``` bash
# download dataset # download dataset
@ -387,3 +410,196 @@ python3 vllm/benchmarks/benchmark_throughput.py \
--enable-lora \ --enable-lora \
--lora-path yard1/llama-2-7b-sql-lora-test --lora-path yard1/llama-2-7b-sql-lora-test
``` ```
</details>
<details>
<summary><b>🛠️ Example - Structured Output Benchmark</b></summary>
<br/>
Benchmark the performance of structured output generation (JSON, grammar, regex).
**Server Setup**
```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
```
**JSON Schema Benchmark**
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset json \
--structured-output-ratio 1.0 \
--request-rate 10 \
--num-prompts 1000
```
**Grammar-based Generation Benchmark**
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset grammar \
--structure-type grammar \
--request-rate 10 \
--num-prompts 1000
```
**Regex-based Generation Benchmark**
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset regex \
--request-rate 10 \
--num-prompts 1000
```
**Choice-based Generation Benchmark**
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset choice \
--request-rate 10 \
--num-prompts 1000
```
**XGrammar Benchmark Dataset**
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset xgrammar_bench \
--request-rate 10 \
--num-prompts 1000
```
</details>
<details>
<summary><b>📚 Example - Long Document QA Benchmark</b></summary>
<br/>
Benchmark the performance of long document question-answering with prefix caching.
**Basic Long Document QA Test**
```bash
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 16 \
--document-length 2000 \
--output-len 50 \
--repeat-count 5
```
**Different Repeat Modes**
```bash
# Random mode (default) - shuffle prompts randomly
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode random
# Tile mode - repeat entire prompt list in sequence
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode tile
# Interleave mode - repeat each prompt consecutively
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode interleave
```
</details>
<details>
<summary><b>🗂️ Example - Prefix Caching Benchmark</b></summary>
<br/>
Benchmark the efficiency of automatic prefix caching.
**Fixed Prompt with Prefix Caching**
```bash
python3 benchmarks/benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-prompts 1 \
--repeat-count 100 \
--input-length-range 128:256
```
**ShareGPT Dataset with Prefix Caching**
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 benchmarks/benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
--enable-prefix-caching \
--num-prompts 20 \
--repeat-count 5 \
--input-length-range 128:256
```
</details>
<details>
<summary><b>⚡ Example - Request Prioritization Benchmark</b></summary>
<br/>
Benchmark the performance of request prioritization in vLLM.
**Basic Prioritization Test**
```bash
python3 benchmarks/benchmark_prioritization.py \
--model meta-llama/Llama-2-7b-chat-hf \
--input-len 128 \
--output-len 64 \
--num-prompts 100 \
--scheduling-policy priority
```
**Multiple Sequences per Prompt**
```bash
python3 benchmarks/benchmark_prioritization.py \
--model meta-llama/Llama-2-7b-chat-hf \
--input-len 128 \
--output-len 64 \
--num-prompts 100 \
--scheduling-policy priority \
--n 2
```
</details>

View File

@ -10,6 +10,7 @@
# 3. Set variables (ALL REQUIRED) # 3. Set variables (ALL REQUIRED)
# BASE: your directory for vllm repo # BASE: your directory for vllm repo
# MODEL: the model served by vllm # MODEL: the model served by vllm
# SYSTEM: the hardware, either TPU or GPU; on other systems, "get best profile" might not be supported.
# TP: ways of tensor parallelism # TP: ways of tensor parallelism
# DOWNLOAD_DIR: directory to download and load model weights. # DOWNLOAD_DIR: directory to download and load model weights.
# INPUT_LEN: request input len # INPUT_LEN: request input len
@ -34,6 +35,7 @@
TAG=$(date +"%Y_%m_%d_%H_%M") TAG=$(date +"%Y_%m_%d_%H_%M")
BASE="" BASE=""
MODEL="meta-llama/Llama-3.1-8B-Instruct" MODEL="meta-llama/Llama-3.1-8B-Instruct"
SYSTEM="TPU"
TP=1 TP=1
DOWNLOAD_DIR="" DOWNLOAD_DIR=""
INPUT_LEN=4000 INPUT_LEN=4000
@ -45,12 +47,15 @@ NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
LOG_FOLDER="$BASE/auto-benchmark/$TAG" LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt" RESULT="$LOG_FOLDER/result.txt"
PROFILE_PATH="$LOG_FOLDER/profile"
echo "result file: $RESULT" echo "result file: $RESULT"
echo "model: $MODEL" echo "model: $MODEL"
rm -rf $LOG_FOLDER rm -rf $LOG_FOLDER
rm -rf $PROFILE_PATH
mkdir -p $LOG_FOLDER mkdir -p $LOG_FOLDER
mkdir -p $PROFILE_PATH
cd "$BASE/vllm" cd "$BASE/vllm"
@ -70,10 +75,11 @@ start_server() {
local max_num_seqs=$2 local max_num_seqs=$2
local max_num_batched_tokens=$3 local max_num_batched_tokens=$3
local vllm_log=$4 local vllm_log=$4
local profile_dir=$5
pkill -f vllm pkill -f vllm
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \ VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
--disable-log-requests \ --disable-log-requests \
--port 8004 \ --port 8004 \
--gpu-memory-utilization $gpu_memory_utilization \ --gpu-memory-utilization $gpu_memory_utilization \
@ -105,19 +111,37 @@ start_server() {
fi fi
} }
# Copy the profile selected for the current best run into $PROFILE_PATH.
#   $1: directory whose direct children are the captured profile entries
#   $2: index of the wanted entry in the name-sorted listing of $1
# Reads the globals SYSTEM ("TPU" or "GPU") and PROFILE_PATH.
# Returns non-zero, leaving $PROFILE_PATH untouched, when no entry can be
# selected (unsupported SYSTEM or index out of range).
update_best_profile() {
    local profile_dir=$1
    local profile_index=$2
    # Keep helper state local so it does not leak into the rest of the script.
    local -a sorted_paths
    local selected_entry=
    local selected_profile_file=
    sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort))
    selected_entry="${sorted_paths[$profile_index]}"
    if [[ "$SYSTEM" == "TPU" ]]; then
        # The glob is expanded later, by the unquoted use in `cp`.
        selected_profile_file="$selected_entry/*.xplane.pb"
    fi
    if [[ "$SYSTEM" == "GPU" ]]; then
        selected_profile_file="$selected_entry"
    fi
    # Bail out before deleting anything: otherwise the previously saved best
    # profile would be lost with nothing to replace it.
    if [[ -z "$selected_entry" || -z "$selected_profile_file" ]]; then
        echo "no profile at index $profile_index in $profile_dir (SYSTEM=$SYSTEM); keeping previous best profile" >&2
        return 1
    fi
    rm -f $PROFILE_PATH/*
    # Intentionally unquoted so the TPU glob pattern expands to the actual files.
    cp $selected_profile_file $PROFILE_PATH
}
run_benchmark() { run_benchmark() {
local max_num_seqs=$1 local max_num_seqs=$1
local max_num_batched_tokens=$2 local max_num_batched_tokens=$2
local gpu_memory_utilization=$3 local gpu_memory_utilization=$3
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt" local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}"
echo "vllm_log: $vllm_log" echo "vllm_log: $vllm_log"
echo echo
rm -f $vllm_log rm -f $vllm_log
mkdir -p $profile_dir
pkill -f vllm pkill -f vllm
local profile_index=0
echo "starting server..." echo "starting server..."
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir
result=$? result=$?
if [[ "$result" -eq 1 ]]; then if [[ "$result" -eq 1 ]]; then
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
@ -144,7 +168,8 @@ run_benchmark() {
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 1000 \ --num-prompts 1000 \
--random-prefix-len $prefix_len \ --random-prefix-len $prefix_len \
--port 8004 &> "$bm_log" --port 8004 \
--profile &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
@ -158,6 +183,7 @@ run_benchmark() {
# start from request-rate as int(throughput) + 1 # start from request-rate as int(throughput) + 1
request_rate=$((${throughput%.*} + 1)) request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do while ((request_rate > 0)); do
profile_index=$((profile_index+1))
# clear prefix cache # clear prefix cache
curl -X POST http://0.0.0.0:8004/reset_prefix_cache curl -X POST http://0.0.0.0:8004/reset_prefix_cache
sleep 5 sleep 5
@ -195,6 +221,12 @@ run_benchmark() {
best_max_num_seqs=$max_num_seqs best_max_num_seqs=$max_num_seqs
best_num_batched_tokens=$max_num_batched_tokens best_num_batched_tokens=$max_num_batched_tokens
best_goodput=$goodput best_goodput=$goodput
if [[ "$SYSTEM" == "TPU" ]]; then
update_best_profile "$profile_dir/plugins/profile" $profile_index
fi
if [[ "$SYSTEM" == "GPU" ]]; then
update_best_profile "$profile_dir" $profile_index
fi
fi fi
else else
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
@ -239,6 +271,6 @@ for num_seqs in "${num_seqs_list[@]}"; do
done done
done done
echo "finish permutations" echo "finish permutations"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT" echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"

View File

@ -404,8 +404,14 @@ async def async_request_openai_chat_completions(
chunk_bytes = chunk_bytes.strip() chunk_bytes = chunk_bytes.strip()
if not chunk_bytes: if not chunk_bytes:
continue continue
chunk_bytes = chunk_bytes.decode("utf-8")
# NOTE: SSE comments (often used as pings) start with a colon.
# These are not JSON data payload and should be skipped.
if chunk_bytes.startswith(":"):
continue
chunk = chunk_bytes.removeprefix("data: ")
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk != "[DONE]": if chunk != "[DONE]":
timestamp = time.perf_counter() timestamp = time.perf_counter()
data = json.loads(chunk) data = json.loads(chunk)

View File

@ -349,11 +349,12 @@ class RandomDataset(BenchmarkDataset):
# [1650, 939, 486] -> ['Ġcall', 'sh', 'ere'] # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
# To avoid uncontrolled change of the prompt length, # To avoid uncontrolled change of the prompt length,
# the encoded sequence is truncated before being decode again. # the encoded sequence is truncated before being decode again.
total_input_len = prefix_len + int(input_lens[i])
re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[ re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
: input_lens[i] :total_input_len
] ]
prompt = tokenizer.decode(re_encoded_sequence) prompt = tokenizer.decode(re_encoded_sequence)
total_input_len = prefix_len + int(input_lens[i]) total_input_len = len(re_encoded_sequence)
requests.append( requests.append(
SampleRequest( SampleRequest(
prompt=prompt, prompt=prompt,

View File

@ -33,7 +33,7 @@ import warnings
from collections.abc import AsyncGenerator, Iterable from collections.abc import AsyncGenerator, Iterable
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime from datetime import datetime
from typing import Any, Optional from typing import Any, Literal, Optional
import numpy as np import numpy as np
from tqdm.asyncio import tqdm from tqdm.asyncio import tqdm
@ -107,14 +107,42 @@ class BenchmarkMetrics:
percentiles_e2el_ms: list[tuple[float, float]] percentiles_e2el_ms: list[tuple[float, float]]
def _get_current_request_rate(
ramp_up_strategy: Optional[Literal["linear", "exponential"]],
ramp_up_start_rps: Optional[int],
ramp_up_end_rps: Optional[int],
request_index: int,
total_requests: int,
request_rate: float,
) -> float:
if (
ramp_up_strategy
and ramp_up_start_rps is not None
and ramp_up_end_rps is not None
):
progress = request_index / max(total_requests - 1, 1)
if ramp_up_strategy == "linear":
increase = (ramp_up_end_rps - ramp_up_start_rps) * progress
return ramp_up_start_rps + increase
elif ramp_up_strategy == "exponential":
ratio = ramp_up_end_rps / ramp_up_start_rps
return ramp_up_start_rps * (ratio**progress)
else:
raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}")
return request_rate
async def get_request( async def get_request(
input_requests: list[SampleRequest], input_requests: list[SampleRequest],
request_rate: float, request_rate: float,
burstiness: float = 1.0, burstiness: float = 1.0,
) -> AsyncGenerator[SampleRequest, None]: ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
ramp_up_start_rps: Optional[int] = None,
ramp_up_end_rps: Optional[int] = None,
) -> AsyncGenerator[tuple[SampleRequest, float], None]:
""" """
Asynchronously generates requests at a specified rate Asynchronously generates requests at a specified rate
with OPTIONAL burstiness. with OPTIONAL burstiness and OPTIONAL ramp-up strategy.
Args: Args:
input_requests: input_requests:
@ -129,22 +157,44 @@ async def get_request(
A lower burstiness value (0 < burstiness < 1) results A lower burstiness value (0 < burstiness < 1) results
in more bursty requests, while a higher burstiness value in more bursty requests, while a higher burstiness value
(burstiness > 1) results in a more uniform arrival of requests. (burstiness > 1) results in a more uniform arrival of requests.
ramp_up_strategy (optional):
The ramp-up strategy. Can be "linear" or "exponential".
If None, uses constant request rate (specified by request_rate).
ramp_up_start_rps (optional):
The starting request rate for ramp-up.
ramp_up_end_rps (optional):
The ending request rate for ramp-up.
""" """
input_requests: Iterable[SampleRequest] = iter(input_requests)
# Calculate scale parameter theta to maintain the desired request_rate.
assert burstiness > 0, ( assert burstiness > 0, (
f"A positive burstiness factor is expected, but given {burstiness}." f"A positive burstiness factor is expected, but given {burstiness}."
) )
theta = 1.0 / (request_rate * burstiness) # Convert to list to get length for ramp-up calculations
if isinstance(input_requests, Iterable) and not isinstance(input_requests, list):
input_requests = list(input_requests)
total_requests = len(input_requests)
request_index = 0
for request in input_requests: for request in input_requests:
yield request current_request_rate = _get_current_request_rate(
ramp_up_strategy,
ramp_up_start_rps,
ramp_up_end_rps,
request_index,
total_requests,
request_rate,
)
if request_rate == float("inf"): yield request, current_request_rate
request_index += 1
if current_request_rate == float("inf"):
# If the request rate is infinity, then we don't need to wait. # If the request rate is infinity, then we don't need to wait.
continue continue
theta = 1.0 / (current_request_rate * burstiness)
# Sample the request interval from the gamma distribution. # Sample the request interval from the gamma distribution.
# If burstiness is 1, it follows exponential distribution. # If burstiness is 1, it follows exponential distribution.
interval = np.random.gamma(shape=burstiness, scale=theta) interval = np.random.gamma(shape=burstiness, scale=theta)
@ -290,6 +340,9 @@ async def benchmark(
max_concurrency: Optional[int], max_concurrency: Optional[int],
lora_modules: Optional[Iterable[str]], lora_modules: Optional[Iterable[str]],
extra_body: Optional[dict], extra_body: Optional[dict],
ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
ramp_up_start_rps: Optional[int] = None,
ramp_up_end_rps: Optional[int] = None,
): ):
if backend in ASYNC_REQUEST_FUNCS: if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend] request_func = ASYNC_REQUEST_FUNCS[backend]
@ -353,7 +406,15 @@ async def benchmark(
distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution" distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"
print(f"Traffic request rate: {request_rate}") if ramp_up_strategy is not None:
print(
f"Traffic ramp-up strategy: {ramp_up_strategy}. Will increase "
f"RPS from {ramp_up_start_rps} to {ramp_up_end_rps} RPS over "
"the duration of the benchmark."
)
else:
print(f"Traffic request rate: {request_rate} RPS.")
print(f"Burstiness factor: {burstiness} ({distribution})") print(f"Burstiness factor: {burstiness} ({distribution})")
print(f"Maximum request concurrency: {max_concurrency}") print(f"Maximum request concurrency: {max_concurrency}")
@ -373,7 +434,34 @@ async def benchmark(
benchmark_start_time = time.perf_counter() benchmark_start_time = time.perf_counter()
tasks: list[asyncio.Task] = [] tasks: list[asyncio.Task] = []
async for request in get_request(input_requests, request_rate, burstiness):
rps_change_events = []
last_int_rps = -1
if ramp_up_strategy is not None and ramp_up_start_rps is not None:
last_int_rps = ramp_up_start_rps
rps_change_events.append(
{
"rps": last_int_rps,
"timestamp": datetime.now().isoformat(),
}
)
async for request, current_request_rate in get_request(
input_requests,
request_rate,
burstiness,
ramp_up_strategy,
ramp_up_start_rps,
ramp_up_end_rps,
):
if ramp_up_strategy is not None:
current_int_rps = int(current_request_rate)
if current_int_rps > last_int_rps:
timestamp = datetime.now().isoformat()
for rps_val in range(last_int_rps + 1, current_int_rps + 1):
rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
last_int_rps = current_int_rps
prompt, prompt_len, output_len, mm_content = ( prompt, prompt_len, output_len, mm_content = (
request.prompt, request.prompt,
request.prompt_len, request.prompt_len,
@ -397,11 +485,8 @@ async def benchmark(
ignore_eos=ignore_eos, ignore_eos=ignore_eos,
extra_body=extra_body, extra_body=extra_body,
) )
tasks.append( task = limited_request_func(request_func_input=request_func_input, pbar=pbar)
asyncio.create_task( tasks.append(asyncio.create_task(task))
limited_request_func(request_func_input=request_func_input, pbar=pbar)
)
)
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
if profile: if profile:
@ -477,6 +562,9 @@ async def benchmark(
"errors": [output.error for output in outputs], "errors": [output.error for output in outputs],
} }
if rps_change_events:
result["rps_change_events"] = rps_change_events
def process_one_metric( def process_one_metric(
# E.g., "ttft" # E.g., "ttft"
metric_attribute_name: str, metric_attribute_name: str,
@ -610,6 +698,26 @@ def main(args: argparse.Namespace):
tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
tokenizer_mode = args.tokenizer_mode tokenizer_mode = args.tokenizer_mode
# Validate ramp-up arguments
if args.ramp_up_strategy is not None:
if args.request_rate != float("inf"):
raise ValueError(
"When using ramp-up, do not specify --request-rate. "
"The request rate will be controlled by ramp-up parameters. "
"Please remove the --request-rate argument."
)
if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None:
raise ValueError(
"When using --ramp-up-strategy, both --ramp-up-start-rps and "
"--ramp-up-end-rps must be specified"
)
if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0:
raise ValueError("Ramp-up start and end RPS must be non-negative")
if args.ramp_up_start_rps > args.ramp_up_end_rps:
raise ValueError("Ramp-up start RPS must be less than end RPS")
if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0:
raise ValueError("For exponential ramp-up, the start RPS cannot be 0.")
if args.base_url is not None: if args.base_url is not None:
api_url = f"{args.base_url}{args.endpoint}" api_url = f"{args.base_url}{args.endpoint}"
base_url = f"{args.base_url}" base_url = f"{args.base_url}"
@ -802,6 +910,9 @@ def main(args: argparse.Namespace):
max_concurrency=args.max_concurrency, max_concurrency=args.max_concurrency,
lora_modules=args.lora_modules, lora_modules=args.lora_modules,
extra_body=sampling_params, extra_body=sampling_params,
ramp_up_strategy=args.ramp_up_strategy,
ramp_up_start_rps=args.ramp_up_start_rps,
ramp_up_end_rps=args.ramp_up_end_rps,
) )
) )
@ -834,6 +945,11 @@ def main(args: argparse.Namespace):
result_json["burstiness"] = args.burstiness result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency result_json["max_concurrency"] = args.max_concurrency
if args.ramp_up_strategy is not None:
result_json["ramp_up_strategy"] = args.ramp_up_strategy
result_json["ramp_up_start_rps"] = args.ramp_up_start_rps
result_json["ramp_up_end_rps"] = args.ramp_up_end_rps
# Merge with benchmark result # Merge with benchmark result
result_json = {**result_json, **benchmark_result} result_json = {**result_json, **benchmark_result}
@ -859,7 +975,10 @@ def main(args: argparse.Namespace):
if args.max_concurrency is not None if args.max_concurrency is not None
else "" else ""
) )
file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa if args.ramp_up_strategy is not None:
file_name = f"{backend}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa
else:
file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa
if args.result_filename: if args.result_filename:
file_name = args.result_filename file_name = args.result_filename
if args.result_dir: if args.result_dir:
@ -1225,6 +1344,31 @@ def create_argument_parser():
"script chooses a LoRA module at random.", "script chooses a LoRA module at random.",
) )
parser.add_argument(
"--ramp-up-strategy",
type=str,
default=None,
choices=["linear", "exponential"],
help="The ramp-up strategy. This would be used to "
"ramp up the request rate from initial RPS to final "
"RPS rate (specified by --ramp-up-start-rps and --ramp-up-end-rps). "
"over the duration of the benchmark.",
)
parser.add_argument(
"--ramp-up-start-rps",
type=int,
default=None,
help="The starting request rate for ramp-up (RPS). "
"Needs to be specified when --ramp-up-strategy is used.",
)
parser.add_argument(
"--ramp-up-end-rps",
type=int,
default=None,
help="The ending request rate for ramp-up (RPS). "
"Needs to be specified when --ramp-up-strategy is used.",
)
return parser return parser

View File

@ -97,7 +97,7 @@ def run_vllm(
assert lora_requests is None, "BeamSearch API does not support LoRA" assert lora_requests is None, "BeamSearch API does not support LoRA"
prompts = [request.prompt for request in requests] prompts = [request.prompt for request in requests]
# output_len should be the same for all requests. # output_len should be the same for all requests.
output_len = requests[0][2] output_len = requests[0].expected_output_len
for request in requests: for request in requests:
assert request.expected_output_len == output_len assert request.expected_output_len == output_len
start = time.perf_counter() start = time.perf_counter()

View File

@ -19,7 +19,7 @@ from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import ( from vllm.model_executor.layers.quantization.utils.fp8_utils import (
w8a8_block_fp8_matmul, w8a8_block_fp8_matmul,
) )
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser, cdiv
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
@ -117,14 +117,9 @@ def bench_fp8(
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
def ceil_div(x: int, y: int) -> int: block_scale_a = torch.rand((m, cdiv(k, 128)), device="cuda", dtype=torch.float32)
return (x + y - 1) // y
block_scale_a = torch.rand(
(m, ceil_div(k, 128)), device="cuda", dtype=torch.float32
)
block_scale_b = torch.rand( block_scale_b = torch.rand(
ceil_div(k, 128), ceil_div(n, 128), device="cuda", dtype=torch.float32 cdiv(k, 128), cdiv(n, 128), device="cuda", dtype=torch.float32
) )
block_scale_a_M_major = block_scale_a.t().contiguous().t() block_scale_a_M_major = block_scale_a.t().contiguous().t()
block_scale_b_K_major = block_scale_b.t().contiguous().t() block_scale_b_K_major = block_scale_b.t().contiguous().t()

View File

@ -22,8 +22,16 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
MARLIN_SUPPORTED_GROUP_SIZES, MARLIN_SUPPORTED_GROUP_SIZES,
query_marlin_supported_quant_types, query_marlin_supported_quant_types,
) )
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
FP4_MARLIN_SUPPORTED_GROUP_SIZES,
rand_marlin_weight_fp4_like,
)
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
marlin_quant_fp8_torch,
)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
MarlinWorkspace, MarlinWorkspace,
awq_marlin_quantize,
marlin_quantize, marlin_quantize,
) )
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import ( from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
@ -35,7 +43,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
quantize_weights, quantize_weights,
sort_weights, sort_weights,
) )
from vllm.scalar_type import ScalarType from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"] DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
@ -57,80 +65,144 @@ def bench_run(
size_n: int, size_n: int,
): ):
label = "Quant Matmul" label = "Quant Matmul"
sub_label = "{}, act={} k_full={}, q={}, g={}, MKN=({}x{}x{})".format( sub_label = "{}, act={} k_full={}, q={}, g={}, MKN=({}x{}x{})".format(
model, act_order, is_k_full, str(quant_type), group_size, size_m, size_k, size_n model, act_order, is_k_full, str(quant_type), group_size, size_m, size_k, size_n
) )
print(f"Testing: {sub_label}") print(f"Testing: {sub_label}")
a = torch.randn(size_m, size_k).to(torch.half).cuda() a = torch.randn(size_m, size_k).to(torch.half).cuda()
b = torch.rand(size_k, size_n).to(torch.half).cuda() b = torch.rand(size_k, size_n).to(torch.half).cuda()
has_zp = quant_type in [scalar_types.uint4, scalar_types.uint8]
if act_order and (group_size == -1 or group_size == size_k or has_zp):
return
if size_k % group_size != 0:
return
a_tmp = torch.zeros(size_m, size_k).to(torch.half).cuda() marlin_24_supported = (
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
# Marlin quant and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
(
marlin_w_ref,
marlin_q_w,
marlin_s,
marlin_g_idx,
marlin_sort_indices,
marlin_rand_perm,
) = marlin_quantize(b, quant_type, group_size, act_order)
# Marlin_24 quant
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = (
marlin_24_quantize(b, quant_type, group_size)
) )
repack_supported = (
marlin_zp = torch.empty(0, dtype=torch.int, device=b.device) quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
and group_size in MARLIN_SUPPORTED_GROUP_SIZES
# GPTQ quant
(w_ref, q_w, s, g_idx, rand_perm) = gptq_quantize_weights(
b, quant_type, group_size, act_order
) )
q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n) allspark_supported = (
# For act_order, sort the "weights" and "g_idx"
# so that group ids are increasing
repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
if act_order:
(q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
# Prepare
marlin_workspace = MarlinWorkspace(
size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
)
marlin_24_workspace = MarlinWorkspace(
size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL
)
marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int)
# AllSpark W8A16 quant
as_supported_case = (
quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES
and group_size == -1 and group_size == -1
and not act_order and not act_order
and is_k_full and is_k_full
) )
if as_supported_case:
properties = torch.cuda.get_device_properties(b.device.index)
sm_count = properties.multi_processor_count
sm_version = properties.major * 10 + properties.minor
supported_arch = sm_version >= 80 and sm_version < 90 def gen_marlin_params():
as_supported_case = as_supported_case and supported_arch # Marlin quant
if supported_arch: marlin_g_idx = marlin_sort_indices = marlin_zp = marlin_s2 = None
has_zp = False if quant_type == scalar_types.float4_e2m1f:
w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp) if group_size != 16 or act_order:
qw = qw.to(torch.uint8) return
marlin_w_ref, marlin_q_w, marlin_s, marlin_s2 = rand_marlin_weight_fp4_like(
qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight( b.T, group_size
qw, s, zp, has_zp
) )
CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD elif quant_type == scalar_types.float8_e4m3fn:
if group_size not in [-1, 128] or act_order:
return
marlin_w_ref, marlin_q_w, marlin_s = marlin_quant_fp8_torch(b.T, group_size)
elif group_size == 16:
return
elif has_zp:
marlin_w_ref, marlin_q_w, marlin_s, marlin_zp = awq_marlin_quantize(
b, quant_type, group_size
)
else:
marlin_w_ref, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, _ = (
marlin_quantize(b, quant_type, group_size, act_order)
)
return (
marlin_w_ref,
marlin_q_w,
marlin_s,
marlin_s2,
marlin_zp,
marlin_g_idx,
marlin_sort_indices,
)
def gen_marlin_24_params():
    """Build the Marlin-24 benchmark inputs from the enclosing scope.

    Returns the four values produced by ``marlin_24_quantize`` for ``b``,
    or four ``None`` placeholders when ``marlin_24_supported`` is false.
    """
    if not marlin_24_supported:
        return (None, None, None, None)
    w_ref_24, q_w_comp_24, meta_24, scales_24 = marlin_24_quantize(
        b, quant_type, group_size
    )
    return (w_ref_24, q_w_comp_24, meta_24, scales_24)
def gen_repack_params():
    """Build the inputs for the ``gptq_marlin_repack`` benchmark.

    Returns ``(q_w_gptq, repack_sort_indices)``; both are ``None`` when
    ``repack_supported`` is false.
    """
    if not repack_supported:
        return None, None

    _, quantized_w, _, group_idx, _ = gptq_quantize_weights(
        b, quant_type, group_size, act_order
    )
    packed_w = gptq_pack(quantized_w, quant_type.size_bits, size_k, size_n)

    # For act_order, sort the "weights" and "g_idx"
    # so that group ids are increasing
    if act_order:
        _, _, sort_indices = sort_weights(quantized_w, group_idx)
    else:
        sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
    return packed_w, sort_indices
def gen_allspark_params():
qw_reorder = s_reorder = zp_reorder = sm_count = sm_version = (
CUBLAS_M_THRESHOLD
) = None
nonlocal allspark_supported
if allspark_supported:
properties = torch.cuda.get_device_properties(b.device.index)
sm_count = properties.multi_processor_count
sm_version = properties.major * 10 + properties.minor
supported_arch = sm_version >= 80 and sm_version < 90
allspark_supported = allspark_supported and supported_arch
if supported_arch:
w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp)
qw = qw.to(torch.uint8)
qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight(
qw, s, zp, has_zp
)
CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD
return (
qw_reorder,
s_reorder,
zp_reorder,
sm_count,
sm_version,
CUBLAS_M_THRESHOLD,
)
(
marlin_w_ref,
marlin_q_w,
marlin_s,
marlin_s2,
marlin_zp,
marlin_g_idx,
marlin_sort_indices,
) = gen_marlin_params()
marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s = (
gen_marlin_24_params()
)
q_w_gptq, repack_sort_indices = gen_repack_params()
qw_reorder, s_reorder, zp_reorder, sm_count, sm_version, CUBLAS_M_THRESHOLD = (
gen_allspark_params()
)
# Prepare
marlin_workspace = MarlinWorkspace(
size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
)
marlin_24_workspace = MarlinWorkspace(
size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL
)
globals = { globals = {
# Gen params # Gen params
@ -140,15 +212,14 @@ def bench_run(
"size_n": size_n, "size_n": size_n,
"size_k": size_k, "size_k": size_k,
"a": a, "a": a,
"a_tmp": a_tmp,
# Marlin params # Marlin params
"marlin_w_ref": marlin_w_ref, "marlin_w_ref": marlin_w_ref,
"marlin_q_w": marlin_q_w, "marlin_q_w": marlin_q_w,
"marlin_s": marlin_s, "marlin_s": marlin_s,
"marlin_s2": marlin_s2,
"marlin_zp": marlin_zp, "marlin_zp": marlin_zp,
"marlin_g_idx": marlin_g_idx, "marlin_g_idx": marlin_g_idx,
"marlin_sort_indices": marlin_sort_indices, "marlin_sort_indices": marlin_sort_indices,
"marlin_rand_perm": marlin_rand_perm,
"marlin_workspace": marlin_workspace, "marlin_workspace": marlin_workspace,
"is_k_full": is_k_full, "is_k_full": is_k_full,
# Marlin_24 params # Marlin_24 params
@ -161,12 +232,12 @@ def bench_run(
"q_w_gptq": q_w_gptq, "q_w_gptq": q_w_gptq,
"repack_sort_indices": repack_sort_indices, "repack_sort_indices": repack_sort_indices,
# AllSpark W8A16 params # AllSpark W8A16 params
"qw_reorder": qw_reorder if as_supported_case else None, "qw_reorder": qw_reorder,
"s_reorder": s_reorder if as_supported_case else None, "s_reorder": s_reorder,
"zp_reorder": zp_reorder if as_supported_case else None, "zp_reorder": zp_reorder,
"sm_count": sm_count if as_supported_case else None, "sm_count": sm_count,
"sm_version": sm_version if as_supported_case else None, "sm_version": sm_version,
"CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD if as_supported_case else None, "CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD,
# Kernels # Kernels
"gptq_marlin_gemm": ops.gptq_marlin_gemm, "gptq_marlin_gemm": ops.gptq_marlin_gemm,
"gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm, "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
@ -177,7 +248,7 @@ def bench_run(
min_run_time = 1 min_run_time = 1
# Warmup pytorch # Warmup pytorch
for i in range(5): for _ in range(5):
torch.matmul(a, marlin_w_ref) torch.matmul(a, marlin_w_ref)
results.append( results.append(
@ -192,17 +263,17 @@ def bench_run(
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501 stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,
description="gptq_marlin_gemm_fp16", description="gptq_marlin_gemm",
).blocked_autorange(min_run_time=min_run_time) ).blocked_autorange(min_run_time=min_run_time)
) )
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501 stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,
@ -210,10 +281,7 @@ def bench_run(
).blocked_autorange(min_run_time=min_run_time) ).blocked_autorange(min_run_time=min_run_time)
) )
if ( if marlin_24_supported:
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
):
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501 stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501
@ -224,17 +292,18 @@ def bench_run(
).blocked_autorange(min_run_time=min_run_time) ).blocked_autorange(min_run_time=min_run_time)
) )
results.append( if repack_supported:
benchmark.Timer( results.append(
stmt="q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501 benchmark.Timer(
globals=globals, stmt="q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501
label=label, globals=globals,
sub_label=sub_label, label=label,
description="gptq_marlin_repack", sub_label=sub_label,
).blocked_autorange(min_run_time=min_run_time) description="gptq_marlin_repack",
) ).blocked_autorange(min_run_time=min_run_time)
)
if as_supported_case: if allspark_supported:
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="output = allspark_w8a16_gemm(a, qw_reorder, s_reorder, zp_reorder, size_n, group_size, sm_count, sm_version, CUBLAS_M_THRESHOLD, False, True)", # noqa: E501 stmt="output = allspark_w8a16_gemm(a, qw_reorder, s_reorder, zp_reorder, size_n, group_size, sm_count, sm_version, CUBLAS_M_THRESHOLD, False, True)", # noqa: E501
@ -250,7 +319,6 @@ def main(args):
print("Benchmarking models:") print("Benchmarking models:")
for i, model in enumerate(args.models): for i, model in enumerate(args.models):
print(f"[{i}] {model}") print(f"[{i}] {model}")
results: list[benchmark.Measurement] = [] results: list[benchmark.Measurement] = []
for model in args.models: for model in args.models:
@ -278,14 +346,17 @@ def main(args):
): ):
continue continue
for quant_type in query_marlin_supported_quant_types(False): for quant_type in query_marlin_supported_quant_types():
if ( if (
len(args.limit_num_bits) > 0 len(args.limit_num_bits) > 0
and quant_type.size_bits not in args.limit_num_bits and quant_type.size_bits not in args.limit_num_bits
): ):
continue continue
for group_size in MARLIN_SUPPORTED_GROUP_SIZES: for group_size in (
MARLIN_SUPPORTED_GROUP_SIZES
+ FP4_MARLIN_SUPPORTED_GROUP_SIZES
):
if ( if (
len(args.limit_group_size) > 0 len(args.limit_group_size) > 0
and group_size not in args.limit_group_size and group_size not in args.limit_group_size

View File

@ -0,0 +1,159 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import itertools
import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
moe_align_block_size_triton,
)
from vllm.triton_utils import triton
def get_topk_ids(
    num_tokens: int, num_experts: int, topk: int, device: str = "cuda"
) -> torch.Tensor:
    """Return random routing decisions: ``topk`` distinct experts per token.

    Each row is the first ``topk`` entries of an independent random
    permutation of ``range(num_experts)``, so expert ids never repeat within
    a row.

    Args:
        num_tokens: Number of rows (tokens) to generate.
        num_experts: Expert ids are drawn from ``[0, num_experts)``.
        topk: Number of experts selected per token.
        device: Device the tensor is created on. Defaults to ``"cuda"``.

    Returns:
        An ``int32`` tensor of shape ``(num_tokens, topk)``.

    Raises:
        ValueError: If ``num_tokens`` is negative or ``topk`` is not in
            ``[0, num_experts]``.
    """
    if num_tokens < 0:
        raise ValueError(f"num_tokens must be non-negative, got {num_tokens}")
    # Slicing a permutation with an oversized topk would silently yield rows
    # narrower than requested, so reject it up front.
    if not 0 <= topk <= num_experts:
        raise ValueError(
            f"topk must be in [0, num_experts={num_experts}], got {topk}"
        )
    if num_tokens == 0:
        # torch.stack rejects an empty list; return an explicit empty batch.
        return torch.empty((0, topk), dtype=torch.int32, device=device)
    return torch.stack(
        [
            torch.randperm(num_experts, dtype=torch.int32, device=device)[:topk]
            for _ in range(num_tokens)
        ]
    )
def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8):
    """
    Compare the vLLM and Triton moe_align_block_size implementations.

    Both implementations are run on the same random routing tensor with
    separately allocated output buffers. The per-block expert ids and the
    padded token count are then compared and the outcome is printed; the
    sorted token ids themselves are not compared. Nothing is returned.
    """
    topk_ids = get_topk_ids(num_tokens, num_experts, topk)
    total_ids = topk_ids.numel()

    # Upper bound used to size the sorted-id buffer and the block count.
    max_num_tokens_padded = total_ids + num_experts * (block_size - 1)
    max_num_blocks = max_num_tokens_padded // block_size

    def _new_outputs():
        """Allocate one (sorted_ids, expert_ids, num_tokens_post_pad) triple."""
        sorted_ids = torch.empty(
            (max_num_tokens_padded,), dtype=torch.int32, device="cuda"
        )
        sorted_ids.fill_(total_ids)  # sentinel value for unused slots
        expert_ids = torch.zeros(
            (max_num_blocks,), dtype=torch.int32, device="cuda"
        )
        num_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda")
        return sorted_ids, expert_ids, num_post_pad

    triton_outputs = _new_outputs()
    vllm_outputs = _new_outputs()

    # Run both implementations on identical inputs.
    moe_align_block_size_triton(
        topk_ids, num_experts, block_size, *triton_outputs
    )
    ops.moe_align_block_size(topk_ids, num_experts, block_size, *vllm_outputs)
    print(f"✅ VLLM implementation works with {num_experts} experts!")

    _, expert_ids_triton, num_tokens_post_pad_triton = triton_outputs
    _, expert_ids_vllm, num_tokens_post_pad_vllm = vllm_outputs

    # Compare results and report.
    if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose(
        num_tokens_post_pad_triton, num_tokens_post_pad_vllm
    ):
        print("✅ Triton and VLLM implementations match.")
        return

    print("❌ Triton and VLLM implementations DO NOT match.")
    print("Triton expert_ids:", expert_ids_triton)
    print("VLLM expert_ids:", expert_ids_vllm)
    print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton)
    print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm)
# test configurations
# Sweep axes for the benchmark; the expert counts include values that are not
# powers of two (224, 280).
num_tokens_range = [1, 16, 256, 4096]
num_experts_range = [16, 64, 224, 256, 280, 512]
topk_range = [1, 2, 8]
# Cartesian product of the three axes: one (num_tokens, num_experts, topk)
# tuple per benchmark point, passed as x_vals to the perf_report benchmark.
configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["num_tokens", "num_experts", "topk"],
        x_vals=configs,
        line_arg="provider",
        line_vals=["vllm", "triton"],
        line_names=["VLLM", "Triton"],
        plot_name="moe-align-block-size-performance",
        args={},
    )
)
def benchmark(num_tokens, num_experts, topk, provider):
    """Time one moe_align_block_size implementation for a single config.

    Args:
        num_tokens: Number of tokens in the random routing tensor.
        num_experts: Number of experts to route between.
        topk: Experts selected per token.
        provider: ``"vllm"`` for the custom op or ``"triton"`` for the Triton
            implementation.

    Returns:
        ``(median, p20, p80)`` latency scaled by 1000 from the values
        ``do_bench`` reports, in the ``(mean, min, max)`` order that
        ``perf_report`` unpacks.

    Raises:
        ValueError: If ``provider`` is not one of the supported values.
    """
    if provider == "vllm":
        align_fn = ops.moe_align_block_size
    elif provider == "triton":
        align_fn = moe_align_block_size_triton
    else:
        raise ValueError(f"Unknown provider: {provider!r}")

    block_size = 256
    topk_ids = get_topk_ids(num_tokens, num_experts, topk)

    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
    sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
    sorted_ids.fill_(topk_ids.numel())
    max_num_m_blocks = max_num_tokens_padded // block_size
    expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda")
    num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda")

    def run_once():
        # Every call gets fresh output buffers; the clones are therefore part
        # of the timed region for both providers.
        align_fn(
            topk_ids,
            num_experts,
            block_size,
            sorted_ids.clone(),
            expert_ids.clone(),
            num_tokens_post_pad.clone(),
        )

    # do_bench returns one value per quantile, in the order requested.
    quantiles = [0.5, 0.2, 0.8]
    ms, min_ms, max_ms = triton.testing.do_bench(run_once, quantiles=quantiles)

    # This is a latency metric, so the low quantile is the minimum and the
    # high quantile the maximum (the swapped order only suits throughput).
    return 1000 * ms, 1000 * min_ms, 1000 * max_ms
def _parse_cli_args() -> argparse.Namespace:
    """Parse the options that configure the correctness check."""
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--num_experts",
        type=int,
        default=64,
        choices=[8, 16, 32, 64, 128, 256],
    )
    cli.add_argument(
        "--topk",
        type=int,
        default=8,
        choices=[2, 4, 8],
        help="Top-k value for correctness check.",
    )
    return cli.parse_args()


if __name__ == "__main__":
    args = _parse_cli_args()

    # Check the two implementations against each other before timing them.
    print("Running correctness check...")
    check_correctness(
        num_tokens=1024,
        num_experts=args.num_experts,
        topk=args.topk,
    )

    # Run the full configuration sweep.
    benchmark.run(print_data=True, show_plots=True)

View File

@ -85,12 +85,6 @@ def benchmark_shape(m: int,
# === DeepGEMM Implementation === # === DeepGEMM Implementation ===
def deepgemm_gemm(): def deepgemm_gemm():
# A quantization is inside the loop as it depends on activations
# A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A)
# A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(
# A, block_size[1])
# A_scale_aligned = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
# C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm), deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm),
(B_deepgemm, B_scale_deepgemm), (B_deepgemm, B_scale_deepgemm),
C_deepgemm) C_deepgemm)
@ -98,8 +92,6 @@ def benchmark_shape(m: int,
# === vLLM Triton Implementation === # === vLLM Triton Implementation ===
def vllm_triton_gemm(): def vllm_triton_gemm():
# A quantization is inside the loop as it depends on activations
# A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
return w8a8_block_fp8_matmul(A_vllm, return w8a8_block_fp8_matmul(A_vllm,
B_vllm, B_vllm,
A_scale_vllm, A_scale_vllm,
@ -109,9 +101,6 @@ def benchmark_shape(m: int,
# === vLLM CUTLASS Implementation === # === vLLM CUTLASS Implementation ===
def vllm_cutlass_gemm(): def vllm_cutlass_gemm():
# A quantization is inside the loop as it depends on activations
# A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
# A, block_size[1], column_major_scales=True)
return ops.cutlass_scaled_mm(A_vllm_cutlass, return ops.cutlass_scaled_mm(A_vllm_cutlass,
B_vllm.T, B_vllm.T,
scale_a=A_scale_vllm_cutlass, scale_a=A_scale_vllm_cutlass,

View File

@ -38,7 +38,7 @@ else()
FetchContent_Declare( FetchContent_Declare(
vllm-flash-attn vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 763ad155a1c826f71ff318f41edb1e4e5e376ddb GIT_TAG 5f3644181c7a15345ce20bfc65af117d3601b524
GIT_PROGRESS TRUE GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types # Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@ -122,6 +122,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
"-DENABLE_FP8" "-DENABLE_FP8"
"-U__HIP_NO_HALF_CONVERSIONS__" "-U__HIP_NO_HALF_CONVERSIONS__"
"-U__HIP_NO_HALF_OPERATORS__" "-U__HIP_NO_HALF_OPERATORS__"
"-Werror=unused-variable"
"-fno-gpu-rdc") "-fno-gpu-rdc")
endif() endif()
@ -264,8 +265,8 @@ macro(set_gencode_flags_for_srcs)
endmacro() endmacro()
# #
# For the given `SRC_CUDA_ARCHS` list of gencode versions in the form # For the given `SRC_CUDA_ARCHS` list of gencode versions in the form
# `<major>.<minor>[letter]` compute the "loose intersection" with the # `<major>.<minor>[letter]` compute the "loose intersection" with the
# `TGT_CUDA_ARCHS` list of gencodes. We also support the `+PTX` suffix in # `TGT_CUDA_ARCHS` list of gencodes. We also support the `+PTX` suffix in
# `SRC_CUDA_ARCHS` which indicates that the PTX code should be built when there # `SRC_CUDA_ARCHS` which indicates that the PTX code should be built when there
# is a CUDA_ARCH in `TGT_CUDA_ARCHS` that is equal to or larger than the # is a CUDA_ARCH in `TGT_CUDA_ARCHS` that is equal to or larger than the
@ -277,7 +278,7 @@ endmacro()
# in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`. # in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`.
# We have special handling for x.0a, if x.0a is in `SRC_CUDA_ARCHS` and x.0 is # We have special handling for x.0a, if x.0a is in `SRC_CUDA_ARCHS` and x.0 is
# in `TGT_CUDA_ARCHS` then we should remove x.0a from `SRC_CUDA_ARCHS` and add # in `TGT_CUDA_ARCHS` then we should remove x.0a from `SRC_CUDA_ARCHS` and add
# x.0a to the result (and remove x.0 from TGT_CUDA_ARCHS). # x.0a to the result (and remove x.0 from TGT_CUDA_ARCHS).
# The result is stored in `OUT_CUDA_ARCHS`. # The result is stored in `OUT_CUDA_ARCHS`.
# #
# Example: # Example:
@ -312,21 +313,16 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
# if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
# remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
set(_CUDA_ARCHS) set(_CUDA_ARCHS)
if ("9.0a" IN_LIST _SRC_CUDA_ARCHS) foreach(_arch ${_SRC_CUDA_ARCHS})
list(REMOVE_ITEM _SRC_CUDA_ARCHS "9.0a") if(_arch MATCHES "\\a$")
if ("9.0" IN_LIST TGT_CUDA_ARCHS) list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
list(REMOVE_ITEM _TGT_CUDA_ARCHS "9.0") string(REPLACE "a" "" _base "${_arch}")
set(_CUDA_ARCHS "9.0a") if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
list(APPEND _CUDA_ARCHS "${_arch}")
endif()
endif() endif()
endif() endforeach()
if ("10.0a" IN_LIST _SRC_CUDA_ARCHS)
list(REMOVE_ITEM _SRC_CUDA_ARCHS "10.0a")
if ("10.0" IN_LIST TGT_CUDA_ARCHS)
list(REMOVE_ITEM _TGT_CUDA_ARCHS "10.0")
set(_CUDA_ARCHS "10.0a")
endif()
endif()
list(SORT _SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) list(SORT _SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
@ -358,7 +354,7 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
endforeach() endforeach()
list(REMOVE_DUPLICATES _CUDA_ARCHS) list(REMOVE_DUPLICATES _CUDA_ARCHS)
# reapply +PTX suffix to architectures that requested PTX # reapply +PTX suffix to architectures that requested PTX
set(_FINAL_ARCHS) set(_FINAL_ARCHS)
foreach(_arch ${_CUDA_ARCHS}) foreach(_arch ${_CUDA_ARCHS})
@ -369,7 +365,7 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
endif() endif()
endforeach() endforeach()
set(_CUDA_ARCHS ${_FINAL_ARCHS}) set(_CUDA_ARCHS ${_FINAL_ARCHS})
set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE) set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
endfunction() endfunction()

View File

@ -207,7 +207,7 @@ void cutlass_mla_decode_sm100a(torch::Tensor const& out,
"page_table must be a 32-bit integer tensor"); "page_table must be a 32-bit integer tensor");
auto in_dtype = q_nope.dtype(); auto in_dtype = q_nope.dtype();
at::cuda::CUDAGuard device_guard{(char)q_nope.get_device()}; const at::cuda::OptionalCUDAGuard device_guard(device_of(q_nope));
const cudaStream_t stream = const cudaStream_t stream =
at::cuda::getCurrentCUDAStream(q_nope.get_device()); at::cuda::getCurrentCUDAStream(q_nope.get_device());
if (in_dtype == at::ScalarType::Half) { if (in_dtype == at::ScalarType::Half) {

View File

@ -131,16 +131,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// Quantization // Quantization
#ifdef __AVX512F__ #ifdef __AVX512F__
at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
// Compute int8 quantized tensor for given scaling factor. // Compute int8 quantized tensor for given scaling factor.
ops.def( ops.def(
"static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale," "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
"Tensor? azp) -> ()"); "Tensor? azp) -> ()",
{stride_tag});
ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant); ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);
// Compute int8 quantized tensor and scaling factor // Compute int8 quantized tensor and scaling factor
ops.def( ops.def(
"dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, " "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, "
"Tensor!? azp) -> ()"); "Tensor!? azp) -> ()",
{stride_tag});
ops.impl("dynamic_scaled_int8_quant", torch::kCPU, ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
&dynamic_scaled_int8_quant); &dynamic_scaled_int8_quant);
// W8A8 GEMM, supporting symmetric per-tensor or per-row/column // W8A8 GEMM, supporting symmetric per-tensor or per-row/column
@ -148,7 +151,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def( ops.def(
"cutlass_scaled_mm(Tensor! out, Tensor a," "cutlass_scaled_mm(Tensor! out, Tensor a,"
" Tensor b, Tensor a_scales," " Tensor b, Tensor a_scales,"
" Tensor b_scales, Tensor? bias) -> ()"); " Tensor b_scales, Tensor? bias) -> ()",
{stride_tag});
ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm); ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm);
// w8a8 GEMM, supporting asymmetric per-tensor or per-row/column // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column
// quantization. // quantization.
@ -156,7 +160,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"cutlass_scaled_mm_azp(Tensor! out, Tensor a," "cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
" Tensor b, Tensor a_scales," " Tensor b, Tensor a_scales,"
" Tensor b_scales, Tensor azp_adj," " Tensor b_scales, Tensor azp_adj,"
" Tensor? azp, Tensor? bias) -> ()"); " Tensor? azp, Tensor? bias) -> ()",
{stride_tag});
ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp); ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
#elif defined(__powerpc64__) #elif defined(__powerpc64__)
// Compute int8 quantized tensor for given scaling factor. // Compute int8 quantized tensor for given scaling factor.

114
csrc/custom_quickreduce.cu Normal file
View File

@ -0,0 +1,114 @@
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <torch/all.h>
#ifdef USE_ROCM
#include "quickreduce/quick_reduce.h"
// Creates and initializes a quickreduce::DeviceComms instance and returns it
// as an opaque handle.
//
// rank        - index of this process; must satisfy 0 <= rank < world_size.
// world_size  - number of participating GPUs; only 2, 4 and 8 pass the checks
//               below.
// qr_max_size - optional size limit forwarded to DeviceComms::init.
//
// Throws std::invalid_argument for an unsupported world size or rank. The
// returned handle is released with qr_destroy.
quickreduce::fptr_t init_custom_qr(int64_t rank, int64_t world_size,
                                   std::optional<int64_t> qr_max_size) {
  if (world_size > 8)
    throw std::invalid_argument("world size > 8 is not supported");
  if (world_size == 6)
    throw std::invalid_argument("world size == 6 is not supported");
  if (world_size % 2 != 0)
    throw std::invalid_argument("Odd num gpus is not supported for now");
  if (rank < 0 || rank >= world_size)
    throw std::invalid_argument("invalid rank passed in");

  quickreduce::DeviceComms* fptr = new quickreduce::DeviceComms();
  try {
    fptr->init(world_size, rank, qr_max_size);
  } catch (...) {
    // The caller never receives the handle on failure, so free the object
    // here instead of leaking it, then propagate the original error.
    delete fptr;
    throw;
  }
  return (quickreduce::fptr_t)fptr;
}
// Tears down and frees the DeviceComms instance behind `_fa`.
// A zero handle is ignored.
void qr_destroy(quickreduce::fptr_t _fa) {
  if (!_fa) {
    return;
  }
  auto* comms = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
  comms->destroy();
  delete comms;
}
// Returns this instance's HIP IPC memory handle packed byte-for-byte into a
// 1-D uint8 CPU tensor of sizeof(hipIpcMemHandle_t) elements.
torch::Tensor qr_get_handle(quickreduce::fptr_t _fa) {
  constexpr auto kHandleBytes = sizeof(hipIpcMemHandle_t);

  auto* comms = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
  hipIpcMemHandle_t ipc_handle = comms->get_handle();

  torch::Tensor packed = torch::empty(
      {static_cast<int64_t>(kHandleBytes)},
      torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU));
  // Raw byte copy: the tensor is only a container for the opaque handle.
  std::memcpy(packed.data_ptr(), &ipc_handle, kHandleBytes);
  return packed;
}
// Unpacks one HIP IPC memory handle from each tensor in `handles` and passes
// them all to DeviceComms::open_ipc_handles.
//
// Each tensor must be a defined, contiguous CPU tensor holding at least
// sizeof(hipIpcMemHandle_t) bytes (the layout produced by qr_get_handle).
// Anything else is rejected with a TORCH_CHECK error instead of being copied
// with memcpy from memory that is too small or not host-accessible.
void qr_open_handles(quickreduce::fptr_t _fa,
                     const std::vector<torch::Tensor>& handles) {
  auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
  std::vector<hipIpcMemHandle_t> ipc_handles;
  ipc_handles.reserve(handles.size());
  for (const auto& handle : handles) {
    TORCH_CHECK(handle.defined(), "IPC handle tensor must be defined");
    // memcpy below reads through a host pointer.
    TORCH_CHECK(handle.is_cpu(), "IPC handle tensor must be on the CPU");
    TORCH_CHECK(handle.is_contiguous(), "IPC handle tensor must be contiguous");
    TORCH_CHECK(handle.nbytes() >= sizeof(hipIpcMemHandle_t),
                "IPC handle tensor holds ", handle.nbytes(),
                " bytes, expected at least ", sizeof(hipIpcMemHandle_t));
    hipIpcMemHandle_t ipc_handle;
    std::memcpy(&ipc_handle, handle.data_ptr(), sizeof(hipIpcMemHandle_t));
    ipc_handles.push_back(ipc_handle);
  }
  fa->open_ipc_handles(ipc_handles);
}
// Runs a quick all-reduce from `inp` into `out` on the current stream of
// `inp`'s device.
//
// `inp` and `out` must have the same dtype and element count, and the element
// count must not exceed DeviceComms::kMaxProblemSize. Only float16 and
// bfloat16 are supported; any other dtype throws std::runtime_error. For
// bfloat16, `cast_bf2half` selects the allreduce<half, true> instantiation
// instead of the native bfloat16 one.
void qr_all_reduce(quickreduce::fptr_t _fa, torch::Tensor& inp,
                   torch::Tensor& out, int64_t quant_level, bool cast_bf2half) {
  auto* comms = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
  const at::cuda::OptionalCUDAGuard device_guard(device_of(inp));
  auto stream = at::cuda::getCurrentHIPStreamMasqueradingAsCUDA();

  TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type());
  TORCH_CHECK_EQ(inp.numel(), out.numel());
  TORCH_CHECK_LE(out.numel(), comms->kMaxProblemSize);

  switch (out.scalar_type()) {
    case at::ScalarType::Half:
      comms->allreduce<half, false>(reinterpret_cast<half*>(inp.data_ptr()),
                                    reinterpret_cast<half*>(out.data_ptr()),
                                    out.numel(), quant_level, stream);
      break;

    case at::ScalarType::BFloat16:
      if (!cast_bf2half) {
        comms->allreduce<quickreduce::nv_bfloat16, false>(
            reinterpret_cast<quickreduce::nv_bfloat16*>(inp.data_ptr()),
            reinterpret_cast<quickreduce::nv_bfloat16*>(out.data_ptr()),
            out.numel(), quant_level, stream);
      } else {
        comms->allreduce<half, true>(reinterpret_cast<half*>(inp.data_ptr()),
                                     reinterpret_cast<half*>(out.data_ptr()),
                                     out.numel(), quant_level, stream);
      }
      break;

    default:
      throw std::runtime_error(
          "quick allreduce only supports float16 and bfloat16");
  }
}
// Returns the default maximum size: 2^31 = 2,147,483,648 bytes (2 GiB).
int64_t qr_max_size() {
  constexpr int64_t kInt32Max = std::numeric_limits<int32_t>::max();
  return kInt32Max + 1;
}
// Explicit instantiations of quickreduce::AllReduceTwoshot.
//
// For an element type T, a codec template and a cast_bf2half flag, the macro
// instantiates the struct for Codec<T, 2>, Codec<T, 4> and Codec<T, 8>. Going
// by the macro name the integer is the world size; 2, 4 and 8 are also the
// only world sizes that init_custom_qr accepts.
#define INSTANTIATE_FOR_WORLDSIZE(T, Codec, cast_bf2half) \
template struct quickreduce::AllReduceTwoshot<T, Codec<T, 2>, \
cast_bf2half>; \
template struct quickreduce::AllReduceTwoshot<T, Codec<T, 4>, \
cast_bf2half>; \
template struct quickreduce::AllReduceTwoshot<T, Codec<T, 8>, cast_bf2half>;
// bfloat16 without the bf16 -> half cast, one line per codec.
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecFP, false)
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ4, false)
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ6, false)
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ8, false)
// bfloat16 with cast_bf2half enabled.
// NOTE(review): qr_all_reduce requests allreduce<half, true> for bf16 input;
// how that maps onto these instantiations is decided in quick_reduce.h --
// confirm there.
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecFP, true)
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ4, true)
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ6, true)
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ8, true)
// float16; only cast_bf2half = false is instantiated for this type.
INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecFP, false)
INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ4, false)
INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ6, false)
INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ8, false)
#endif // USE_ROCM

View File

@ -185,9 +185,7 @@ void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
params.conv_states_ptr = nullptr; params.conv_states_ptr = nullptr;
} }
// Otherwise the kernel will be launched from cuda:0 device const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
// Cast to char to avoid compiler warning about narrowing
at::cuda::CUDAGuard device_guard{(char)x.get_device()};
auto stream = at::cuda::getCurrentCUDAStream().stream(); auto stream = at::cuda::getCurrentCUDAStream().stream();
DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_fwd", [&] { DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_fwd", [&] {
causal_conv1d_fwd_cuda<input_t, weight_t>(params, stream); causal_conv1d_fwd_cuda<input_t, weight_t>(params, stream);
@ -278,9 +276,7 @@ void causal_conv1d_update(const at::Tensor &x,
params.conv_state_indices_ptr = nullptr; params.conv_state_indices_ptr = nullptr;
} }
// Otherwise the kernel will be launched from cuda:0 device const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
// Cast to char to avoid compiler warning about narrowing
at::cuda::CUDAGuard device_guard{(char)x.get_device()};
auto stream = at::cuda::getCurrentCUDAStream().stream(); auto stream = at::cuda::getCurrentCUDAStream().stream();
DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_update", [&] { DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_update", [&] {
causal_conv1d_update_cuda<input_t, weight_t>(params, stream); causal_conv1d_update_cuda<input_t, weight_t>(params, stream);

View File

@ -647,9 +647,7 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
); );
// Otherwise the kernel will be launched from cuda:0 device const at::cuda::OptionalCUDAGuard device_guard(device_of(u));
// Cast to char to avoid compiler warning about narrowing
at::cuda::CUDAGuard device_guard{(char)u.get_device()};
auto stream = at::cuda::getCurrentCUDAStream().stream(); auto stream = at::cuda::getCurrentCUDAStream().stream();
DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] { DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] {
selective_scan_fwd_cuda<input_t, weight_t>(params, stream); selective_scan_fwd_cuda<input_t, weight_t>(params, stream);

View File

@ -13,232 +13,45 @@
namespace vllm { namespace vllm {
namespace moe { namespace moe {
namespace {
__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
int32_t col) {
// don't worry about overflow because num_experts is relatively small
return row * total_col + col;
}
} // namespace
template <typename scalar_t, typename token_cnts_t>
__global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
int32_t* sorted_token_ids,
int32_t* expert_ids,
int32_t* total_tokens_post_pad,
int32_t num_experts,
int32_t block_size, size_t numel) {
const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
const size_t start_idx = threadIdx.x * tokens_per_thread;
extern __shared__ int32_t shared_mem[];
int32_t* cumsum = shared_mem; // 1d tensor with shape (num_experts + 1)
token_cnts_t* tokens_cnts =
(token_cnts_t*)(shared_mem + num_experts +
1); // 2d tensor with shape (blockDim.x + 1, num_experts)
for (int i = 0; i < num_experts; ++i) {
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
}
/**
* In the first step we compute token_cnts[thread_index + 1][expert_index],
* which counts how many tokens in the token shard of thread_index are
* assigned to expert expert_index.
*/
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])];
}
__syncthreads();
// For each expert we accumulate the token counts from the different threads.
if (threadIdx.x < num_experts) {
tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
for (int i = 1; i <= blockDim.x; ++i) {
tokens_cnts[index(num_experts, i, threadIdx.x)] +=
tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
}
}
__syncthreads();
// We accumulate the token counts of all experts in thread 0.
if (threadIdx.x == 0) {
cumsum[0] = 0;
for (int i = 1; i <= num_experts; ++i) {
cumsum[i] = cumsum[i - 1] +
CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
block_size) *
block_size;
}
*total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
}
__syncthreads();
/**
* For each expert, each thread processes the tokens of the corresponding
* blocks and stores the corresponding expert_id for each block.
*/
if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) {
expert_ids[i / block_size] = threadIdx.x;
}
}
/**
* Each thread processes a token shard, calculating the index of each token
* after sorting by expert number. Given the example topk_ids =
* [0,1,2,1,2,3,0,3,4] and block_size = 4, then the output would be [0, 6, *,
* *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], where * represents a
* padding value(preset in python).
*/
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
int32_t expert_id = topk_ids[i];
/** The cumsum[expert_id] stores the starting index of the tokens that the
* expert with expert_id needs to process, and
* tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
* processed by the expert with expert_id within the current thread's token
* shard.
*/
int32_t rank_post_pad =
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
cumsum[expert_id];
sorted_token_ids[rank_post_pad] = i;
++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
}
}
// TODO(simon): this is temporarily adapted from
// https://github.com/sgl-project/sglang/commit/31548116a8dc8c6df7e146e0587335a59fc5b9d7
// we did this to unblock Deepseek V3 but there should be a better
// implementation to manage shared memory.
template <typename scalar_t> template <typename scalar_t>
__global__ void moe_align_block_size_global_mem_kernel( __global__ void moe_align_block_size_kernel(
scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids, const scalar_t* __restrict__ topk_ids,
int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts, int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
int32_t block_size, size_t numel, int32_t* tokens_cnts, int32_t* cumsum) { int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
const size_t tokens_per_thread = CEILDIV(numel, blockDim.x); int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
const size_t start_idx = threadIdx.x * tokens_per_thread; size_t numel, int32_t* __restrict__ cumsum) {
extern __shared__ int32_t shared_counts[];
for (int i = 0; i < num_experts; ++i) { const int warp_id = threadIdx.x / WARP_SIZE;
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
}
/**
* In the first step we compute token_cnts[thread_index + 1][expert_index],
* which counts how many tokens in the token shard of thread_index are
* assigned to expert expert_index.
*/
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])];
}
__syncthreads();
// For each expert we accumulate the token counts from the different threads.
if (threadIdx.x < num_experts) {
tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
for (int i = 1; i <= blockDim.x; ++i) {
tokens_cnts[index(num_experts, i, threadIdx.x)] +=
tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
}
}
__syncthreads();
// We accumulate the token counts of all experts in thread 0.
if (threadIdx.x == 0) {
cumsum[0] = 0;
for (int i = 1; i <= num_experts; ++i) {
cumsum[i] = cumsum[i - 1] +
CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
block_size) *
block_size;
}
*total_tokens_post_pad = cumsum[num_experts];
}
__syncthreads();
/**
* For each expert, each thread processes the tokens of the corresponding
* blocks and stores the corresponding expert_id for each block.
*/
if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) {
expert_ids[i / block_size] = threadIdx.x;
}
}
/**
* Each thread processes a token shard, calculating the index of each token
* after sorting by expert number. Given the example topk_ids =
* [0,1,2,1,2,3,0,3,4] and block_size = 4, then the output would be [0, 6, *,
* *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], where * represents a
* padding value(preset in python).
*/
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
int32_t expert_id = topk_ids[i];
/** The cumsum[expert_id] stores the starting index of the tokens that the
* expert with expert_id needs to process, and
* tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
* processed by the expert with expert_id within the current thread's token
* shard.
*/
int32_t rank_post_pad =
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
cumsum[expert_id];
sorted_token_ids[rank_post_pad] = i;
++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
}
}
// taken from
// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957
template <typename scalar_t>
__global__ void sgl_moe_align_block_size_kernel(
scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids,
int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts,
int32_t block_size, size_t numel, int32_t* cumsum) {
__shared__ int32_t shared_counts[32][8];
const int warp_id = threadIdx.x / 32;
const int experts_per_warp = 8;
const int my_expert_start = warp_id * experts_per_warp; const int my_expert_start = warp_id * experts_per_warp;
// Initialize shared_counts for this warp's experts
for (int i = 0; i < experts_per_warp; ++i) { for (int i = 0; i < experts_per_warp; ++i) {
if (my_expert_start + i < num_experts) { if (my_expert_start + i < padded_num_experts) {
shared_counts[warp_id][i] = 0; shared_counts[warp_id * experts_per_warp + i] = 0;
} }
} }
__syncthreads(); __syncthreads();
const size_t tokens_per_thread = CEILDIV(numel, blockDim.x); const size_t tid = threadIdx.x;
const size_t start_idx = threadIdx.x * tokens_per_thread; const size_t stride = blockDim.x;
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { for (size_t i = tid; i < numel; i += stride) {
int expert_id = topk_ids[i]; int expert_id = topk_ids[i];
int warp_idx = expert_id / experts_per_warp; int warp_idx = expert_id / experts_per_warp;
int expert_offset = expert_id % experts_per_warp; int expert_offset = expert_id % experts_per_warp;
atomicAdd(&shared_counts[warp_idx][expert_offset], 1); atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
} }
__syncthreads(); __syncthreads();
// Single thread computes cumulative sum and total tokens
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
cumsum[0] = 0; cumsum[0] = 0;
for (int i = 1; i <= num_experts; ++i) { for (int i = 1; i <= num_experts; ++i) {
int expert_count = 0; int expert_count = 0;
int warp_idx = (i - 1) / experts_per_warp; int warp_idx = (i - 1) / experts_per_warp;
int expert_offset = (i - 1) % experts_per_warp; int expert_offset = (i - 1) % experts_per_warp;
expert_count = shared_counts[warp_idx][expert_offset]; expert_count = shared_counts[warp_idx * experts_per_warp + expert_offset];
cumsum[i] = cumsum[i] =
cumsum[i - 1] + CEILDIV(expert_count, block_size) * block_size; cumsum[i - 1] + CEILDIV(expert_count, block_size) * block_size;
@ -248,7 +61,6 @@ __global__ void sgl_moe_align_block_size_kernel(
__syncthreads(); __syncthreads();
// Assign expert IDs to blocks
if (threadIdx.x < num_experts) { if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) { i += block_size) {
@ -257,13 +69,11 @@ __global__ void sgl_moe_align_block_size_kernel(
} }
} }
// taken from
// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957
template <typename scalar_t> template <typename scalar_t>
__global__ void sgl_moe_token_sort_kernel(scalar_t* __restrict__ topk_ids, __global__ void count_and_sort_expert_tokens_kernel(
int32_t* sorted_token_ids, const scalar_t* __restrict__ topk_ids,
int32_t* cumsum_buffer, int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
size_t numel) { size_t numel) {
const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * gridDim.x; const size_t stride = blockDim.x * gridDim.x;
@ -290,132 +100,138 @@ __global__ void moe_sum_kernel(
} }
} }
// Counts tokens per expert, pads each expert's count up to a multiple of
// block_size, and scatters token indices into sorted_token_ids grouped by
// expert, all inside shared memory.
//
// The kernel indexes work only by threadIdx.x / blockDim.x, so every block
// would repeat the same work; it is presumably launched with a single block
// (TODO confirm against the launch site).
//
// Dynamic shared memory layout (int32_t elements):
//   cumsum      : num_experts + 1 entries (padded exclusive prefix sums)
//   tokens_cnts : (blockDim.x + 1) * num_experts entries, addressed row-major
//                 as tokens_cnts[row * num_experts + expert]
//
// Parameters:
//   topk_ids              expert id of each of the numel tokens; values are
//                         used directly as column indices, so they are
//                         assumed to lie in [0, num_experts)
//   sorted_token_ids      out: token indices grouped by expert; only slots
//                         belonging to real tokens are written here, padding
//                         slots are left untouched
//   expert_ids            out: expert id owning each block_size-sized block
//   total_tokens_post_pad out: total slot count after padding
// NOTE(review): the two `threadIdx.x < num_experts` phases only cover every
// expert when blockDim.x >= num_experts.
template <typename scalar_t>
__global__ void moe_align_block_size_small_batch_expert_kernel(
const scalar_t* __restrict__ topk_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
int32_t block_size, size_t numel) {
const size_t tid = threadIdx.x;
const size_t stride = blockDim.x;
extern __shared__ int32_t shared_mem[];
int32_t* cumsum = shared_mem;
int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1);
// Phase 1: each thread clears and then fills its own row (threadIdx.x + 1)
// with the per-expert counts of the tokens it visits.
for (int i = 0; i < num_experts; ++i) {
tokens_cnts[(threadIdx.x + 1) * num_experts + i] = 0;
}
for (size_t i = tid; i < numel; i += stride) {
++tokens_cnts[(threadIdx.x + 1) * num_experts + topk_ids[i]];
}
// Rows written by other threads are read below.
__syncthreads();
// Phase 2: thread e runs a prefix sum down column e. Afterwards row r holds
// the number of tokens for expert e counted by threads 0..r-1, and row
// blockDim.x holds the expert's total.
if (threadIdx.x < num_experts) {
tokens_cnts[threadIdx.x] = 0;
for (int i = 1; i <= blockDim.x; ++i) {
tokens_cnts[i * num_experts + threadIdx.x] +=
tokens_cnts[(i - 1) * num_experts + threadIdx.x];
}
}
__syncthreads();
// Phase 3: thread 0 builds cumsum, where each expert's total is rounded up to
// a multiple of block_size (CEILDIV is defined elsewhere; presumably ceiling
// division). cumsum[e] is the first output slot of expert e.
if (threadIdx.x == 0) {
cumsum[0] = 0;
for (int i = 1; i <= num_experts; ++i) {
cumsum[i] =
cumsum[i - 1] +
CEILDIV(tokens_cnts[blockDim.x * num_experts + i - 1], block_size) *
block_size;
}
*total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
}
__syncthreads();
// Phase 4: thread e labels every block in expert e's padded range.
if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) {
expert_ids[i / block_size] = threadIdx.x;
}
}
// Phase 5: scatter. Row threadIdx.x now serves as this thread's running
// offset inside each expert's range; it starts at the number of tokens that
// lower-numbered threads contribute and is bumped after each placement.
for (size_t i = tid; i < numel; i += stride) {
int32_t expert_id = topk_ids[i];
int32_t rank_post_pad =
tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id];
sorted_token_ids[rank_post_pad] = i;
++tokens_cnts[threadIdx.x * num_experts + expert_id];
}
}
} // namespace moe } // namespace moe
} // namespace vllm } // namespace vllm
// taken from
// https://github.com/sgl-project/sglang/blob/8b5f83ed3b7d2a49ad5c5cd5aa61c5d502f47dbc
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size, torch::Tensor sorted_token_ids, int64_t block_size, torch::Tensor sorted_token_ids,
torch::Tensor experts_ids, torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad) { torch::Tensor num_tokens_post_pad) {
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int device_max_shared_mem; int64_t padded_num_experts =
auto dev = topk_ids.get_device(); ((num_experts + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
cudaDeviceGetAttribute(&device_max_shared_mem, int experts_per_warp = WARP_SIZE;
cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); int threads = 1024;
threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
const int32_t shared_mem_i32 =
((num_thread + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t);
const int32_t shared_mem_i16 =
((num_thread + 1) * num_experts) * sizeof(uint16_t) +
(num_experts + 1) * sizeof(int32_t);
bool use_global_memory = false;
bool use_i16 = false; // Use uint16_t for shared memory token counts
if (shared_mem_i32 < device_max_shared_mem) {
// Do nothing in this case. We're all set to use int32_t token counts
} else if (shared_mem_i16 < device_max_shared_mem &&
topk_ids.numel() <= 65535) {
// when nelements of topk_ids is smaller than 65535 (max value of uint16),
// element value of token_cnts would also smaller than 65535,
// so we can use uint16 as dtype of token_cnts
use_i16 = true;
} else {
use_global_memory = true;
}
if (use_global_memory) {
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] {
// calc needed amount of shared mem for `tokens_cnts` and `cumsum`
// tensors
const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
auto options_int = torch::TensorOptions()
.dtype(torch::kInt)
.device(topk_ids.device());
torch::Tensor token_cnts_buffer =
torch::empty({(num_experts + 1) * num_experts}, options_int);
torch::Tensor cumsum_buffer =
torch::empty({num_experts + 1}, options_int);
auto kernel =
vllm::moe::moe_align_block_size_global_mem_kernel<scalar_t>;
kernel<<<1, num_thread, 0, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
topk_ids.numel(), token_cnts_buffer.data_ptr<int32_t>(),
cumsum_buffer.data_ptr<int32_t>());
});
} else if (use_i16) {
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
// set dynamic shared mem
auto kernel =
vllm::moe::moe_align_block_size_kernel<scalar_t, uint16_t>;
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
(void*)kernel, shared_mem_i16));
kernel<<<1, num_thread, shared_mem_i16, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
topk_ids.numel());
});
} else {
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
auto kernel =
vllm::moe::moe_align_block_size_kernel<scalar_t, int32_t>;
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
(void*)kernel, shared_mem_i32));
kernel<<<1, num_thread, shared_mem_i32, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
topk_ids.numel());
});
}
}
void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size,
torch::Tensor sorted_token_ids,
torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad) {
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
TORCH_CHECK(num_experts == 256,
"sgl_moe_align_block_size kernel only supports deepseek v3.");
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES( VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] { topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
// calc needed amount of shared mem for `cumsum` tensors // calc needed amount of shared mem for `cumsum` tensors
auto options_int = auto options_int =
torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device()); torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
torch::Tensor cumsum_buffer = torch::Tensor cumsum_buffer =
torch::zeros({num_experts + 1}, options_int); torch::zeros({num_experts + 1}, options_int);
bool small_batch_expert_mode =
(topk_ids.numel() < 1024) && (num_experts <= 64);
auto align_kernel = if (small_batch_expert_mode) {
vllm::moe::sgl_moe_align_block_size_kernel<scalar_t>; const int32_t threads = max((int32_t)num_experts, WARP_SIZE);
align_kernel<<<1, 1024, 0, stream>>>( const int32_t shared_mem_size =
topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(), ((threads + 1) * num_experts + (num_experts + 1)) *
experts_ids.data_ptr<int32_t>(), sizeof(int32_t);
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>());
const int block_threads = 256; auto small_batch_expert_kernel =
const int num_blocks = vllm::moe::moe_align_block_size_small_batch_expert_kernel<
(topk_ids.numel() + block_threads - 1) / block_threads; scalar_t>;
const int max_blocks = 65535; small_batch_expert_kernel<<<1, threads, shared_mem_size, stream>>>(
const int actual_blocks = std::min(num_blocks, max_blocks); topk_ids.data_ptr<scalar_t>(),
auto sort_kernel = vllm::moe::sgl_moe_token_sort_kernel<scalar_t>; sorted_token_ids.data_ptr<int32_t>(),
sort_kernel<<<actual_blocks, block_threads, 0, stream>>>( experts_ids.data_ptr<int32_t>(),
topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(), num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel()); topk_ids.numel());
} else {
auto align_kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;
size_t num_warps = CEILDIV(padded_num_experts, experts_per_warp);
size_t shared_mem_size =
num_warps * experts_per_warp * sizeof(int32_t);
align_kernel<<<1, threads, shared_mem_size, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(), num_experts,
padded_num_experts, experts_per_warp, block_size,
topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>());
const int block_threads = std::min(256, (int)threads);
const int num_blocks =
(topk_ids.numel() + block_threads - 1) / block_threads;
const int max_blocks = 65535;
const int actual_blocks = std::min(num_blocks, max_blocks);
auto sort_kernel =
vllm::moe::count_and_sort_expert_tokens_kernel<scalar_t>;
sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),
cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
}
}); });
} }

View File

@ -12,12 +12,6 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size, torch::Tensor sorted_token_ids, int64_t block_size, torch::Tensor sorted_token_ids,
torch::Tensor experts_ids, torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad); torch::Tensor num_tokens_post_pad);
void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size,
torch::Tensor sorted_token_ids,
torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad);
#ifndef USE_ROCM #ifndef USE_ROCM
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
torch::Tensor b_qweight, torch::Tensor b_scales, torch::Tensor b_qweight, torch::Tensor b_scales,

View File

@ -22,15 +22,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
" Tensor! num_tokens_post_pad) -> ()"); " Tensor! num_tokens_post_pad) -> ()");
m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size); m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
// temporarily adapted from
// https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a
m.def(
"sgl_moe_align_block_size(Tensor topk_ids, int num_experts,"
" int block_size, Tensor! sorted_token_ids,"
" Tensor! experts_ids,"
" Tensor! num_tokens_post_pad) -> ()");
m.impl("sgl_moe_align_block_size", torch::kCUDA, &sgl_moe_align_block_size);
#ifndef USE_ROCM #ifndef USE_ROCM
m.def( m.def(
"moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, " "moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "

View File

@ -360,3 +360,14 @@ std::tuple<int64_t, torch::Tensor> allocate_shared_buffer_and_handle(
int64_t size); int64_t size);
int64_t open_mem_handle(torch::Tensor& mem_handle); int64_t open_mem_handle(torch::Tensor& mem_handle);
void free_shared_buffer(int64_t buffer); void free_shared_buffer(int64_t buffer);
#ifdef USE_ROCM
fptr_t init_custom_qr(int64_t rank, int64_t world_size,
std::optional<int64_t> qr_max_size = std::nullopt);
void qr_destroy(fptr_t _fa);
torch::Tensor qr_get_handle(fptr_t _fa);
void qr_open_handles(fptr_t _fa, const std::vector<torch::Tensor>& handles);
void qr_all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
int64_t quant_level, bool cast_bf2half = false);
int64_t qr_max_size();
#endif

View File

@ -29,26 +29,12 @@ struct sm100_fp8_config_default {
template <typename InType, typename OutType, template <typename InType, typename OutType,
template <typename, typename, typename> typename Epilogue> template <typename, typename, typename> typename Epilogue>
struct sm100_fp8_config_M256 { struct sm100_fp8_config_M256 {
// M in (128, 256] // M in (64, 256]
static_assert(std::is_same<InType, cutlass::float_e4m3_t>()); static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
using TileShape = Shape<_128, _128, _128>; using TileShape = Shape<_128, _128, _128>;
using ClusterShape = Shape<_2, _2, _1>; using ClusterShape = Shape<_2, _1, _1>;
using Cutlass3xGemm =
cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
KernelSchedule, EpilogueSchedule>;
};
template <typename InType, typename OutType,
template <typename, typename, typename> typename Epilogue>
struct sm100_fp8_config_M128 {
// M in (64, 128]
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
using TileShape = Shape<_128, _128, _256>;
using ClusterShape = Shape<_2, _4, _1>;
using Cutlass3xGemm = using Cutlass3xGemm =
cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape, cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
KernelSchedule, EpilogueSchedule>; KernelSchedule, EpilogueSchedule>;
@ -57,12 +43,26 @@ struct sm100_fp8_config_M128 {
template <typename InType, typename OutType, template <typename InType, typename OutType,
template <typename, typename, typename> typename Epilogue> template <typename, typename, typename> typename Epilogue>
struct sm100_fp8_config_M64 { struct sm100_fp8_config_M64 {
// M in [1, 64] // M in (16, 64]
static_assert(std::is_same<InType, cutlass::float_e4m3_t>()); static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
using TileShape = Shape<_64, _64, _256>; using TileShape = Shape<_64, _64, _128>;
using ClusterShape = Shape<_1, _8, _1>; using ClusterShape = Shape<_1, _1, _1>;
using Cutlass3xGemm =
cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
KernelSchedule, EpilogueSchedule>;
};
template <typename InType, typename OutType,
template <typename, typename, typename> typename Epilogue>
struct sm100_fp8_config_M16 {
// M in [1, 16]
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
using TileShape = Shape<_64, _64, _128>;
using ClusterShape = Shape<_1, _4, _1>;
using Cutlass3xGemm = using Cutlass3xGemm =
cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape, cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
KernelSchedule, EpilogueSchedule>; KernelSchedule, EpilogueSchedule>;
@ -82,27 +82,27 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
using Cutlass3xGemmDefault = using Cutlass3xGemmDefault =
typename sm100_fp8_config_default<InType, OutType, typename sm100_fp8_config_default<InType, OutType,
Epilogue>::Cutlass3xGemm; Epilogue>::Cutlass3xGemm;
using Cutlass3xGemmM16 =
typename sm100_fp8_config_M16<InType, OutType, Epilogue>::Cutlass3xGemm;
using Cutlass3xGemmM64 = using Cutlass3xGemmM64 =
typename sm100_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm; typename sm100_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
using Cutlass3xGemmM128 =
typename sm100_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
using Cutlass3xGemmM256 = using Cutlass3xGemmM256 =
typename sm100_fp8_config_M256<InType, OutType, Epilogue>::Cutlass3xGemm; typename sm100_fp8_config_M256<InType, OutType, Epilogue>::Cutlass3xGemm;
uint32_t const m = a.size(0); uint32_t const m = a.size(0);
uint32_t const mp2 = uint32_t const mp2 =
std::max(static_cast<uint32_t>(64), next_pow_2(m)); // next power of 2 std::max(static_cast<uint32_t>(16), next_pow_2(m)); // next power of 2
if (mp2 <= 64) { if (mp2 <= 16) {
// m in [1, 64] // m in [1, 16]
return cutlass_gemm_caller<Cutlass3xGemmM16>(
out, a, b, std::forward<EpilogueArgs>(args)...);
} else if (mp2 <= 64) {
// m in (16, 64]
return cutlass_gemm_caller<Cutlass3xGemmM64>( return cutlass_gemm_caller<Cutlass3xGemmM64>(
out, a, b, std::forward<EpilogueArgs>(args)...); out, a, b, std::forward<EpilogueArgs>(args)...);
} else if (mp2 <= 128) {
// m in (64, 128]
return cutlass_gemm_caller<Cutlass3xGemmM128>(
out, a, b, std::forward<EpilogueArgs>(args)...);
} else if (mp2 <= 256) { } else if (mp2 <= 256) {
// m in (128, 256] // m in (64, 256]
return cutlass_gemm_caller<Cutlass3xGemmM256>( return cutlass_gemm_caller<Cutlass3xGemmM256>(
out, a, b, std::forward<EpilogueArgs>(args)...); out, a, b, std::forward<EpilogueArgs>(args)...);
} else { } else {

View File

@ -241,7 +241,7 @@ void get_cutlass_moe_mm_data(
// mm to run it for. // mm to run it for.
int32_t version_num = get_sm_version_num(); int32_t version_num = get_sm_version_num();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ #if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_SCALED_MM_SM100 && ENABLE_SCALED_MM_SM90) (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100)
get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1, get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1,
problem_sizes2, input_permutation, problem_sizes2, input_permutation,
output_permutation, num_experts, n, k, output_permutation, num_experts, n, k,
@ -252,7 +252,7 @@ void get_cutlass_moe_mm_data(
false, false,
"No compiled get_cutlass_moe_mm_data: no cutlass_scaled_mm kernel for " "No compiled get_cutlass_moe_mm_data: no cutlass_scaled_mm kernel for "
"CUDA device capability: ", "CUDA device capability: ",
version_num, ". Required capability: 90"); version_num, ". Required capability: 90 or 100");
} }
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets, void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
@ -265,7 +265,8 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
// This function currently gets compiled only if we have a valid cutlass moe // This function currently gets compiled only if we have a valid cutlass moe
// mm to run it for. // mm to run it for.
int32_t version_num = get_sm_version_num(); int32_t version_num = get_sm_version_num();
#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90 #if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100)
get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1, get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1,
problem_sizes2, expert_num_tokens, problem_sizes2, expert_num_tokens,
num_local_experts, padded_m, n, k); num_local_experts, padded_m, n, k);
@ -275,7 +276,7 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
false, false,
"No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel " "No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel "
"for CUDA device capability: ", "for CUDA device capability: ",
version_num, ". Required capability: 90"); version_num, ". Required capability: 90 or 100");
} }
void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a, void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,

View File

@ -561,7 +561,7 @@ void scaled_fp4_experts_quant_sm100a(
TORCH_CHECK(output_scale.size(1) * 4 == padded_k); TORCH_CHECK(output_scale.size(1) * 4 == padded_k);
auto in_dtype = input.dtype(); auto in_dtype = input.dtype();
at::cuda::CUDAGuard device_guard{(char)input.get_device()}; const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
const cudaStream_t stream = const cudaStream_t stream =
at::cuda::getCurrentCUDAStream(input.get_device()); at::cuda::getCurrentCUDAStream(input.get_device());
if (in_dtype == at::ScalarType::Half) { if (in_dtype == at::ScalarType::Half) {
@ -579,4 +579,4 @@ void scaled_fp4_experts_quant_sm100a(
} else { } else {
TORCH_CHECK(false, "Expected input data type to be half or bfloat16"); TORCH_CHECK(false, "Expected input data type to be half or bfloat16");
} }
} }

View File

@ -347,7 +347,7 @@ void scaled_fp4_quant_sm100a(torch::Tensor const& output,
auto input_sf_ptr = static_cast<float const*>(input_sf.data_ptr()); auto input_sf_ptr = static_cast<float const*>(input_sf.data_ptr());
auto sf_out = static_cast<int32_t*>(output_sf.data_ptr()); auto sf_out = static_cast<int32_t*>(output_sf.data_ptr());
auto output_ptr = static_cast<int64_t*>(output.data_ptr()); auto output_ptr = static_cast<int64_t*>(output.data_ptr());
at::cuda::CUDAGuard device_guard{(char)input.get_device()}; const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
auto stream = at::cuda::getCurrentCUDAStream(input.get_device()); auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
// We don't support e8m0 scales at this moment. // We don't support e8m0 scales at this moment.

View File

@ -267,7 +267,7 @@ void cutlass_scaled_fp4_mm_sm100a(torch::Tensor& D, torch::Tensor const& A,
B_sf.sizes()[1], ")"); B_sf.sizes()[1], ")");
auto out_dtype = D.dtype(); auto out_dtype = D.dtype();
at::cuda::CUDAGuard device_guard{(char)A.get_device()}; const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(A.get_device()); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(A.get_device());
if (out_dtype == at::ScalarType::Half) { if (out_dtype == at::ScalarType::Half) {

338
csrc/quickreduce/base.h Normal file
View File

@ -0,0 +1,338 @@
#pragma once
#include <cstdint>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <hip/hip_bf16.h>
#define __quickreduce_device_inline__ __device__ __forceinline__
#define __quickreduce_launch_bounds_two_shot__ __launch_bounds__(256, 4)
#define __quickreduce_launch_bounds_one_shot__ __launch_bounds__(512, 4)
namespace quickreduce {
typedef __hip_bfloat16 nv_bfloat16;
typedef __hip_bfloat162 nv_bfloat162;
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
// Setup acquire-release semantics for vector memory reads (mubuf instruction)
// as per architecture.
#if defined(__gfx942__)
// CDNA3: Scope bits sc0, sc1
#define MUBUF_ACQUIRE 16
#define MUBUF_RELEASE 16
#elif (defined(__gfx908__) || defined(__gfx90a__))
// CDNA1 and CDNA2 - glc bit
#define MUBUF_ACQUIRE 1
#define MUBUF_RELEASE 0
#endif
// Bit pattern holding two packed fp16 values; 0xBC00 encodes -1.0 in IEEE
// half precision. Used as the multiplier when emulating packed subtraction.
static constexpr int kNegOne = 0xBC00BC00; // {-1, -1}, fp16x2_t
// Number of atoms (4xf16x2_t) processed by a single thread
static constexpr int kAtoms = 8;
// We use a workgroup of 256 threads
static constexpr int kBlockSize = 256;
// Same value as kBlockSize; presumably the distance, in atoms, between
// consecutive atoms handled by one thread (its use is outside this view).
static constexpr int kAtomStride = kBlockSize;
// Size and atom stride of source/destination data that the block will
// process.
// Workgroup scope = Tile = (256 threads x 8 atoms x 16B)
static constexpr int kTileSize = kBlockSize * kAtoms * sizeof(int32x4_t);
// Max number of blocks. 304 CUs on MI300
// (304 * 4 = 1216 blocks in total.)
static constexpr int kMaxNumBlocks = 304 * 4;
// Standard CDNA wavefront size.
static constexpr int kWavefront = 64;
// 256 thread, 4 wavefronts.
// Laid out as x = kWavefront lanes, y = kBlockSize / kWavefront wavefronts.
static dim3 constexpr kBlockTwoShot = {kWavefront, kBlockSize / kWavefront, 1};
// Number of threads in a group for quantization
// It corresponds to 32 F16 elements in quantization block
static constexpr int kThreadGroupSize = 8;
// Ceiling division for unsigned operands: the smallest q with q * y >= x,
// provided x + y - 1 does not wrap. y must be non-zero.
__quickreduce_device_inline__ __host__ unsigned long divceil(unsigned long x,
                                                             unsigned long y) {
  // Bias the numerator so truncating division rounds up.
  const unsigned long biased = x + y - 1;
  return biased / y;
}
// 16-byte buffer resource descriptor, viewable either as one int32x4_t
// (`descriptor`) or as its named fields (address / range / config).
// `descriptor` has the same type as the `srsrc` operand of the raw buffer
// load/store bindings declared in this header, which is presumably where it
// is consumed (call sites are outside this view).
union BufferResource {
// Default descriptor: only the constant config word is initialised.
__quickreduce_device_inline__ constexpr BufferResource()
: config(0x00020000U) {}
// Descriptor for `buffer_size` bytes starting at `buffer_address`.
__quickreduce_device_inline__ constexpr BufferResource(void* buffer_address,
uint32_t buffer_size)
: address(buffer_address), range(buffer_size), config(0x00020000U) {}
// Whole descriptor as a 4 x 32-bit vector.
int32x4_t descriptor;
// Field view of the same 16 bytes (anonymous struct inside the union).
struct {
void* address; // 8B, out of which first 48b is address, and 16b is stride
// (unused)
uint32_t range; // Byte range for the buffer resource
uint32_t config; // Constant, DFMT=32b
};
};
// Declaration only: the asm label binds this name to the LLVM intrinsic
// llvm.amdgcn.raw.buffer.load.v4i32, so a call lowers to a raw 4-dword buffer
// load. `srsrc` is a buffer descriptor (see BufferResource::descriptor).
// NOTE(review): voffset/soffset are presumably the per-lane and scalar byte
// offsets, and `aux` the cache-policy bits (e.g. MUBUF_ACQUIRE) -- confirm
// against the LLVM AMDGPU intrinsic documentation.
__quickreduce_device_inline__ static int32x4_t buffer_load_dwordx4(
int32x4_t srsrc, int32_t voffset, int32_t soffset,
int32_t aux) __asm("llvm.amdgcn.raw.buffer.load.v4i32");
// Declaration only: bound to llvm.amdgcn.raw.buffer.store.v4i32; stores the
// four dwords in `data` through the descriptor `srsrc`. The remaining
// operands mirror buffer_load_dwordx4.
__quickreduce_device_inline__ static void buffer_store_dwordx4(
int32x4_t data, int32x4_t srsrc, int32_t voffset, int32_t soffset,
int32_t aux) __asm("llvm.amdgcn.raw.buffer.store.v4i32");
// Writes 1 (value == true) or 0 (value == false) to the hardware register
// field encoded as 0xdc1. Only emitted when compiling for gfx942; on every
// other target the function body is empty.
// NOTE(review): judging by the name, the field is presumably the FP16
// overflow-clamp mode bit -- confirm against the ISA guide.
__quickreduce_device_inline__ static void set_fp16_ovfl(bool const value) {
#if defined(__gfx942__)
  // The instruction takes its operand as an immediate, so each setting is a
  // separate instruction rather than passing `value` through.
  if (!value) {
    asm volatile("s_setreg_imm32_b32 0xdc1, 0;" ::);
  } else {
    asm volatile("s_setreg_imm32_b32 0xdc1, 1;" ::);
  }
#endif
}
// Reinterprets a 32-bit int as a pair of bfloat16 values and back. The
// bfloat16 specializations of the packed_* helpers use it to move between the
// raw `int` they pass around and the nv_bfloat162 type the intrinsics take.
union bf162_int_union {
int i; // raw 32-bit view
nv_bfloat162 bf2; // same bits viewed as two bfloat16 lanes
};
// In-place accumulate of one 16-byte atom: A[0] += B[0], with the atom
// treated as packed 16-bit floating-point values of type T. Only the first
// int32x4_t behind each pointer is touched. Specialized for half and
// nv_bfloat16; no generic definition is provided.
template <typename T>
__quickreduce_device_inline__ void packed_assign_add(int32x4_t* A,
int32x4_t* B);
// fp16: one packed-add instruction per 32-bit word of the atom.
template <>
__quickreduce_device_inline__ void packed_assign_add<half>(int32x4_t* A,
int32x4_t* B) {
// Naming note: tR_fragment aliases the destination A, while tA_fragment
// aliases the addend B.
int32x4_t& tR_fragment = A[0];
int32x4_t& tA_fragment = B[0];
asm volatile("v_pk_add_f16 %0, %1, %2"
: "=v"(tR_fragment[0])
: "v"(tR_fragment[0]), "v"(tA_fragment[0]));
asm volatile("v_pk_add_f16 %0, %1, %2"
: "=v"(tR_fragment[1])
: "v"(tR_fragment[1]), "v"(tA_fragment[1]));
asm volatile("v_pk_add_f16 %0, %1, %2"
: "=v"(tR_fragment[2])
: "v"(tR_fragment[2]), "v"(tA_fragment[2]));
asm volatile("v_pk_add_f16 %0, %1, %2"
: "=v"(tR_fragment[3])
: "v"(tR_fragment[3]), "v"(tA_fragment[3]));
}
// bfloat16: view each atom as four nv_bfloat162 pairs and add them with the
// __hadd2 intrinsic instead of inline assembly.
template <>
__quickreduce_device_inline__ void packed_assign_add<nv_bfloat16>(
int32x4_t* A, int32x4_t* B) {
nv_bfloat162* tA = reinterpret_cast<nv_bfloat162*>(A);
nv_bfloat162* tB = reinterpret_cast<nv_bfloat162*>(B);
#pragma unroll
for (int i = 0; i < 4; i++) {
tA[i] = __hadd2(tA[i], tB[i]);
}
}
// Lane-wise maximum of two 32-bit words, each holding a pair of 16-bit
// floating-point values of type T. The result is returned in the same packed
// form. Specialized for half and nv_bfloat16 only.
template <typename T>
__quickreduce_device_inline__ int packed_max(int a, int b);
// fp16: a single packed-max instruction.
template <>
__quickreduce_device_inline__ int packed_max<half>(int a, int b) {
int result;
asm volatile("v_pk_max_f16 %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
return result;
}
// bfloat16: reinterpret through bf162_int_union and use the __hmax2 intrinsic.
template <>
__quickreduce_device_inline__ int packed_max<nv_bfloat16>(int a, int b) {
bf162_int_union A, B, R;
A.i = a;
B.i = b;
R.bf2 = __hmax2(A.bf2, B.bf2);
return R.i;
}
// Lane-wise minimum of two 32-bit words, each holding a pair of 16-bit
// floating-point values of type T. The result is returned in the same packed
// form. Specialized for half and nv_bfloat16 only.
template <typename T>
__quickreduce_device_inline__ int packed_min(int a, int b);
// fp16: a single packed-min instruction.
template <>
__quickreduce_device_inline__ int packed_min<half>(int a, int b) {
int result;
asm volatile("v_pk_min_f16 %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
return result;
}
// bfloat16: reinterpret through bf162_int_union and use the __hmin2 intrinsic.
template <>
__quickreduce_device_inline__ int packed_min<nv_bfloat16>(int a, int b) {
bf162_int_union A, B, R;
A.i = a;
B.i = b;
R.bf2 = __hmin2(A.bf2, B.bf2);
return R.i;
}
// Per lane (x and y), selects whichever of `a` or `b` has the strictly larger
// magnitude and returns it with its original sign, not its absolute value.
// When the magnitudes compare equal, or the comparison is false for any other
// reason, the lane from `b` is taken.
template <typename T>
__quickreduce_device_inline__ int packed_abs_max(int a, int b);
// fp16 specialization. The local names reflect the call in group_abs_max,
// which passes the running packed maximum as `a` and the running packed
// minimum as `b`.
template <>
__quickreduce_device_inline__ int packed_abs_max<half>(int a, int b) {
half2 wmaxh2 = __builtin_bit_cast(half2, a);
half2 wminh2 = __builtin_bit_cast(half2, b);
half2 wblockmaxh2;
wblockmaxh2.x =
__hgt(__habs(wmaxh2.x), __habs(wminh2.x)) ? wmaxh2.x : wminh2.x;
wblockmaxh2.y =
__hgt(__habs(wmaxh2.y), __habs(wminh2.y)) ? wmaxh2.y : wminh2.y;
return __builtin_bit_cast(int, wblockmaxh2);
}
// bfloat16 specialization: same selection rule, using bf162_int_union for the
// int <-> nv_bfloat162 reinterpretation.
template <>
__quickreduce_device_inline__ int packed_abs_max<nv_bfloat16>(int a, int b) {
bf162_int_union A, B, R;
A.i = a;
B.i = b;
R.bf2.x = __hgt(__habs(A.bf2.x), __habs(B.bf2.x)) ? A.bf2.x : B.bf2.x;
R.bf2.y = __hgt(__habs(A.bf2.y), __habs(B.bf2.y)) ? A.bf2.y : B.bf2.y;
return R.i;
}
// Lane-wise sum of two 32-bit words, each holding a pair of 16-bit values of
// type T. The result is returned in the same packed form. Specialized for
// half, nv_bfloat16 and int16_t.
template <typename T>
__quickreduce_device_inline__ int packed_add(int a, int b);
// fp16: a single packed floating-point add instruction.
template <>
__quickreduce_device_inline__ int packed_add<half>(int a, int b) {
int result;
asm volatile("v_pk_add_f16 %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
return result;
}
// bfloat16: reinterpret through bf162_int_union and use the __hadd2 intrinsic.
template <>
__quickreduce_device_inline__ int packed_add<nv_bfloat16>(int a, int b) {
bf162_int_union A, B, R;
A.i = a;
B.i = b;
R.bf2 = __hadd2(A.bf2, B.bf2);
return R.i;
}
// int16: packed 16-bit integer add on both halves of the word.
template <>
__quickreduce_device_inline__ int packed_add<int16_t>(int a, int b) {
int result;
asm volatile("v_pk_add_i16 %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
return result;
}
// Lane-wise difference a - b of two 32-bit words, each holding a pair of
// 16-bit floating-point values of type T. The result is returned in the same
// packed form. Specialized for half and nv_bfloat16 only.
template <typename T>
__quickreduce_device_inline__ int packed_sub(int a, int b);

// fp16 specialization.
template <>
__quickreduce_device_inline__ int packed_sub<half>(int a, int b) {
  int result;

  // MI300 lacks a packed fp16 subtract instruction, so a - b is computed as
  // the packed fused multiply-add (-1 * b) + a, with kNegOne supplying the
  // {-1, -1} multiplier. All three source operands are comma-separated, as in
  // the canonical assembler syntax.
  asm volatile("v_pk_fma_f16 %0, %1, %2, %3"
               : "=v"(result)
               : "v"(kNegOne), "v"(b), "v"(a));
  return result;
}

// bfloat16 specialization: reinterpret through bf162_int_union and use the
// __hsub2 intrinsic.
template <>
__quickreduce_device_inline__ int packed_sub<nv_bfloat16>(int a, int b) {
  bf162_int_union A, B, R;
  A.i = a;
  B.i = b;
  R.bf2 = __hsub2(A.bf2, B.bf2);
  return R.i;
}
// Lane-wise product of two 32-bit words, each holding a pair of 16-bit
// floating-point values of type T. The result is returned in the same packed
// form. Specialized for half and nv_bfloat16 only.
template <typename T>
__quickreduce_device_inline__ int packed_mul(int a, int b);

// fp16: a single packed multiply instruction.
template <>
__quickreduce_device_inline__ int packed_mul<half>(int a, int b) {
  int result;
  asm volatile("v_pk_mul_f16 %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
  return result;
}

// bfloat16: reinterpret through bf162_int_union, like the other bfloat16
// helpers in this header, instead of casting pointers to the int arguments
// (which reads an int object through an unrelated pointer type).
template <>
__quickreduce_device_inline__ int packed_mul<nv_bfloat16>(int a, int b) {
  bf162_int_union A, B, R;
  A.i = a;
  B.i = b;
  R.bf2 = __hmul2(A.bf2, B.bf2);
  return R.i;
}
// Lane-wise reciprocal of a 32-bit word holding a pair of 16-bit
// floating-point values of type T. The result is returned in the same packed
// form. Specialized for half and nv_bfloat16 only.
template <typename T>
__quickreduce_device_inline__ int packed_rcp(int a);
// fp16: bit-cast to half2, apply h2rcp, bit-cast back.
template <>
__quickreduce_device_inline__ int packed_rcp<half>(int a) {
return __builtin_bit_cast(int, h2rcp(__builtin_bit_cast(half2, a)));
}
// bfloat16: same operation, reinterpreting through bf162_int_union.
template <>
__quickreduce_device_inline__ int packed_rcp<nv_bfloat16>(int a) {
bf162_int_union A, R;
A.i = a;
R.bf2 = h2rcp(A.bf2);
return R.i;
}
// Widening conversions from the 16-bit storage types to float. The overload
// set lets templated code convert either type through one name.
__quickreduce_device_inline__ float T2float_cast(half a) {
  const float widened = __half2float(a);
  return widened;
}

__quickreduce_device_inline__ float T2float_cast(nv_bfloat16 a) {
  const float widened = __bfloat162float(a);
  return widened;
}
// Returns, for each of the two packed 16-bit lanes, the largest-magnitude
// value (sign preserved) across the atoms held by a group of kThreadGroupSize
// consecutive threads. Every thread of the group receives the same result.
//
// Each thread first reduces its own atom to a packed max and a packed min,
// the group then combines those with shuffles, and packed_abs_max finally
// picks whichever of max/min has the larger magnitude per lane.
template <typename T>
__quickreduce_device_inline__ int group_abs_max(int32x4_t atom) {
// First thread index of this thread's group.
// NOTE(review): this value is later used as the source lane for __shfl, which
// assumes threadIdx.x equals the lane index within the wavefront -- confirm
// for every launch configuration that uses this helper.
const int group_leader = (threadIdx.x / kThreadGroupSize) * kThreadGroupSize;
int wmax, wmin, wblockmax;
int a, b;
// Per-thread packed max over the four words of the atom.
a = packed_max<T>(atom[0], atom[1]);
b = packed_max<T>(atom[2], atom[3]);
wmax = packed_max<T>(a, b);
// Per-thread packed min over the four words of the atom.
a = packed_min<T>(atom[0], atom[1]);
b = packed_min<T>(atom[2], atom[3]);
wmin = packed_min<T>(a, b);
// Reduce the max among a group of threads
// Note: This is basically 2 blocks of values setup as the
// upper/lower halves of the f16x2_t
// Shuffle distances double each step (1, 2, 4 for a group of 8), so after
// the loop the group leader holds the reduction over the whole group.
for (int i = 1; i < kThreadGroupSize; i <<= 1) {
int x = __shfl_down(wmax, i);
wmax = packed_max<T>(wmax, x);
int y = __shfl_down(wmin, i);
wmin = packed_min<T>(wmin, y);
}
wblockmax = packed_abs_max<T>(wmax, wmin);
// Share with the cohort
// Only the leader's value covers the full group, so broadcast it.
wblockmax = __shfl(wblockmax, group_leader);
return wblockmax;
}
// Publishes `flag` to `*flag_ptr` with an atomic store using release
// ordering. The counterpart that polls the flag is wait_sync_flag().
__quickreduce_device_inline__ void set_sync_flag(uint32_t* flag_ptr,
uint32_t flag) {
__atomic_store_n(flag_ptr, flag, __ATOMIC_RELEASE);
}
// Spins until `*flag_ptr` equals `flag`. The flag is polled with relaxed
// atomic loads; there is no timeout or back-off.
__quickreduce_device_inline__ void wait_sync_flag(uint32_t* flag_ptr,
                                                  uint32_t flag) {
  for (;;) {
    const uint32_t observed = __atomic_load_n(flag_ptr, __ATOMIC_RELAXED);
    if (observed == flag) {
      return;
    }
  }
}
} // namespace quickreduce

View File

@ -0,0 +1,196 @@
#pragma once
#include <vector>
#include <hip/hip_runtime.h>
#include "quick_reduce_impl.cuh"
// Evaluates the HIP call `err` exactly once. If the result is not hipSuccess,
// prints the numeric error code, the source location (__FILE__:__LINE__) and
// the hipGetErrorString() text with std::printf, then throws
// std::runtime_error("HIP error"). Wrapped in do { } while (0) so it behaves
// as a single statement.
#define HIP_CHECK(err) \
do { \
hipError_t err_ = (err); \
if (err_ != hipSuccess) { \
std::printf("HIP error %d at %s:%d. %s\n", err_, __FILE__, __LINE__, \
hipGetErrorString(err_)); \
throw std::runtime_error("HIP error"); \
} \
} while (0)
namespace quickreduce {
// Integer type used to carry a pointer value as a plain 64-bit integer; the
// static_assert guarantees it is exactly pointer-sized on this platform.
using fptr_t = int64_t;
static_assert(sizeof(void*) == sizeof(fptr_t));
// Kernel entry point for the two-shot all-reduce. Each workgroup starts at
// tile blockIdx.x and advances by gridDim.x until all `num_blocks` tiles are
// processed, delegating every tile to AllReduceKernel::run. The flag color is
// bumped after each tile so successive tiles handled by the same workgroup
// use distinct synchronization values.
template <typename AllReduceKernel, typename T>
__global__ __quickreduce_launch_bounds_two_shot__ static void
allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
                            int rank, uint8_t** dbuffer_list,
                            uint32_t data_offset, uint32_t flag_color) {
  const int stride = gridDim.x;
  for (int tile = blockIdx.x; tile < num_blocks; tile += stride) {
    AllReduceKernel::run(A, B, N, tile, rank, dbuffer_list, data_offset,
                         flag_color);
    ++flag_color;
  }
}
// Launches allreduce_prototype_twoshot for the codec template `__codec`,
// instantiated with the compile-time world size that matches the runtime
// `world_size` (2, 4 or 8). Any other world size expands to no launch at all.
// The macro must be expanded in a scope that provides: T, cast_bf2half,
// world_size, grid, stream, A, B, N, num_blocks, rank, dbuffer_list,
// data_offset and flag_color.
#define TWOSHOT_DISPATCH(__codec) \
if (world_size == 2) { \
using LineCodec = __codec<T, 2>; \
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
num_blocks, rank, dbuffer_list, data_offset, \
flag_color); \
} else if (world_size == 4) { \
using LineCodec = __codec<T, 4>; \
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
num_blocks, rank, dbuffer_list, data_offset, \
flag_color); \
} else if (world_size == 8) { \
using LineCodec = __codec<T, 8>; \
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
num_blocks, rank, dbuffer_list, data_offset, \
flag_color); \
}
// Quantization level used for the data exchanged between ranks. The integer
// values are part of the interface: DeviceComms::allreduce receives the level
// as a plain int and casts it to this enum.
enum QuickReduceQuantLevel {
// Full precision; dispatched to CodecFP (also the fallback for unknown values).
F16 = 0,
// Block-scaled 8-bit quantization; dispatched to CodecQ8.
INT8 = 1,
// Block-scaled 6-bit quantization; dispatched to CodecQ6.
INT6 = 2,
// Block-scaled 4-bit quantization; dispatched to CodecQ4.
INT4 = 3,
};
struct DeviceComms {
// Max problem size is 2GB (in bytes) or half of uint32_t max value.
int64_t kMaxProblemSize =
static_cast<int64_t>(std::numeric_limits<int32_t>::max()) + 1;
// Max TP-8
static int constexpr kMaxWorldSize = 8;
bool initialized = false;
uint32_t flag_color = 1;
int world_size;
int rank;
uint8_t* dbuffer;
uint8_t** dbuffer_list;
hipIpcMemHandle_t buffer_ipc_handle;
std::vector<hipIpcMemHandle_t> all_buffer_ipc_handles;
std::vector<uint8_t*> buffer_list;
uint32_t data_offset;
DeviceComms() : initialized(false), world_size(1), rank(0) {}
~DeviceComms() { destroy(); }
void init(int world_size, int rank,
std::optional<int64_t> max_problem_size = std::nullopt) {
destroy();
this->world_size = world_size;
this->rank = rank;
if (max_problem_size.has_value() && max_problem_size.value() > 0) {
this->kMaxProblemSize = max_problem_size.value();
}
// Allocate buffer size for worst case: F16 2-stage buffer.
uint32_t flags_buffer_size =
2 * world_size * kMaxNumBlocks * sizeof(uint32_t);
static int64_t data_buffer_size = 2 * this->kMaxProblemSize;
int64_t total_buffer_size = flags_buffer_size + data_buffer_size;
data_offset = flags_buffer_size;
HIP_CHECK(hipExtMallocWithFlags((void**)&dbuffer, total_buffer_size,
hipDeviceMallocUncached));
// Clear the flags buffer.
HIP_CHECK(hipMemset(dbuffer, 0, flags_buffer_size));
// Device-side list of IPC buffers.
buffer_list.resize(world_size);
HIP_CHECK(hipMalloc(&dbuffer_list, world_size * sizeof(uint8_t*)));
// Create IPC handles for rank's communication buffer.
all_buffer_ipc_handles.resize(world_size);
HIP_CHECK(hipIpcGetMemHandle(&buffer_ipc_handle, dbuffer));
initialized = true;
}
int get_world_size() { return world_size; }
int get_rank() { return rank; }
bool status() { return initialized; }
hipIpcMemHandle_t const get_handle() { return buffer_ipc_handle; }
void destroy() {
if (initialized) {
for (int i = 0; i < world_size; i++) {
if (i != rank) {
HIP_CHECK(hipIpcCloseMemHandle(dbuffer_list[i]));
}
}
HIP_CHECK(hipFree(dbuffer));
HIP_CHECK(hipFree(dbuffer_list));
initialized = false;
}
}
void open_ipc_handles(std::vector<hipIpcMemHandle_t> const& ipc_handles) {
assert(ipc_handles.size() == all_buffer_ipc_handles.size());
for (int i = 0; i < world_size; i++) {
all_buffer_ipc_handles[i] = ipc_handles[i];
}
// Open device memory access to the IPC communication buffers.
// Note: For our own rank, we do not need to open a handle.
for (int i = 0; i < world_size; i++) {
if (i != rank) {
HIP_CHECK(hipIpcOpenMemHandle((void**)&buffer_list[i],
all_buffer_ipc_handles[i],
hipIpcMemLazyEnablePeerAccess));
} else {
buffer_list[i] = dbuffer;
}
}
HIP_CHECK(hipMemcpy(dbuffer_list, buffer_list.data(),
world_size * sizeof(uint8_t*), hipMemcpyHostToDevice));
}
template <typename T, bool cast_bf2half>
void allreduce(T const* A, T* B, uint32_t N, int quant_level,
hipStream_t stream) {
if (world_size != 2 && world_size != 4 && world_size != 8) {
throw std::runtime_error("All Reduce not supported for world_size = " +
std::to_string(world_size));
}
// Configuration.
uint32_t msg_size = N * sizeof(T);
uint32_t num_blocks = divceil(msg_size, kTileSize);
uint32_t grid = min(kMaxNumBlocks, num_blocks);
auto quant_level_ = static_cast<QuickReduceQuantLevel>(quant_level);
switch (quant_level_) {
case QuickReduceQuantLevel::INT8:
TWOSHOT_DISPATCH(CodecQ8)
break;
case QuickReduceQuantLevel::INT6:
TWOSHOT_DISPATCH(CodecQ6)
break;
case QuickReduceQuantLevel::INT4:
TWOSHOT_DISPATCH(CodecQ4)
break;
default:
TWOSHOT_DISPATCH(CodecFP)
break;
}
HIP_CHECK(cudaGetLastError());
// Rotate the flag color.
flag_color += divceil(N, grid);
}
};
} // namespace quickreduce

View File

@ -0,0 +1,698 @@
#pragma once
#include <hip/hip_runtime.h>
#include "base.h"
namespace quickreduce {
// State shared by every codec: the calling thread's index, the rank, and the
// first lane of the thread group the calling lane belongs to.
struct CodecBase {
// Thread index supplied by the caller; codecs use it to address buffers.
const int thread;
// Rank supplied by the caller.
const int rank;
// threadIdx.x rounded down to a multiple of kThreadGroupSize.
const int group_leader;
__quickreduce_device_inline__ CodecBase(int thread, int rank)
: thread(thread),
rank(rank),
group_leader((threadIdx.x / kThreadGroupSize) * kThreadGroupSize) {
// NOTE(review): set_fp16_ovfl is defined elsewhere; presumably it enables
// the hardware fp16 overflow mode for the packed math below - confirm.
set_fp16_ovfl(true);
}
};
// Full-precision codec (the default): atoms are copied to and from the
// communication buffer unchanged, without any quantization.
template <typename T, int world_size>
struct CodecFP : public CodecBase {
  static constexpr int kWorldSize = world_size;
  // Number of atoms each thread exchanges with a single rank.
  static constexpr int kRankAtoms = kAtoms / kWorldSize;

  // Bytes this workgroup transmits to one rank: kBlockSize entries of
  // kRankAtoms atoms, each atom being one int32x4_t (16B).
  static constexpr int kRankTransmittedTileSize =
      kBlockSize * kRankAtoms * sizeof(int32x4_t);
  static_assert(kRankTransmittedTileSize % 16 == 0,
                "kRankTransmittedTileSize must be 16B aligned.");

  // Bytes transmitted for the whole collective (all ranks).
  static constexpr int kTransmittedTileSize =
      kRankTransmittedTileSize * kWorldSize;

  __quickreduce_device_inline__ CodecFP(int thread, int rank)
      : CodecBase(thread, rank) {}

  // Stores kRankAtoms atoms from `data` into `send_buffer`. Atom i lands at
  // element index thread + i * kAtomStride; stores are non-temporal.
  __quickreduce_device_inline__ void send(int32x4_t* __restrict__ send_buffer,
                                          const int32x4_t* __restrict__ data) {
    int32x4_t* dst = send_buffer + thread;
    for (int atom = 0; atom < kRankAtoms; ++atom, dst += kAtomStride) {
      __builtin_nontemporal_store(data[atom], dst);
    }
  }

  // Loads kRankAtoms atoms into `data` with the same addressing as send()
  // and leaves `*recv_buffer` advanced past the atoms that were read.
  __quickreduce_device_inline__ void recv(int32x4_t** __restrict__ recv_buffer,
                                          int32x4_t* __restrict__ data) {
    int32x4_t* src = *recv_buffer;
    for (int atom = 0; atom < kRankAtoms; ++atom, src += kAtomStride) {
      data[atom] = __builtin_nontemporal_load(src + thread);
    }
    *recv_buffer = src;
  }
};
// Int4 symmetric quantization codec.
// We quantize the FP16 data to block-scaled Int4 in blocks of 4 *
// kThreadGroupSize.
//
// Wire layout of one quantized atom (kRankTileStride = 1152 bytes):
//   [0, 1024)    one int32 per thread holding 8 packed 4-bit values
//   [1024, 1152) one int per `thread / 8` holding the packed decoding scale
template <typename T, int world_size>
struct CodecQ4 : public CodecBase {
static constexpr int kWorldSize = world_size;
// Codec tile size process by this workgroup.
// Each threads processes a fragment of fp16x8_t (16B),
// into a int4x8_t (4B) and a fp16 scale shared among 32 values.
static constexpr int kRankAtoms = kAtoms / kWorldSize;
// Bytes occupied by one quantized atom (values followed by scales).
static constexpr int kRankTileStride = 1152;
// Byte offset of the scales within one quantized atom.
static constexpr int kRankTileScaleOffset = 1024;
static constexpr int kRankTransmittedTileSize = kRankTileStride * kRankAtoms;
static_assert(kRankTransmittedTileSize % 16 == 0,
"kRankTransmittedTileSize must be 16B aligned.");
// kRankTileStride expressed in int32x4_t elements, for pointer arithmetic.
static constexpr int kRankBufferTileStride =
kRankTileStride / sizeof(int32x4_t);
// Total tile size for the collective communication.
static constexpr int kTransmittedTileSize =
kRankTransmittedTileSize * kWorldSize;
// Constants configuration
// Each constant is a packed pair; the first literal is the fp16 encoding
// (T == half) and the second the bf16 encoding of the same value.
// {-1/8.0h, -1/8.0h}, f16x2_t
static constexpr int kScaleFactor =
std::is_same<T, half>::value ? 0xB000B000 : 0xBE00BE00;
// {1e-7, 1e-7}, f16x2_t
static constexpr int kScaleEpsilon =
std::is_same<T, half>::value ? 0x00010001 : 0x33D733D7;
// {-8, -8}, f16x2_t
static constexpr int kRangeMin =
std::is_same<T, half>::value ? 0xC800C800 : 0xC100C100;
// {+7, +7}, f16x2_t
static constexpr int kRangeMax =
std::is_same<T, half>::value ? 0x47004700 : 0x40E040E0;
// {+8, +8}, int16x2_t
static constexpr int kRangeBias = 0x00080008;
__quickreduce_device_inline__ CodecQ4(int thread, int rank)
: CodecBase(thread, rank) {}
// Quantizes kRankAtoms atoms from `data` to 4 bits per value and writes them,
// together with the per-group decoding scale, into `send_buffer`.
__quickreduce_device_inline__ void send(int32x4_t* __restrict__ send_buffer,
const int32x4_t* __restrict__ data) {
for (int k = 0; k < kRankAtoms; k++) {
int32x4_t const atom = data[k];
// Compute the absolute maximum of the atom in the thread group
// In 2 blocks of values, upper/lower halves of the f16x2_t
int wblockmax = group_abs_max<T>(atom);
// Derive scales
// decoding_scale = absmax * kScaleFactor; encoding_scale is the reciprocal
// of (decoding_scale + epsilon), the epsilon keeping the divisor non-zero.
int decoding_scale;
int encoding_scale;
decoding_scale = packed_mul<T>(wblockmax, kScaleFactor);
encoding_scale = packed_add<T>(decoding_scale, kScaleEpsilon);
encoding_scale = packed_rcp<T>(encoding_scale);
// Apply scales to get quantized values
// (scaled values are clamped to [kRangeMin, kRangeMax] = [-8, +7])
int32x4_t w;
for (int i = 0; i < 4; i++) {
w[i] = packed_mul<T>(atom[i], encoding_scale);
w[i] = packed_max<T>(w[i], kRangeMin);
w[i] = packed_min<T>(w[i], kRangeMax);
}
// Convert from f16x2_t to uint16x2_t
// Each of the 8 values is rounded with rintf to a 16-bit integer, then
// kRangeBias (+8) is added to every 16-bit lane.
int32x4_t q;
{
int16_t* qi = reinterpret_cast<int16_t*>(&q);
T* wh = reinterpret_cast<T*>(&w);
for (int i = 0; i < 8; i++) qi[i] = (int16_t)rintf(T2float_cast(wh[i]));
for (int i = 0; i < 4; i++) {
q[i] = packed_add<int16_t>(q[i], kRangeBias);
}
}
// Pack 8 x q4 into int32_t
// q[i] contributes one nibble to each 16-bit half, at bit offset 4 * i.
int qw = q[0] | (q[1] << 4) | (q[2] << 8) | (q[3] << 12);
// Write quantized atom to send_buffer
// note: only the group leader stores the scale
uint8_t* atom_ptr =
reinterpret_cast<uint8_t*>(send_buffer + k * kRankBufferTileStride);
int32_t* qw_ptr = reinterpret_cast<int32_t*>(atom_ptr) + thread;
int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) +
(thread / 8);
__builtin_nontemporal_store(qw, qw_ptr);
if (threadIdx.x == group_leader) {
__builtin_nontemporal_store(decoding_scale, qs_ptr);
}
}
}
// Reads kRankAtoms quantized atoms from `*recv_buffer`, dequantizes them into
// `data`, and advances `*recv_buffer` by one quantized atom per iteration.
__quickreduce_device_inline__ void recv(int32x4_t** __restrict__ recv_buffer,
int32x4_t* __restrict__ data) {
for (int k = 0; k < kRankAtoms; k++) {
// Directly read quantized atom from recv_buffer
uint8_t* atom_ptr = reinterpret_cast<uint8_t*>(*recv_buffer);
int32_t* qw_ptr = reinterpret_cast<int32_t*>(atom_ptr) + thread;
int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) +
(thread / 8);
int32_t qw = __builtin_nontemporal_load(qw_ptr);
int qs = __builtin_nontemporal_load(qs_ptr);
*recv_buffer += kRankBufferTileStride;
// Unpack q4 into f16x8_t
int32x4_t w;
{
static constexpr uint kMask000F = 0x000F000F;
static constexpr uint kHalf2_1024 =
0x64006400; // {1024.0, 1024.0}, fp16x2_t
static uint constexpr kHalf2_1032 =
0xE408E408; // {-1032.0, -1032.0}, fp16x2_t
for (int i = 0; i < 4; i++) {
if constexpr (std::is_same<T, half>::value) {
// fp16 path: OR the nibble into the bits of 1024.0 (giving 1024 + q),
// then add -1032 to obtain q - 8 without an int-to-float conversion.
int32_t q4 = ((qw >> (i * 4)) & kMask000F) | kHalf2_1024;
w[i] = packed_add<half>(q4, kHalf2_1032);
} else {
// bf16 path: convert both 16-bit integers through float to bf16, then
// add kRangeMin (-8) to remove the bias applied in send().
int32_t int16_2 = (qw >> (i * 4)) & kMask000F;
int16_t low = static_cast<int16_t>(int16_2 & 0xFFFF);
int16_t high = static_cast<int16_t>((int16_2 >> 16) & 0xFFFF);
nv_bfloat16 bf_low = __float2bfloat16(static_cast<float>(low));
nv_bfloat16 bf_high = __float2bfloat16(static_cast<float>(high));
nv_bfloat162 bf2 = __halves2bfloat162(bf_low, bf_high);
int32_t packed_bf16 = *reinterpret_cast<int32_t*>(&bf2);
w[i] = packed_add<nv_bfloat16>(packed_bf16, kRangeMin);
}
}
}
// Apply decoding scales
for (int i = 0; i < 4; i++) {
w[i] = packed_mul<T>(w[i], qs);
}
data[k] = w;
}
}
};
// Int6 symmetric quantization codec.
// We quantize the FP16 data to block-scaled Int6 in blocks of 4 *
// kThreadGroupSize.
//
// Wire layout of one quantized atom (kRankTileStride = 1664 bytes):
//   [0, 1024)    one uint32 per thread: the low 4 bits of the 8 values
//   [1024, 1536) one uint16 per thread: the high 2 bits of the 8 values
//   [1536, 1664) one int per `thread / 8` holding the packed decoding scale
template <typename T, int world_size>
struct CodecQ6 : public CodecBase {
static constexpr int kWorldSize = world_size;
// Codec tile size process by this workgroup.
// Each threads processes a fragment of fp16x8_t (16B),
// into a int6x8_t (4B + 2B) and a fp16 scale shared among 32 values.
static constexpr int kRankAtoms = kAtoms / kWorldSize;
// Bytes occupied by one quantized atom (4-bit part, 2-bit part, scales).
static constexpr int kRankTileStride = 1664;
// Byte offset of the 2-bit parts within one quantized atom.
static constexpr int kRankTileQ2Offset = 1024;
// Byte offset of the scales within one quantized atom.
static constexpr int kRankTileScaleOffset = 1536;
static constexpr int kRankTransmittedTileSize = kRankTileStride * kRankAtoms;
static_assert(kRankTransmittedTileSize % 16 == 0,
"kRankTransmittedTileSize must be 16B aligned.");
// kRankTileStride expressed in int32x4_t elements, for pointer arithmetic.
static constexpr int kRankBufferTileStride =
kRankTileStride / sizeof(int32x4_t);
// Total tile size for the collective communication.
static constexpr int kTransmittedTileSize =
kRankTransmittedTileSize * kWorldSize;
// Constants configuration
// Each constant is a packed pair; the first literal is the fp16 encoding
// (T == half) and the second the bf16 encoding of the same value.
// {-1/32.0h, -1/32.0h}, fp16x2_t
static constexpr int kScaleFactor =
std::is_same<T, half>::value ? 0xA800A800 : 0xBD00BD00;
// {1e-7, 1e-7}, fp16x2_t
static constexpr int kScaleEpsilon =
std::is_same<T, half>::value ? 0x00010001 : 0x33D733D7;
// {-32, -32}, fp16x2_t
static constexpr int kRangeMin =
std::is_same<T, half>::value ? 0xD000D000 : 0xC200C200;
// {+31, +31}, fp16x2_t
static constexpr int kRangeMax =
std::is_same<T, half>::value ? 0x4FC04FC0 : 0x41F841F8;
// {+32, +32}, int16x2_t
static constexpr int kRangeBias = 0x00200020;
__quickreduce_device_inline__ CodecQ6(int thread, int rank)
: CodecBase(thread, rank) {}
// Quantizes kRankAtoms atoms from `data` to 6 bits per value and writes them,
// together with the per-group decoding scale, into `send_buffer`.
__quickreduce_device_inline__ void send(int32x4_t* __restrict__ send_buffer,
const int32x4_t* __restrict__ data) {
for (int k = 0; k < kRankAtoms; k++) {
int32x4_t const atom = data[k];
// Compute the absolute maximum of the atom in the thread group
// In 2 blocks of values, upper/lower halves of the f16x2_t
int wblockmax = group_abs_max<T>(atom);
// Derive scales
// decoding_scale = absmax * kScaleFactor; encoding_scale is the reciprocal
// of (decoding_scale + epsilon), the epsilon keeping the divisor non-zero.
int decoding_scale;
int encoding_scale;
decoding_scale = packed_mul<T>(wblockmax, kScaleFactor);
encoding_scale = packed_add<T>(decoding_scale, kScaleEpsilon);
encoding_scale = packed_rcp<T>(encoding_scale);
// Apply scales to get quantized values
// (scaled values are clamped to [kRangeMin, kRangeMax] = [-32, +31])
int32x4_t w;
for (int i = 0; i < 4; i++) {
w[i] = packed_mul<T>(atom[i], encoding_scale);
w[i] = packed_max<T>(w[i], kRangeMin);
w[i] = packed_min<T>(w[i], kRangeMax);
}
// Convert from f16x2_t to uint16x2_t
// Each of the 8 values is rounded with rintf to a 16-bit integer, then
// kRangeBias (+32) is added to every 16-bit lane.
int32x4_t q;
{
int16_t* qi = reinterpret_cast<int16_t*>(&q);
T* wh = reinterpret_cast<T*>(&w);
for (int i = 0; i < 8; i++) qi[i] = (int16_t)rintf(T2float_cast(wh[i]));
for (int i = 0; i < 4; i++) {
q[i] = packed_add<int16_t>(q[i], kRangeBias);
}
}
// Pack 8 x q6 into int32_t + int16_t
// q4w: low nibble of every value, q[i] placed at bit offset 4 * i of each
// 16-bit half. q2w: bits 4-5 of value i placed at bit offset 2 * i.
uint32_t q4w;
uint16_t q2w = 0;
q4w = (q[0] & 0x000F000F) | ((q[1] & 0x000F000F) << 4) |
((q[2] & 0x000F000F) << 8) | ((q[3] & 0x000F000F) << 12);
{
int16_t* tw = reinterpret_cast<int16_t*>(&q);
#pragma unroll
for (int i = 0; i < 8; i++) {
q2w |= (tw[i] >> 4) << (i * 2);
}
}
// Write quantized atom to send_buffer
// note: only the group leader stores the scale
uint8_t* atom_ptr =
reinterpret_cast<uint8_t*>(send_buffer + k * kRankBufferTileStride);
uint32_t* q4w_ptr = reinterpret_cast<uint32_t*>(atom_ptr) + thread;
uint16_t* q2w_ptr =
reinterpret_cast<uint16_t*>(atom_ptr + kRankTileQ2Offset) + thread;
int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) +
(thread / 8);
__builtin_nontemporal_store(q4w, q4w_ptr);
__builtin_nontemporal_store(q2w, q2w_ptr);
if (threadIdx.x == group_leader) {
__builtin_nontemporal_store(decoding_scale, qs_ptr);
}
}
}
// Reads kRankAtoms quantized atoms from `*recv_buffer`, dequantizes them into
// `data`, and advances `*recv_buffer` by one quantized atom per iteration.
__quickreduce_device_inline__ void recv(int32x4_t** __restrict__ recv_buffer,
int32x4_t* __restrict__ data) {
for (int k = 0; k < kRankAtoms; k++) {
// Directly read quantized atom from recv_buffer
uint8_t* atom_ptr = reinterpret_cast<uint8_t*>(*recv_buffer);
uint32_t* q4w_ptr = reinterpret_cast<uint32_t*>(atom_ptr) + thread;
uint16_t* q2w_ptr =
reinterpret_cast<uint16_t*>(atom_ptr + kRankTileQ2Offset) + thread;
int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) +
(thread / 8);
uint32_t q4w = __builtin_nontemporal_load(q4w_ptr);
uint16_t q2w = __builtin_nontemporal_load(q2w_ptr);
int qs = __builtin_nontemporal_load(qs_ptr);
*recv_buffer += kRankBufferTileStride;
// Unpack q6 into fp16x8_t
int32x4_t w;
{
static uint constexpr kMask000F = 0x000F000F;
static uint constexpr kHalf2_1024 =
0x64006400; // {1024.0, 1024.0}, fp16x2_t
static uint constexpr kHalf2_1056 =
0xE420E420; // {-1056.0, -1056.0}, fp16x2_t
#pragma unroll
for (int i = 0; i < 4; i++) {
// Take the current low nibbles and the matching 2-bit parts: bits 0-1 of
// q2w belong to the lower 16-bit half, bits 2-3 are moved up to bit 16
// for the upper half. Both words are then shifted to the next value pair.
int32_t q4 = q4w & kMask000F;
int32_t q2 = (q2w & 0x3) | ((q2w & 0xC) << 14);
q4w >>= 4;
q2w >>= 4;
if constexpr (std::is_same<T, half>::value) {
// fp16 path: OR the 6-bit value into the bits of 1024.0 (giving
// 1024 + q), then add -1056 to obtain q - 32.
int32_t q6 = q4 | (q2 << 4) | kHalf2_1024;
asm volatile("v_pk_add_f16 %0, %1, %2"
: "=v"(w[i])
: "v"(q6), "v"(kHalf2_1056));
} else {
// bf16 path: convert both 16-bit integers through float to bf16, then
// add kRangeMin (-32) to remove the bias applied in send().
int32_t int16_2 = q4 | (q2 << 4);
int16_t low = static_cast<int16_t>(int16_2 & 0xFFFF);
int16_t high = static_cast<int16_t>((int16_2 >> 16) & 0xFFFF);
nv_bfloat16 bf_low = __float2bfloat16(static_cast<float>(low));
nv_bfloat16 bf_high = __float2bfloat16(static_cast<float>(high));
nv_bfloat162 bf2 = __halves2bfloat162(bf_low, bf_high);
int32_t packed_bf16 = *reinterpret_cast<int32_t*>(&bf2);
w[i] = packed_add<nv_bfloat16>(packed_bf16, kRangeMin);
}
}
}
// Apply decoding scales
for (int i = 0; i < 4; i++) {
w[i] = packed_mul<T>(w[i], qs);
}
// That's pretty much it...
data[k] = w;
}
}
};
// Int8 symmetric quantization codec.
// We quantize the FP16 data to block-scaled Int8 in blocks of 4 *
// kThreadGroupSize.
//
// Wire layout of one quantized atom (kRankTileStride = 2176 bytes):
//   [0, 2048)    one int32x2_t per thread holding 8 packed 8-bit values
//   [2048, 2176) one int per `thread / 8` holding the packed decoding scale
template <typename T, int world_size>
struct CodecQ8 : public CodecBase {
  static constexpr int kWorldSize = world_size;

  // Codec tile size process by this workgroup.
  // Each threads processes a fragment of f16x8_t (16B),
  // into a int8x8_t (8B) and a f16 scale shared among 32 values.
  static constexpr int kRankAtoms = kAtoms / kWorldSize;
  // Bytes occupied by one quantized atom (values followed by scales).
  static constexpr int kRankTileStride = 2176;
  // Byte offset of the scales within one quantized atom.
  static constexpr int kRankTileScaleOffset = 2048;
  static constexpr int kRankTransmittedTileSize = kRankTileStride * kRankAtoms;
  // The message names the constant that is actually checked, matching the
  // equivalent assertion in the other codecs.
  static_assert(kRankTransmittedTileSize % 16 == 0,
                "kRankTransmittedTileSize must be 16B aligned.");

  // kRankTileStride expressed in int32x4_t elements, for pointer arithmetic.
  static constexpr int kRankBufferTileStride =
      kRankTileStride / sizeof(int32x4_t);

  // Total tile size for the collective communication.
  static constexpr int kTransmittedTileSize =
      kRankTransmittedTileSize * kWorldSize;

  // Constants configuration
  // Each constant is a packed pair; the first literal is the fp16 encoding
  // (T == half) and the second the bf16 encoding of the same value.

  // {-1/128.0h, -1/128.0h}, f16x2_t
  static constexpr int kScaleFactor =
      std::is_same<T, half>::value ? 0xA000A000 : 0xBC00BC00;

  // {1e-7, 1e-7}, f16x2_t
  static constexpr int kScaleEpsilon =
      std::is_same<T, half>::value ? 0x00010001 : 0x33D733D7;

  // {-128, -128}, f16x2_t
  static constexpr int kRangeMin =
      std::is_same<T, half>::value ? 0xD800D800 : 0xC300C300;

  // {+127, +127}, f16x2_t
  static constexpr int kRangeMax =
      std::is_same<T, half>::value ? 0x57F057F0 : 0x42FE42FE;

  // {+128, +128}, int16x2_t
  static constexpr int kRangeBias = 0x00800080;

  __quickreduce_device_inline__ CodecQ8(int thread, int rank)
      : CodecBase(thread, rank) {}

  // Quantizes kRankAtoms atoms from `data` to 8 bits per value and writes
  // them, together with the per-group decoding scale, into `send_buffer`.
  __quickreduce_device_inline__ void send(int32x4_t* __restrict__ send_buffer,
                                          int32x4_t const* __restrict__ data) {
    for (int k = 0; k < kRankAtoms; k++) {
      int32x4_t const atom = data[k];
      // Compute the absolute maximum of the atom in the thread group
      // In 2 blocks of values, upper/lower halves of the f16x2_t
      int wblockmax = group_abs_max<T>(atom);

      // Derive scales
      // decoding_scale = absmax * kScaleFactor; encoding_scale is the
      // reciprocal of (decoding_scale + epsilon), the epsilon keeping the
      // divisor non-zero.
      int decoding_scale;
      int encoding_scale;
      decoding_scale = packed_mul<T>(wblockmax, kScaleFactor);
      encoding_scale = packed_add<T>(decoding_scale, kScaleEpsilon);
      encoding_scale = packed_rcp<T>(encoding_scale);

      // Apply scales to get quantized values
      // (scaled values are clamped to [kRangeMin, kRangeMax] = [-128, +127])
      int32x4_t w;
      for (int i = 0; i < 4; i++) {
        w[i] = packed_mul<T>(atom[i], encoding_scale);
        w[i] = packed_max<T>(w[i], kRangeMin);
        w[i] = packed_min<T>(w[i], kRangeMax);
      }

      // Convert from f16x2_t to uint16x2_t
      // Each of the 8 values is rounded with rintf to a 16-bit integer, then
      // kRangeBias (+128) is added to every 16-bit lane.
      int32x4_t q;
      {
        int16_t* qi = reinterpret_cast<int16_t*>(&q);
        T* wh = reinterpret_cast<T*>(&w);
        for (int i = 0; i < 8; i++) qi[i] = (int16_t)rintf(T2float_cast(wh[i]));

        for (int i = 0; i < 4; i++) {
          q[i] = packed_add<int16_t>(q[i], kRangeBias);
        }
      }

      // Pack 8 x q8 into int32x2_t
      // Two source words share one output word: the second one is shifted
      // into the upper byte of each 16-bit half.
      int32x2_t qw;
      qw[0] = q[0] | (q[1] << 8);
      qw[1] = q[2] | (q[3] << 8);

      // Write quantized atom to send_buffer
      // note: only the group leader stores the scale
      uint8_t* atom_ptr =
          reinterpret_cast<uint8_t*>(send_buffer + k * kRankBufferTileStride);
      int32x2_t* qw_ptr = reinterpret_cast<int32x2_t*>(atom_ptr) + thread;
      int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) +
                    (thread / 8);

      __builtin_nontemporal_store(qw, qw_ptr);
      if (threadIdx.x == group_leader) {
        __builtin_nontemporal_store(decoding_scale, qs_ptr);
      }
    }
  }

  // Reads kRankAtoms quantized atoms from `*recv_buffer`, dequantizes them
  // into `data`, and advances `*recv_buffer` by one quantized atom per
  // iteration.
  __quickreduce_device_inline__ void recv(int32x4_t** __restrict__ recv_buffer,
                                          int32x4_t* __restrict__ data) {
    for (int k = 0; k < kRankAtoms; k++) {
      // Directly read quantized atom from recv_buffer
      uint8_t* atom_ptr = reinterpret_cast<uint8_t*>(*recv_buffer);
      int32x2_t* qw_ptr = reinterpret_cast<int32x2_t*>(atom_ptr) + thread;
      int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) +
                    (thread / 8);

      int32x2_t qw = __builtin_nontemporal_load(qw_ptr);
      int qs = __builtin_nontemporal_load(qs_ptr);

      *recv_buffer += kRankBufferTileStride;

      // Unpack q8 into fp16x8_t
      int32x4_t w;
      {
        static uint constexpr kMask00FF = 0x00FF00FF;

        // {1024.0, 1024.0}, fp16x2_t
        static uint constexpr kHalf2_1024 = 0x64006400;

        // {-1152.0, -1152.0}, fp16x2_t
        static uint constexpr kHalf2_1152 = 0xE480E480;

#pragma unroll
        for (int i = 0; i < 4; i++) {
          if constexpr (std::is_same<T, half>::value) {
            // fp16 path: OR the byte into the bits of 1024.0 (giving
            // 1024 + q), then add -1152 to obtain q - 128.
            int32_t q8 =
                ((qw[i / 2] >> ((i % 2) * 8)) & kMask00FF) | kHalf2_1024;
            w[i] = packed_add<half>(q8, kHalf2_1152);
          } else {
            // bf16 path: convert both 16-bit integers through float to bf16,
            // then add kRangeMin (-128) to remove the bias applied in send().
            int32_t int16_2 = (qw[i / 2] >> ((i % 2) * 8)) & kMask00FF;
            int16_t low = static_cast<int16_t>(int16_2 & 0xFFFF);
            int16_t high = static_cast<int16_t>((int16_2 >> 16) & 0xFFFF);
            nv_bfloat16 bf_low = __float2bfloat16(static_cast<float>(low));
            nv_bfloat16 bf_high = __float2bfloat16(static_cast<float>(high));
            nv_bfloat162 bf2 = __halves2bfloat162(bf_low, bf_high);
            int32_t packed_bf16 = *reinterpret_cast<int32_t*>(&bf2);
            w[i] = packed_add<nv_bfloat16>(packed_bf16, kRangeMin);
          }
        }
      }

      // Apply decoding scales
      for (int i = 0; i < 4; i++) {
        w[i] = packed_mul<T>(w[i], qs);
      }

      data[k] = w;
    }
  }
};
// Twoshot All Reduce
//
// Processes one tile of the input per call to run():
//   Phase 1: every rank sends segment r of its tile to rank r, then sums the
//            segments it received from all ranks (its share of the result).
//   Phase 2: every rank sends its reduced segment to all ranks and gathers
//            the reduced segments of all ranks into the full tile.
// Data is exchanged through the per-rank communication buffers using the
// Codec's send()/recv(); arrival is signalled with flags set to `flag_color`.
template <typename T, class Codec, bool cast_bf2half>
struct AllReduceTwoshot {
// The atom handling below assumes 16-bit element types.
static_assert(sizeof(T) == 2);
static constexpr int kWorldSize = Codec::kWorldSize;
__device__ static void run(
T const* __restrict__ input, T* __restrict__ output,
uint32_t const N, // number of elements
int const block, // block index
int const rank, // rank index
uint8_t** __restrict__ buffer_list, // communication buffers
uint32_t const data_offset, // offset to start of the data buffer
uint32_t flag_color) {
// Topology
int thread = threadIdx.x + threadIdx.y * kWavefront;
uint8_t* rank_buffer = buffer_list[rank];
Codec codec(thread, rank);
// `block` selects the tile of input/output to process, while block_id (the
// hardware workgroup index) selects the slot in the communication buffers.
int block_id = blockIdx.x;
int grid_size = gridDim.x;
// --------------------------------------------------------
// Read input into registers
int32x4_t tA[kAtoms];
BufferResource src_buffer(const_cast<T*>(input), N * sizeof(T));
uint32_t src_offset = block * kTileSize + thread * sizeof(int32x4_t);
for (int i = 0; i < kAtoms; i++) {
tA[i] = buffer_load_dwordx4(src_buffer.descriptor, src_offset, 0, 0);
src_offset += kAtomStride * sizeof(int32x4_t);
// When cast_bf2half is set, the loaded bf16 pairs are converted to fp16
// pairs (through float) before any further processing.
if constexpr (cast_bf2half) {
const nv_bfloat162* bf_buf =
reinterpret_cast<const nv_bfloat162*>(&tA[i]);
half2 half_buf[4];
#pragma unroll
for (int j = 0; j < 4; ++j) {
float2 f = __bfloat1622float2(bf_buf[j]);
half_buf[j] = __float22half2_rn(f);
}
tA[i] = *reinterpret_cast<const int32x4_t*>(half_buf);
}
}
// --------------------------------------------------------
// Phase-1A: Write segment data into the communication buffer of the target
// rank responsible for this segment.
// Region 0 (phase 1) and region 1 (phase 2) each hold one slot per
// workgroup; region 1 starts grid_size slots after region 0. The flag
// arrays are laid out the same way, with kWorldSize flags per slot.
uint32_t comm_data0_offset =
data_offset + block_id * Codec::kTransmittedTileSize;
uint32_t comm_data1_offset =
grid_size * Codec::kTransmittedTileSize + comm_data0_offset;
uint32_t comm_flags0_offset = block_id * (kWorldSize * sizeof(uint32_t));
uint32_t comm_flags1_offset =
grid_size * (kWorldSize * sizeof(uint32_t)) + comm_flags0_offset;
for (int r = 0; r < kWorldSize; r++) {
int32x4_t* send_buffer =
reinterpret_cast<int32x4_t*>(buffer_list[r] + comm_data0_offset +
rank * Codec::kRankTransmittedTileSize);
codec.send(send_buffer, &tA[r * Codec::kRankAtoms]);
}
// All threads of the workgroup must have issued their stores before the
// flags are raised; thread r then signals rank r.
__syncthreads();
if (thread < kWorldSize) {
int r = thread;
uint32_t* flag_ptr = reinterpret_cast<uint32_t*>(
buffer_list[r] + comm_flags0_offset + rank * sizeof(uint32_t));
set_sync_flag(flag_ptr, flag_color);
}
// --------------------------------------------------------
// Phase-1B: Reduce the segment data from the communication buffers.
int32x4_t tR[Codec::kRankAtoms] = {};
{
// Read the data from the communication buffer.
int32x4_t* recv_buffer =
reinterpret_cast<int32x4_t*>(rank_buffer + comm_data0_offset);
uint32_t* flag_ptr =
reinterpret_cast<uint32_t*>(rank_buffer + comm_flags0_offset);
for (int r = 0; r < kWorldSize; r++) {
// Wait for the flags to be set.
// Only thread 0 spins; the barrier releases the rest of the workgroup.
if (thread == 0) {
wait_sync_flag(&flag_ptr[r], flag_color);
}
__syncthreads();
// note: we reuse tA as temp buffer here
codec.recv(&recv_buffer, tA);
for (int i = 0; i < Codec::kRankAtoms; i++) {
packed_assign_add<T>(&tR[i], &tA[i]);
}
}
}
// Phase-2: Write the reduced segment to every other rank
// (the loop covers all ranks, including this one).
for (int r = 0; r < kWorldSize; r++) {
int32x4_t* send_buffer =
reinterpret_cast<int32x4_t*>(buffer_list[r] + comm_data1_offset +
rank * Codec::kRankTransmittedTileSize);
codec.send(send_buffer, tR);
}
__syncthreads();
if (thread < kWorldSize) {
int r = thread;
uint32_t* flag_ptr = reinterpret_cast<uint32_t*>(
buffer_list[r] + comm_flags1_offset + rank * sizeof(uint32_t));
set_sync_flag(flag_ptr, flag_color);
}
// Phase-2: Read the gather segments from the rank's communication buffer.
{
// Read the data from the communication buffer.
int32x4_t* recv_buffer =
reinterpret_cast<int32x4_t*>(rank_buffer + comm_data1_offset);
uint32_t* flag_ptr =
reinterpret_cast<uint32_t*>(rank_buffer + comm_flags1_offset);
for (int r = 0; r < kWorldSize; r++) {
// Wait for the flags to be set.
if (thread == 0) {
wait_sync_flag(&flag_ptr[r], flag_color);
}
__syncthreads();
// Gather all reduced and final rank segments into tA.
codec.recv(&recv_buffer, &tA[r * Codec::kRankAtoms]);
}
}
// --------------------------------------------------------
// Write the result to output.
BufferResource dst_buffer(output, N * sizeof(T));
uint32_t dst_offset = block * kTileSize + thread * sizeof(int32x4_t);
for (int i = 0; i < kAtoms; i++) {
// When cast_bf2half is set, convert the fp16 results back to bf16 (through
// float) before storing, mirroring the conversion done on load.
if constexpr (cast_bf2half) {
const half2* half_buf = reinterpret_cast<const half2*>(&tA[i]);
nv_bfloat162 bf16_buf[4];
#pragma unroll
for (int j = 0; j < 4; ++j) {
float2 f = __half22float2(half_buf[j]);
bf16_buf[j] = __float22bfloat162_rn(f);
}
buffer_store_dwordx4(*reinterpret_cast<const int32x4_t*>(bf16_buf),
dst_buffer.descriptor, dst_offset, 0, 0);
} else {
buffer_store_dwordx4(tA[i], dst_buffer.descriptor, dst_offset, 0, 0);
}
dst_offset += kAtomStride * sizeof(int32x4_t);
}
}
};
} // namespace quickreduce

View File

@ -1598,7 +1598,6 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
const int warpid = threadIdx.x / WARP_SIZE; const int warpid = threadIdx.x / WARP_SIZE;
const int laneid = threadIdx.x % WARP_SIZE; const int laneid = threadIdx.x % WARP_SIZE;
const int lane2id = laneid % 2; const int lane2id = laneid % 2;
const int lane4id = laneid % 4;
const int lane16id = laneid % 16; const int lane16id = laneid % 16;
const int rowid = laneid / 16; const int rowid = laneid / 16;
@ -1745,7 +1744,6 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride; const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride;
const int klocal_token_idx = const int klocal_token_idx =
TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE; const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE;
const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX; const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX;
@ -2368,7 +2366,6 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
const int warpid = threadIdx.x / WARP_SIZE; const int warpid = threadIdx.x / WARP_SIZE;
const int laneid = threadIdx.x % WARP_SIZE; const int laneid = threadIdx.x % WARP_SIZE;
const int lane2id = laneid % 2; const int lane2id = laneid % 2;
const int lane4id = laneid % 4;
const int lane16id = laneid % 16; const int lane16id = laneid % 16;
const int rowid = laneid / 16; const int rowid = laneid / 16;
@ -2514,7 +2511,6 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride; const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride;
const int klocal_token_idx = const int klocal_token_idx =
TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE; const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE;
const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX; const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX;

View File

@ -725,6 +725,24 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
custom_ar.impl("open_mem_handle", torch::kCPU, &open_mem_handle); custom_ar.impl("open_mem_handle", torch::kCPU, &open_mem_handle);
custom_ar.def("free_shared_buffer", &free_shared_buffer); custom_ar.def("free_shared_buffer", &free_shared_buffer);
#ifdef USE_ROCM
// Quick Reduce all-reduce kernels
custom_ar.def(
"qr_all_reduce(int fa, Tensor inp, Tensor out, int quant_level, bool "
"cast_bf2half) -> ()");
custom_ar.impl("qr_all_reduce", torch::kCUDA, &qr_all_reduce);
custom_ar.def("init_custom_qr", &init_custom_qr);
custom_ar.def("qr_destroy", &qr_destroy);
custom_ar.def("qr_get_handle", &qr_get_handle);
custom_ar.def("qr_open_handles(int _fa, Tensor[](b!) handles) -> ()");
custom_ar.impl("qr_open_handles", torch::kCPU, &qr_open_handles);
// Max input size in bytes
custom_ar.def("qr_max_size", &qr_max_size);
#endif
} }
REGISTER_EXTENSION(TORCH_EXTENSION_NAME) REGISTER_EXTENSION(TORCH_EXTENSION_NAME)

View File

@ -6,30 +6,106 @@
# docs/assets/contributing/dockerfile-stages-dependency.png # docs/assets/contributing/dockerfile-stages-dependency.png
ARG CUDA_VERSION=12.8.1 ARG CUDA_VERSION=12.8.1
ARG PYTHON_VERSION=3.12
# By parameterizing the base images, we allow third-party to use their own
# base images. One use case is hermetic builds with base images stored in
# private registries that use different repository naming conventions.
#
# Example:
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
# By parameterizing the Deadsnakes repository URL, we allow third-party to use
# their own mirror. When doing so, we don't benefit from the transparent
# installation of the GPG key of the PPA, as done by add-apt-repository, so we
# also need a URL for the GPG key.
ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL
# The PyPA get-pip.py script is a self contained script+zip file, that provides
# both the installer script and the pip base85-encoded zip archive. This allows
# bootstrapping pip in environments where a distribution package does not exist.
#
# By parameterizing the URL for get-pip.py installation script, we allow
# third-party to use their own copy of the script stored in a private mirror.
# We set the default value to the PyPA owned get-pip.py script.
#
# Reference: https://pip.pypa.io/en/stable/installation/#get-pip-py
ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py"
# PIP supports fetching the packages from custom indexes, allowing third-party
# to host the packages in private mirrors. The PIP_INDEX_URL and
# PIP_EXTRA_INDEX_URL are standard PIP environment variables to override the
# default indexes. By leaving them empty by default, PIP will use its default
# indexes if the build process doesn't override the indexes.
#
# Uv uses different variables. We set them by default to the same values as
# PIP, but they can be overridden.
ARG PIP_INDEX_URL
ARG PIP_EXTRA_INDEX_URL
ARG UV_INDEX_URL=${PIP_INDEX_URL}
ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
# PyTorch provides its own indexes for standard and nightly builds
ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly
# PIP supports multiple authentication schemes, including keyring
# By parameterizing the PIP_KEYRING_PROVIDER variable and setting it to
# disabled by default, we allow third-party to use keyring authentication for
# their private Python indexes, while not changing the default behavior which
# is no authentication.
#
# Reference: https://pip.pypa.io/en/stable/topics/authentication/#keyring-support
ARG PIP_KEYRING_PROVIDER=disabled
ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}
#################### BASE BUILD IMAGE #################### #################### BASE BUILD IMAGE ####################
# prepare basic build environment # prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base FROM ${BUILD_BASE_IMAGE} AS base
ARG CUDA_VERSION=12.8.1 ARG CUDA_VERSION
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION
ARG TARGETPLATFORM ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL
ARG GET_PIP_URL
# Install Python and other dependencies # Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo \ && apt-get install -y ccache software-properties-common git curl sudo \
&& for i in 1 2 3; do \ && if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
add-apt-repository -y ppa:deadsnakes/ppa && break || \ if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ mkdir -p -m 0755 /etc/apt/keyrings ; \
done \ curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
fi ; \
else \
for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done ; \
fi \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version && python3 --version && python3 -m pip --version
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
# Install uv for faster pip installs # Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv python3 -m pip install uv
@ -63,21 +139,25 @@ WORKDIR /workspace
# after this step # after this step
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \ uv pip install --system \
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \ --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
"torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \
uv pip install --system \
--index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
--pre pytorch_triton==3.3.0+gitab727c40; \
fi fi
COPY requirements/common.txt requirements/common.txt COPY requirements/common.txt requirements/common.txt
COPY requirements/cuda.txt requirements/cuda.txt COPY requirements/cuda.txt requirements/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/cuda.txt \ uv pip install --system -r requirements/cuda.txt \
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# cuda arch list used by torch # cuda arch list used by torch
# can be useful for both `dev` and `test` # can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2 # explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243 # see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0+PTX' ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Override the arch list for flash-attn to reduce the binary size # Override the arch list for flash-attn to reduce the binary size
ARG vllm_fa_cmake_gpu_arches='80-real;90-real' ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
@ -88,6 +168,10 @@ ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
FROM base AS build FROM base AS build
ARG TARGETPLATFORM ARG TARGETPLATFORM
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
# install build dependencies # install build dependencies
COPY requirements/build.txt requirements/build.txt COPY requirements/build.txt requirements/build.txt
@ -98,7 +182,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt \ uv pip install --system -r requirements/build.txt \
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
COPY . . COPY . .
ARG GIT_REPO_CHECK=0 ARG GIT_REPO_CHECK=0
@ -113,6 +197,8 @@ ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads ENV NVCC_THREADS=$nvcc_threads
ARG USE_SCCACHE ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2 ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0 ARG SCCACHE_S3_NO_CREDENTIALS=0
@ -121,10 +207,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \ --mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \ if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \ echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \ && curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
&& tar -xzf sccache.tar.gz \ && tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \ && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \ && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \ && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \ && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \ && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
@ -162,6 +249,10 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
#################### DEV IMAGE #################### #################### DEV IMAGE ####################
FROM base as dev FROM base as dev
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
@ -176,21 +267,25 @@ COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt COPY requirements/dev.txt requirements/dev.txt
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/dev.txt \ uv pip install --system -r requirements/dev.txt \
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
#################### DEV IMAGE #################### #################### DEV IMAGE ####################
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
# image with vLLM installed # image with vLLM installed
# TODO: Restore to base image after FlashInfer AOT wheel fixed # TODO: Restore to base image after FlashInfer AOT wheel fixed
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base FROM ${FINAL_BASE_IMAGE} AS vllm-base
ARG CUDA_VERSION=12.8.1 ARG CUDA_VERSION
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION
WORKDIR /vllm-workspace WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM ARG TARGETPLATFORM
SHELL ["/bin/bash", "-c"] SHELL ["/bin/bash", "-c"]
ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL
ARG GET_PIP_URL
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
@ -200,17 +295,33 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \ && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& for i in 1 2 3; do \ && if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
add-apt-repository -y ppa:deadsnakes/ppa && break || \ if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ mkdir -p -m 0755 /etc/apt/keyrings ; \
done \ curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
fi ; \
else \
for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done ; \
fi \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version && python3 --version && python3 -m pip --version
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
# Install uv for faster pip installs # Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv python3 -m pip install uv
@ -232,19 +343,23 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# after this step # after this step
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \ uv pip install --system \
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \ --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
"torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319" ; \
uv pip install --system \
--index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
--pre pytorch_triton==3.3.0+gitab727c40 ; \
fi fi
# Install vllm wheel first, so that torch etc will be installed. # Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \ --mount=type=cache,target=/root/.cache/uv \
uv pip install --system dist/*.whl --verbose \ uv pip install --system dist/*.whl --verbose \
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# If we need to build FlashInfer wheel before its release: # If we need to build FlashInfer wheel before its release:
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+ # $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a' # $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0'
# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive # $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
# $ cd flashinfer # $ cd flashinfer
# $ git checkout v0.2.6.post1 # $ git checkout v0.2.6.post1
@ -254,15 +369,20 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
# -rw-rw-r-- 1 mgoin mgoin 205M Jun 9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl # -rw-rw-r-- 1 mgoin mgoin 205M Jun 9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl # $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
# Allow specifying a version, Git revision or local .whl file
ARG FLASHINFER_CUDA128_INDEX_URL="https://download.pytorch.org/whl/cu128/flashinfer"
ARG FLASHINFER_CUDA128_WHEEL="flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl"
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
ARG FLASHINFER_GIT_REF="v0.2.6.post1"
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \ . /etc/environment && \
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
# FlashInfer already has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use # FlashInfer already has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use
if [[ "$CUDA_VERSION" == 12.8* ]]; then \ if [[ "$CUDA_VERSION" == 12.8* ]]; then \
uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl; \ uv pip install --system ${FLASHINFER_CUDA128_INDEX_URL}/${FLASHINFER_CUDA128_WHEEL} ; \
else \ else \
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a' && \ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0' && \
git clone https://github.com/flashinfer-ai/flashinfer.git --single-branch --branch v0.2.6.post1 --recursive && \ git clone ${FLASHINFER_GIT_REPO} --single-branch --branch ${FLASHINFER_GIT_REF} --recursive && \
# Needed to build AOT kernels # Needed to build AOT kernels
(cd flashinfer && \ (cd flashinfer && \
python3 -m flashinfer.aot && \ python3 -m flashinfer.aot && \
@ -286,7 +406,7 @@ uv pip list
COPY requirements/build.txt requirements/build.txt COPY requirements/build.txt requirements/build.txt
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt \ uv pip install --system -r requirements/build.txt \
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
@ -297,6 +417,11 @@ FROM vllm-base AS test
ADD . /vllm-workspace/ ADD . /vllm-workspace/
ARG PYTHON_VERSION
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
@ -307,7 +432,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4" uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
# install development dependencies (for testing) # install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
if [ "$CUDA_MAJOR" -ge 12 ]; then \ if [ "$CUDA_MAJOR" -ge 12 ]; then \
uv pip install --system -r requirements/dev.txt; \ uv pip install --system -r requirements/dev.txt; \
@ -323,7 +448,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
ENV HF_HUB_ENABLE_HF_TRANSFER 1 ENV HF_HUB_ENABLE_HF_TRANSFER 1
# Copy in the v1 package for testing (it isn't distributed yet) # Copy in the v1 package for testing (it isn't distributed yet)
COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1 COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
# doc requires source code # doc requires source code
# we hide them inside `test_docs/` , so that this source code # we hide them inside `test_docs/` , so that this source code
@ -340,6 +465,9 @@ RUN mv mkdocs.yaml test_docs/
FROM vllm-base AS vllm-openai-base FROM vllm-base AS vllm-openai-base
ARG TARGETPLATFORM ARG TARGETPLATFORM
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500

View File

@ -66,7 +66,7 @@ ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \ --mount=type=bind,src=requirements/cpu-build.txt,target=requirements/build.txt \
uv pip install -r requirements/build.txt uv pip install -r requirements/build.txt
COPY . . COPY . .
@ -79,6 +79,22 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \ --mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
######################### TEST DEPS #########################
FROM base AS vllm-test-deps
WORKDIR /workspace/vllm
RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
cp requirements/test.in requirements/cpu-test.in && \
sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
sed -i 's/torch==.*/torch==2.6.0/g' requirements/cpu-test.in && \
sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/cpu-test.txt
######################### DEV IMAGE ######################### ######################### DEV IMAGE #########################
FROM vllm-build AS vllm-dev FROM vllm-build AS vllm-dev
@ -97,28 +113,19 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \ --mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py develop VLLM_TARGET_DEVICE=cpu python3 setup.py develop
COPY --from=vllm-test-deps /workspace/vllm/requirements/cpu-test.txt requirements/test.txt
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=requirements/test.in,target=requirements/test.in \
cp requirements/test.in requirements/test-cpu.in && \
sed -i '/mamba_ssm/d' requirements/test-cpu.in && \
uv pip compile requirements/test-cpu.in -o requirements/test.txt && \
uv pip install -r requirements/dev.txt && \ uv pip install -r requirements/dev.txt && \
pre-commit install --hook-type pre-commit --hook-type commit-msg pre-commit install --hook-type pre-commit --hook-type commit-msg
ENTRYPOINT ["bash"] ENTRYPOINT ["bash"]
######################### TEST IMAGE ######################### ######################### TEST IMAGE #########################
FROM base AS vllm-test FROM vllm-test-deps AS vllm-test
WORKDIR /workspace/ WORKDIR /workspace/
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=requirements/test.in,target=requirements/test.in \
cp requirements/test.in requirements/test-cpu.in && \
sed -i '/mamba_ssm/d' requirements/test-cpu.in && \
uv pip compile requirements/test-cpu.in -o requirements/cpu-test.txt && \
uv pip install -r requirements/cpu-test.txt
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \ --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
uv pip install dist/*.whl uv pip install dist/*.whl

View File

@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="1a7f4dfa" ARG FA_BRANCH="1a7f4dfa"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="c1debd8" ARG AITER_BRANCH="6487649"
ARG AITER_REPO="https://github.com/ROCm/aiter.git" ARG AITER_REPO="https://github.com/ROCm/aiter.git"
FROM ${BASE_IMAGE} AS base FROM ${BASE_IMAGE} AS base

View File

@ -35,6 +35,7 @@ RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
ENV VLLM_TARGET_DEVICE=xpu ENV VLLM_TARGET_DEVICE=xpu
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \ --mount=type=bind,source=.git,target=.git \

View File

@ -48,7 +48,12 @@ nav:
- General: - General:
- glob: contributing/* - glob: contributing/*
flatten_single_child_sections: true flatten_single_child_sections: true
- Model Implementation: contributing/model - Model Implementation:
- contributing/model/README.md
- contributing/model/basic.md
- contributing/model/registration.md
- contributing/model/tests.md
- contributing/model/multimodal.md
- Design Documents: - Design Documents:
- V0: design - V0: design
- V1: design/v1 - V1: design/v1

View File

@ -40,7 +40,7 @@ vLLM is flexible and easy to use with:
- OpenAI-compatible API server - OpenAI-compatible API server
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, IBM Power CPUs, TPU, and AWS Trainium and Inferentia Accelerators. - Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, IBM Power CPUs, TPU, and AWS Trainium and Inferentia Accelerators.
- Prefix caching support - Prefix caching support
- Multi-lora support - Multi-LoRA support
For more information, check out the following: For more information, check out the following:

View File

@ -91,7 +91,7 @@ source to unblock the update process.
### FlashInfer ### FlashInfer
Here is how to build and install it from source with torch2.7.0+cu128 in vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271): Here is how to build and install it from source with torch2.7.0+cu128 in vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271):
``` ```bash
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX' export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'
export FLASHINFER_ENABLE_SM90=1 export FLASHINFER_ENABLE_SM90=1
uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1" uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1"
@ -105,14 +105,14 @@ team if you want to get the package published there.
### xFormers ### xFormers
Similar to FlashInfer, here is how to build and install xFormers from source: Similar to FlashInfer, here is how to build and install xFormers from source:
``` ```bash
export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX' export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
MAX_JOBS=16 uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30" MAX_JOBS=16 uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30"
``` ```
### Mamba ### Mamba
``` ```bash
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4" uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
``` ```

View File

@ -16,35 +16,33 @@ vllm {chat,complete,serve,bench,collect-env,run-batch}
Start the vLLM OpenAI Compatible API server. Start the vLLM OpenAI Compatible API server.
Examples: ??? Examples
```bash ```bash
# Start with a model # Start with a model
vllm serve meta-llama/Llama-2-7b-hf vllm serve meta-llama/Llama-2-7b-hf
# Specify the port # Specify the port
vllm serve meta-llama/Llama-2-7b-hf --port 8100 vllm serve meta-llama/Llama-2-7b-hf --port 8100
# Check with --help for more options # Check with --help for more options
# To list all groups # To list all groups
vllm serve --help=listgroup vllm serve --help=listgroup
# To view an argument group # To view an argument group
vllm serve --help=ModelConfig vllm serve --help=ModelConfig
# To view a single argument # To view a single argument
vllm serve --help=max-num-seqs vllm serve --help=max-num-seqs
# To search by keyword # To search by keyword
vllm serve --help=max vllm serve --help=max
``` ```
## chat ## chat
Generate chat completions via the running API server. Generate chat completions via the running API server.
Examples:
```bash ```bash
# Directly connect to localhost API without arguments # Directly connect to localhost API without arguments
vllm chat vllm chat
@ -60,8 +58,6 @@ vllm chat --quick "hi"
Generate text completions based on the given prompt via the running API server. Generate text completions based on the given prompt via the running API server.
Examples:
```bash ```bash
# Directly connect to localhost API without arguments # Directly connect to localhost API without arguments
vllm complete vllm complete
@ -73,6 +69,8 @@ vllm complete --url http://{vllm-serve-host}:{vllm-serve-port}/v1
vllm complete --quick "The future of AI is" vllm complete --quick "The future of AI is"
``` ```
</details>
## bench ## bench
Run benchmark tests for latency, online serving throughput, and offline inference throughput. Run benchmark tests for latency, online serving throughput, and offline inference throughput.
@ -89,8 +87,6 @@ vllm bench {latency, serve, throughput}
Benchmark the latency of a single batch of requests. Benchmark the latency of a single batch of requests.
Example:
```bash ```bash
vllm bench latency \ vllm bench latency \
--model meta-llama/Llama-3.2-1B-Instruct \ --model meta-llama/Llama-3.2-1B-Instruct \
@ -104,8 +100,6 @@ vllm bench latency \
Benchmark the online serving throughput. Benchmark the online serving throughput.
Example:
```bash ```bash
vllm bench serve \ vllm bench serve \
--model meta-llama/Llama-3.2-1B-Instruct \ --model meta-llama/Llama-3.2-1B-Instruct \
@ -120,8 +114,6 @@ vllm bench serve \
Benchmark offline inference throughput. Benchmark offline inference throughput.
Example:
```bash ```bash
vllm bench throughput \ vllm bench throughput \
--model meta-llama/Llama-3.2-1B-Instruct \ --model meta-llama/Llama-3.2-1B-Instruct \
@ -143,7 +135,8 @@ vllm collect-env
Run batch prompts and write results to file. Run batch prompts and write results to file.
Examples: <details>
<summary>Examples</summary>
```bash ```bash
# Running with a local file # Running with a local file
@ -159,6 +152,8 @@ vllm run-batch \
--model meta-llama/Meta-Llama-3-8B-Instruct --model meta-llama/Meta-Llama-3-8B-Instruct
``` ```
</details>
## More Help ## More Help
For detailed options of any subcommand, use: For detailed options of any subcommand, use:

View File

@ -0,0 +1,6 @@
---
title: Contact Us
---
[](){ #contactus }
--8<-- "README.md:contact-us"

View File

@ -57,19 +57,21 @@ By default, we optimize model inference using CUDA graphs which take up extra me
You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage: You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage:
```python ??? Code
from vllm import LLM
from vllm.config import CompilationConfig, CompilationLevel
llm = LLM( ```python
model="meta-llama/Llama-3.1-8B-Instruct", from vllm import LLM
compilation_config=CompilationConfig( from vllm.config import CompilationConfig, CompilationLevel
level=CompilationLevel.PIECEWISE,
# By default, it goes up to max_num_seqs llm = LLM(
cudagraph_capture_sizes=[1, 2, 4, 8, 16], model="meta-llama/Llama-3.1-8B-Instruct",
), compilation_config=CompilationConfig(
) level=CompilationLevel.PIECEWISE,
``` # By default, it goes up to max_num_seqs
cudagraph_capture_sizes=[1, 2, 4, 8, 16],
),
)
```
You can disable graph capturing completely via the `enforce_eager` flag: You can disable graph capturing completely via the `enforce_eager` flag:
@ -127,18 +129,20 @@ reduce the size of the processed multi-modal inputs, which in turn saves memory.
Here are some examples: Here are some examples:
```python ??? Code
from vllm import LLM
# Available for Qwen2-VL series models ```python
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", from vllm import LLM
mm_processor_kwargs={
"max_pixels": 768 * 768, # Default is 1280 * 28 * 28
})
# Available for InternVL series models # Available for Qwen2-VL series models
llm = LLM(model="OpenGVLab/InternVL2-2B", llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
mm_processor_kwargs={ mm_processor_kwargs={
"max_dynamic_patch": 4, # Default is 12 "max_pixels": 768 * 768, # Default is 1280 * 28 * 28
}) })
```
# Available for InternVL series models
llm = LLM(model="OpenGVLab/InternVL2-2B",
mm_processor_kwargs={
"max_dynamic_patch": 4, # Default is 12
})
```

View File

@ -7,6 +7,8 @@ vLLM uses the following environment variables to configure the system:
All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables). All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables).
```python ??? Code
--8<-- "vllm/envs.py:env-vars-definition"
``` ```python
--8<-- "vllm/envs.py:env-vars-definition"
```

View File

@ -29,6 +29,8 @@ See <gh-file:LICENSE>.
Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation.
Check out the [building from source][build-from-source] documentation for details. Check out the [building from source][build-from-source] documentation for details.
For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
### Building the docs with MkDocs ### Building the docs with MkDocs
#### Introduction to MkDocs #### Introduction to MkDocs
@ -93,25 +95,27 @@ For additional features and advanced configurations, refer to the official [MkDo
## Testing ## Testing
```bash ??? note "Commands"
pip install -r requirements/dev.txt
# Linting, formatting and static type checking ```bash
pre-commit install --hook-type pre-commit --hook-type commit-msg pip install -r requirements/dev.txt
# You can manually run pre-commit with # Linting, formatting and static type checking
pre-commit run --all-files pre-commit install --hook-type pre-commit --hook-type commit-msg
# To manually run something from CI that does not run # You can manually run pre-commit with
# locally by default, you can run: pre-commit run --all-files
pre-commit run mypy-3.9 --hook-stage manual --all-files
# Unit tests # To manually run something from CI that does not run
pytest tests/ # locally by default, you can run:
pre-commit run mypy-3.9 --hook-stage manual --all-files
# Run tests for a single test file with detailed output # Unit tests
pytest -s -v tests/test_logger.py pytest tests/
```
# Run tests for a single test file with detailed output
pytest -s -v tests/test_logger.py
```
!!! tip !!! tip
Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12. Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.
@ -147,6 +151,14 @@ the terms of the DCO.
Using `-s` with `git commit` will automatically add this header. Using `-s` with `git commit` will automatically add this header.
!!! tip
You can enable automatic sign-off via your IDE:
- **PyCharm**: Click on the `Show Commit Options` icon to the right of the `Commit and Push...` button in the `Commit` window.
It will bring up a `git` window where you can modify the `Author` and enable `Sign-off commit`.
- **VSCode**: Open the [Settings editor](https://code.visualstudio.com/docs/configure/settings)
and enable the `Git: Always Sign Off` (`git.alwaysSignOff`) field.
### PR Title and Classification ### PR Title and Classification
Only specific types of PRs will be reviewed. The PR title is prefixed Only specific types of PRs will be reviewed. The PR title is prefixed
@ -186,6 +198,7 @@ The PR needs to meet the following code quality standards:
### Adding or Changing Kernels ### Adding or Changing Kernels
When actively developing or modifying kernels, using the [Incremental Compilation Workflow](./incremental_build.md) is highly recommended for faster build times.
Each custom kernel needs a schema and one or more implementations to be registered with PyTorch. Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.
- Make sure custom ops are registered following PyTorch guidelines: - Make sure custom ops are registered following PyTorch guidelines:

View File

@ -0,0 +1,138 @@
# Incremental Compilation Workflow
When working on vLLM's C++/CUDA kernels located in the `csrc/` directory, recompiling the entire project with `uv pip install -e .` for every change can be time-consuming. An incremental compilation workflow using CMake allows for faster iteration by only recompiling the necessary components after an initial setup. This guide details how to set up and use such a workflow, which complements your editable Python installation.
## Prerequisites
Before setting up the incremental build:
1. **vLLM Editable Install:** Ensure you have vLLM installed from source in editable mode. Using pre-compiled wheels for the initial editable setup can be faster, as the CMake workflow will handle subsequent kernel recompilations.
```console
uv venv --python 3.12 --seed
source .venv/bin/activate
VLLM_USE_PRECOMPILED=1 uv pip install -U -e . --torch-backend=auto
```
2. **CUDA Toolkit:** Verify that the NVIDIA CUDA Toolkit is correctly installed and `nvcc` is accessible in your `PATH`. CMake relies on `nvcc` to compile CUDA code. You can typically find `nvcc` in `$CUDA_HOME/bin/nvcc` or by running `which nvcc`. If you encounter issues, refer to the [official CUDA Toolkit installation guides](https://developer.nvidia.com/cuda-toolkit-archive) and vLLM's main [GPU installation documentation](../getting_started/installation/gpu/cuda.inc.md#troubleshooting) for troubleshooting. The `CMAKE_CUDA_COMPILER` variable in your `CMakeUserPresets.json` should also point to your `nvcc` binary.
3. **Build Tools:** It is highly recommended to install `ccache` for fast rebuilds by caching compilation results (e.g., `sudo apt install ccache` or `conda install ccache`). Also, ensure the core build dependencies like `cmake` and `ninja` are installed. These are installable through `requirements/build.txt` or your system's package manager.
```console
uv pip install -r requirements/build.txt --torch-backend=auto
```
## Setting up the CMake Build Environment
The incremental build process is managed through CMake. You can configure your build settings using a `CMakeUserPresets.json` file at the root of the vLLM repository.
### Generate `CMakeUserPresets.json` using the helper script
To simplify the setup, vLLM provides a helper script that attempts to auto-detect your system's configuration (like CUDA path, Python environment, and CPU cores) and generates the `CMakeUserPresets.json` file for you.
**Run the script:**
Navigate to the root of your vLLM clone and execute the following command:
```console
python tools/generate_cmake_presets.py
```
The script will prompt you if it cannot automatically determine certain paths (e.g., `nvcc` or a specific Python executable for your vLLM development environment). Follow the on-screen prompts. If an existing `CMakeUserPresets.json` is found, the script will ask for confirmation before overwriting it.
After running the script, a `CMakeUserPresets.json` file will be created in the root of your vLLM repository.
### Example `CMakeUserPresets.json`
Below is an example of what the generated `CMakeUserPresets.json` might look like. The script will tailor these values based on your system and any input you provide.
```json
{
"version": 6,
"cmakeMinimumRequired": {
"major": 3,
"minor": 26,
"patch": 1
},
"configurePresets": [
{
"name": "release",
"generator": "Ninja",
"binaryDir": "${sourceDir}/cmake-build-release",
"cacheVariables": {
"CMAKE_CUDA_COMPILER": "/usr/local/cuda/bin/nvcc",
"CMAKE_C_COMPILER_LAUNCHER": "ccache",
"CMAKE_CXX_COMPILER_LAUNCHER": "ccache",
"CMAKE_CUDA_COMPILER_LAUNCHER": "ccache",
"CMAKE_BUILD_TYPE": "Release",
"VLLM_PYTHON_EXECUTABLE": "/home/user/venvs/vllm/bin/python",
"CMAKE_INSTALL_PREFIX": "${sourceDir}",
"CMAKE_CUDA_FLAGS": "",
"NVCC_THREADS": "4",
"CMAKE_JOB_POOLS": "compile=32"
}
}
],
"buildPresets": [
{
"name": "release",
"configurePreset": "release",
"jobs": 32
}
]
}
```
**What do the various configurations mean?**
- `CMAKE_CUDA_COMPILER`: Path to your `nvcc` binary. The script attempts to find this automatically.
- `CMAKE_C_COMPILER_LAUNCHER`, `CMAKE_CXX_COMPILER_LAUNCHER`, `CMAKE_CUDA_COMPILER_LAUNCHER`: Setting these to `ccache` (or `sccache`) significantly speeds up rebuilds by caching compilation results. Ensure `ccache` is installed (e.g., `sudo apt install ccache` or `conda install ccache`). The script sets these by default.
- `VLLM_PYTHON_EXECUTABLE`: Path to the Python executable in your vLLM development environment. The script will prompt for this, defaulting to the current Python environment if suitable.
- `CMAKE_INSTALL_PREFIX: "${sourceDir}"`: Specifies that the compiled components should be installed back into your vLLM source directory. This is crucial for the editable install, as it makes the newly built kernels immediately available to your Python environment.
- `CMAKE_JOB_POOLS` and `jobs` in build presets: Control the parallelism of the build. The script sets these based on the number of CPU cores detected on your system.
- `binaryDir`: Specifies where the build artifacts will be stored (e.g., `cmake-build-release`).
## Building and Installing with CMake
Once your `CMakeUserPresets.json` is configured:
1. **Initialize the CMake build environment:**
This step configures the build system according to your chosen preset (e.g., `release`) and creates the build directory at `binaryDir`.
```console
cmake --preset release
```
2. **Build and install the vLLM components:**
This command compiles the code and installs the resulting binaries into your vLLM source directory, making them available to your editable Python installation.
```console
cmake --build --preset release --target install
```
3. **Make changes and repeat!**
Now you can start using your editable install of vLLM, testing and making changes as needed. If you need to rebuild after making further changes, simply run the CMake command again to build only the affected files.
```console
cmake --build --preset release --target install
```
## Verifying the Build
After a successful build, you will find a populated build directory (e.g., `cmake-build-release/` if you used the `release` preset and the example configuration).
```console
> ls cmake-build-release/
bin cmake_install.cmake _deps machete_generation.log
build.ninja CPackConfig.cmake detect_cuda_compute_capabilities.cu marlin_generation.log
_C.abi3.so CPackSourceConfig.cmake detect_cuda_version.cc _moe_C.abi3.so
CMakeCache.txt ctest _flashmla_C.abi3.so moe_marlin_generation.log
CMakeFiles cumem_allocator.abi3.so install_local_manifest.txt vllm-flash-attn
```
The `cmake --build ... --target install` command copies the compiled shared libraries (like `_C.abi3.so`, `_moe_C.abi3.so`, etc.) into the appropriate `vllm` package directory within your source tree. This updates your editable installation with the newly compiled kernels.
## Additional Tips
- **Adjust Parallelism:** Fine-tune the `CMAKE_JOB_POOLS` in `configurePresets` and `jobs` in `buildPresets` in your `CMakeUserPresets.json`. Too many jobs can overload systems with limited RAM or CPU cores, leading to slower builds or system instability. Too few won't fully utilize available resources.
- **Clean Builds When Necessary:** If you encounter persistent or strange build errors, especially after significant changes or switching branches, consider removing the CMake build directory (e.g., `rm -rf cmake-build-release`) and re-running the `cmake --preset` and `cmake --build` commands.
- **Specific Target Builds:** For even faster iterations when working on a specific module, you can sometimes build a specific target instead of the full `install` target, though `install` ensures all necessary components are updated in your Python environment. Refer to CMake documentation for more advanced target management.

View File

@ -1,21 +1,23 @@
--- ---
title: Adding a New Model title: Summary
--- ---
[](){ #new-model } [](){ #new-model }
This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM. !!! important
Many decoder language models can now be automatically loaded using the [Transformers backend][transformers-backend] without having to implement them in vLLM. See if `vllm serve <model>` works first!
Contents: vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features][compatibility-matrix] to optimize their performance.
- [Basic](basic.md) The complexity of integrating a model into vLLM depends heavily on the model's architecture.
- [Registration](registration.md) The process is fairly straightforward if the model shares a similar architecture with an existing model in vLLM.
- [Tests](tests.md) However, this can be more complex for models that include new operators (e.g., a new attention mechanism).
- [Multimodal](multimodal.md)
!!! note Read through these pages for a step-by-step guide:
The complexity of adding a new model depends heavily on the model's architecture.
The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. - [Basic Model](basic.md)
However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. - [Registering a Model](registration.md)
- [Unit Testing](tests.md)
- [Multi-Modal Support](multimodal.md)
!!! tip !!! tip
If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues) If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues)

View File

@ -1,5 +1,5 @@
--- ---
title: Implementing a Basic Model title: Basic Model
--- ---
[](){ #new-model-basic } [](){ #new-model-basic }
@ -27,33 +27,35 @@ All vLLM modules within the model must include a `prefix` argument in their cons
The initialization code should look like this: The initialization code should look like this:
```python ??? Code
from torch import nn
from vllm.config import VllmConfig
from vllm.attention import Attention
class MyAttention(nn.Module): ```python
def __init__(self, vllm_config: VllmConfig, prefix: str): from torch import nn
super().__init__() from vllm.config import VllmConfig
self.attn = Attention(prefix=f"{prefix}.attn") from vllm.attention import Attention
class MyDecoderLayer(nn.Module): class MyAttention(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str): def __init__(self, vllm_config: VllmConfig, prefix: str):
super().__init__() super().__init__()
self.self_attn = MyAttention(prefix=f"{prefix}.self_attn") self.attn = Attention(prefix=f"{prefix}.attn")
class MyModel(nn.Module): class MyDecoderLayer(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str): def __init__(self, vllm_config: VllmConfig, prefix: str):
super().__init__() super().__init__()
self.layers = nn.ModuleList( self.self_attn = MyAttention(prefix=f"{prefix}.self_attn")
[MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)]
)
class MyModelForCausalLM(nn.Module): class MyModel(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, vllm_config: VllmConfig, prefix: str):
super().__init__() super().__init__()
self.model = MyModel(vllm_config, prefix=f"{prefix}.model") self.layers = nn.ModuleList(
``` [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)]
)
class MyModelForCausalLM(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
self.model = MyModel(vllm_config, prefix=f"{prefix}.model")
```
### Computation Code ### Computation Code

View File

@ -25,59 +25,63 @@ Further update the model as follows:
- Implement [get_multimodal_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings] that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs. - Implement [get_multimodal_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings] that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs.
```python ??? Code
class YourModelForImage2Seq(nn.Module):
...
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor: ```python
class YourModelForImage2Seq(nn.Module):
...
assert self.vision_encoder is not None def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
image_features = self.vision_encoder(image_input)
return self.multi_modal_projector(image_features)
def get_multimodal_embeddings( assert self.vision_encoder is not None
self, **kwargs: object) -> Optional[MultiModalEmbeddings]: image_features = self.vision_encoder(image_input)
return self.multi_modal_projector(image_features)
# Validate the multimodal input keyword arguments def get_multimodal_embeddings(
image_input = self._parse_and_validate_image_input(**kwargs) self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
if image_input is None:
return None
# Run multimodal inputs through encoder and projector # Validate the multimodal input keyword arguments
vision_embeddings = self._process_image_input(image_input) image_input = self._parse_and_validate_image_input(**kwargs)
return vision_embeddings if image_input is None:
``` return None
# Run multimodal inputs through encoder and projector
vision_embeddings = self._process_image_input(image_input)
return vision_embeddings
```
!!! important !!! important
The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g., image) of the request. The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g., image) of the request.
- Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings. - Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.
```python ??? Code
from .utils import merge_multimodal_embeddings
class YourModelForImage2Seq(nn.Module): ```python
... from .utils import merge_multimodal_embeddings
def get_input_embeddings( class YourModelForImage2Seq(nn.Module):
self, ...
input_ids: torch.Tensor,
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
# `get_input_embeddings` should already be implemented for the language def get_input_embeddings(
# model as one of the requirements of basic vLLM model implementation. self,
inputs_embeds = self.language_model.get_input_embeddings(input_ids) input_ids: torch.Tensor,
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
if multimodal_embeddings is not None: # `get_input_embeddings` should already be implemented for the language
inputs_embeds = merge_multimodal_embeddings( # model as one of the requirements of basic vLLM model implementation.
input_ids=input_ids, inputs_embeds = self.language_model.get_input_embeddings(input_ids)
inputs_embeds=inputs_embeds,
multimodal_embeddings=multimodal_embeddings,
placeholder_token_id=self.config.image_token_index)
return inputs_embeds if multimodal_embeddings is not None:
``` inputs_embeds = merge_multimodal_embeddings(
input_ids=input_ids,
inputs_embeds=inputs_embeds,
multimodal_embeddings=multimodal_embeddings,
placeholder_token_id=self.config.image_token_index)
return inputs_embeds
```
- Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model. - Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model.
@ -135,42 +139,46 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
Looking at the code of HF's `LlavaForConditionalGeneration`: Looking at the code of HF's `LlavaForConditionalGeneration`:
```python ??? Code
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
n_image_features = image_features.shape[0] * image_features.shape[1]
if n_image_tokens != n_image_features: ```python
raise ValueError( # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
n_image_features = image_features.shape[0] * image_features.shape[1]
if n_image_tokens != n_image_features:
raise ValueError(
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
)
special_image_mask = (
(input_ids == self.config.image_token_index)
.unsqueeze(-1)
.expand_as(inputs_embeds)
.to(inputs_embeds.device)
) )
special_image_mask = ( image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
(input_ids == self.config.image_token_index) inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
.unsqueeze(-1) ```
.expand_as(inputs_embeds)
.to(inputs_embeds.device)
)
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
```
The number of placeholder feature tokens per image is `image_features.shape[1]`. The number of placeholder feature tokens per image is `image_features.shape[1]`.
`image_features` is calculated inside the `get_image_features` method: `image_features` is calculated inside the `get_image_features` method:
```python ??? Code
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300
image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
selected_image_feature = image_outputs.hidden_states[vision_feature_layer] ```python
if vision_feature_select_strategy == "default": # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300
selected_image_feature = selected_image_feature[:, 1:] image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
elif vision_feature_select_strategy == "full":
selected_image_feature = selected_image_feature selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
else: if vision_feature_select_strategy == "default":
raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") selected_image_feature = selected_image_feature[:, 1:]
image_features = self.multi_modal_projector(selected_image_feature) elif vision_feature_select_strategy == "full":
return image_features selected_image_feature = selected_image_feature
``` else:
raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
image_features = self.multi_modal_projector(selected_image_feature)
return image_features
```
We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower
(`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model). (`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model).
@ -193,20 +201,22 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`: To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`:
```python ??? Code
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
class_embeds = self.class_embedding.expand(batch_size, 1, -1) ```python
embeddings = torch.cat([class_embeds, patch_embeds], dim=1) # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257
if interpolate_pos_encoding: target_dtype = self.patch_embedding.weight.dtype
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
else: patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
embeddings = embeddings + self.position_embedding(self.position_ids)
return embeddings class_embeds = self.class_embedding.expand(batch_size, 1, -1)
``` embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
if interpolate_pos_encoding:
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
else:
embeddings = embeddings + self.position_embedding(self.position_ids)
return embeddings
```
We can infer that `embeddings.shape[1] == self.num_positions`, where We can infer that `embeddings.shape[1] == self.num_positions`, where
@ -218,55 +228,59 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
Overall, the number of placeholder feature tokens for an image can be calculated as: Overall, the number of placeholder feature tokens for an image can be calculated as:
```python ??? Code
def get_num_image_tokens(
self,
*,
image_width: int,
image_height: int,
) -> int:
hf_config = self.get_hf_config()
hf_processor = self.get_hf_processor()
image_size = hf_config.vision_config.image_size ```python
patch_size = hf_config.vision_config.patch_size def get_num_image_tokens(
self,
*,
image_width: int,
image_height: int,
) -> int:
hf_config = self.get_hf_config()
hf_processor = self.get_hf_processor()
num_image_tokens = (image_size // patch_size) ** 2 + 1 image_size = hf_config.vision_config.image_size
if hf_processor.vision_feature_select_strategy == "default": patch_size = hf_config.vision_config.patch_size
num_image_tokens -= 1
return num_image_tokens num_image_tokens = (image_size // patch_size) ** 2 + 1
``` if hf_processor.vision_feature_select_strategy == "default":
num_image_tokens -= 1
return num_image_tokens
```
Notice that the number of image tokens doesn't depend on the image width and height. Notice that the number of image tokens doesn't depend on the image width and height.
We can simply use a dummy `image_size` to calculate the multimodal profiling data: We can simply use a dummy `image_size` to calculate the multimodal profiling data:
```python ??? Code
# NOTE: In actuality, this is usually implemented as part of the
# model's subclass of `BaseProcessingInfo`, but we show it as is
# here for simplicity.
def get_image_size_with_most_features(self) -> ImageSize:
hf_config = self.get_hf_config()
width = height = hf_config.image_size
return ImageSize(width=width, height=height)
def get_dummy_mm_data( ```python
self, # NOTE: In actuality, this is usually implemented as part of the
seq_len: int, # model's subclass of `BaseProcessingInfo`, but we show it as is
mm_counts: Mapping[str, int], # here for simplicity.
) -> MultiModalDataDict: def get_image_size_with_most_features(self) -> ImageSize:
num_images = mm_counts.get("image", 0) hf_config = self.get_hf_config()
width = height = hf_config.image_size
return ImageSize(width=width, height=height)
target_width, target_height = \ def get_dummy_mm_data(
self.info.get_image_size_with_most_features() self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
return { target_width, target_height = \
"image": self.info.get_image_size_with_most_features()
self._get_dummy_images(width=target_width,
height=target_height, return {
num_images=num_images) "image":
} self._get_dummy_images(width=target_width,
``` height=target_height,
num_images=num_images)
}
```
For the text, we simply expand the multimodal image token from the model config to match the desired number of images. For the text, we simply expand the multimodal image token from the model config to match the desired number of images.
@ -284,21 +298,23 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
Looking at the code of HF's `FuyuForCausalLM`: Looking at the code of HF's `FuyuForCausalLM`:
```python ??? Code
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322
if image_patches is not None and past_key_values is None: ```python
patch_embeddings = [ # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322
self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype)) if image_patches is not None and past_key_values is None:
.squeeze(0) patch_embeddings = [
.to(inputs_embeds.device) self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype))
for patch in image_patches .squeeze(0)
] .to(inputs_embeds.device)
inputs_embeds = self.gather_continuous_embeddings( for patch in image_patches
word_embeddings=inputs_embeds, ]
continuous_embeddings=patch_embeddings, inputs_embeds = self.gather_continuous_embeddings(
image_patch_input_indices=image_patches_indices, word_embeddings=inputs_embeds,
) continuous_embeddings=patch_embeddings,
``` image_patch_input_indices=image_patches_indices,
)
```
The number of placeholder feature tokens for the `i`th item in the batch is `patch_embeddings[i].shape[0]`, The number of placeholder feature tokens for the `i`th item in the batch is `patch_embeddings[i].shape[0]`,
which is the same as `image_patches[i].shape[0]`, i.e. `num_total_patches`. which is the same as `image_patches[i].shape[0]`, i.e. `num_total_patches`.
@ -312,92 +328,98 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
In `FuyuImageProcessor.preprocess`, the images are resized and padded to the target `FuyuImageProcessor.size`, In `FuyuImageProcessor.preprocess`, the images are resized and padded to the target `FuyuImageProcessor.size`,
returning the dimensions after resizing (but before padding) as metadata. returning the dimensions after resizing (but before padding) as metadata.
```python ??? Code
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544
image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"])
batch_images = image_encoding["images"]
image_unpadded_heights = image_encoding["image_unpadded_heights"]
image_unpadded_widths = image_encoding["image_unpadded_widths"]
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L ```python
if do_resize: # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544
batch_images = [ image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"])
[self.resize(image, size=size, input_data_format=input_data_format) for image in images] batch_images = image_encoding["images"]
for images in batch_images image_unpadded_heights = image_encoding["image_unpadded_heights"]
] image_unpadded_widths = image_encoding["image_unpadded_widths"]
image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images] # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L
image_unpadded_heights = [[image_size[0]] for image_size in image_sizes] if do_resize:
image_unpadded_widths = [[image_size[1]] for image_size in image_sizes] batch_images = [
[self.resize(image, size=size, input_data_format=input_data_format) for image in images]
if do_pad: for images in batch_images
batch_images = [
[
self.pad_image(
image,
size=size,
mode=padding_mode,
constant_values=padding_value,
input_data_format=input_data_format,
)
for image in images
] ]
for images in batch_images
] image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images]
``` image_unpadded_heights = [[image_size[0]] for image_size in image_sizes]
image_unpadded_widths = [[image_size[1]] for image_size in image_sizes]
if do_pad:
batch_images = [
[
self.pad_image(
image,
size=size,
mode=padding_mode,
constant_values=padding_value,
input_data_format=input_data_format,
)
for image in images
]
for images in batch_images
]
```
In `FuyuImageProcessor.preprocess_with_tokenizer_info`, the images are split into patches based on this metadata: In `FuyuImageProcessor.preprocess_with_tokenizer_info`, the images are split into patches based on this metadata:
```python ??? Code
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425
model_image_input = self.image_processor.preprocess_with_tokenizer_info(
image_input=tensor_batch_images,
image_present=image_present,
image_unpadded_h=image_unpadded_heights,
image_unpadded_w=image_unpadded_widths,
image_placeholder_id=image_placeholder_id,
image_newline_id=image_newline_id,
variable_sized=True,
)
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658 ```python
image_height, image_width = image.shape[1], image.shape[2] # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425
if variable_sized: # variable_sized=True model_image_input = self.image_processor.preprocess_with_tokenizer_info(
new_h = min( image_input=tensor_batch_images,
image_height, image_present=image_present,
math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height, image_unpadded_h=image_unpadded_heights,
image_unpadded_w=image_unpadded_widths,
image_placeholder_id=image_placeholder_id,
image_newline_id=image_newline_id,
variable_sized=True,
) )
new_w = min(
image_width,
math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width,
)
image = image[:, :new_h, :new_w]
image_height, image_width = new_h, new_w
num_patches = self.get_num_patches(image_height=image_height, image_width=image_width) # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658
tensor_of_image_ids = torch.full( image_height, image_width = image.shape[1], image.shape[2]
[num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device if variable_sized: # variable_sized=True
) new_h = min(
patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0) image_height,
assert num_patches == patches.shape[0] math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height,
``` )
new_w = min(
image_width,
math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width,
)
image = image[:, :new_h, :new_w]
image_height, image_width = new_h, new_w
num_patches = self.get_num_patches(image_height=image_height, image_width=image_width)
tensor_of_image_ids = torch.full(
[num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
)
patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0)
assert num_patches == patches.shape[0]
```
The number of patches is in turn defined by `FuyuImageProcessor.get_num_patches`: The number of patches is in turn defined by `FuyuImageProcessor.get_num_patches`:
```python ??? Code
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562
patch_size = patch_size if patch_size is not None else self.patch_size
patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]
if image_height % patch_height != 0: ```python
raise ValueError(f"{image_height=} must be divisible by {patch_height}") # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562
if image_width % patch_width != 0: patch_size = patch_size if patch_size is not None else self.patch_size
raise ValueError(f"{image_width=} must be divisible by {patch_width}") patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]
num_patches_per_dim_h = image_height // patch_height if image_height % patch_height != 0:
num_patches_per_dim_w = image_width // patch_width raise ValueError(f"{image_height=} must be divisible by {patch_height}")
num_patches = num_patches_per_dim_h * num_patches_per_dim_w if image_width % patch_width != 0:
``` raise ValueError(f"{image_width=} must be divisible by {patch_width}")
num_patches_per_dim_h = image_height // patch_height
num_patches_per_dim_w = image_width // patch_width
num_patches = num_patches_per_dim_h * num_patches_per_dim_w
```
These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized
to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`. to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`.
@ -419,23 +441,25 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
For the multimodal image profiling data, the logic is very similar to LLaVA: For the multimodal image profiling data, the logic is very similar to LLaVA:
```python ??? Code
def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> MultiModalDataDict:
target_width, target_height = \
self.info.get_image_size_with_most_features()
num_images = mm_counts.get("image", 0)
return { ```python
"image": def get_dummy_mm_data(
self._get_dummy_images(width=target_width, self,
height=target_height, seq_len: int,
num_images=num_images) mm_counts: Mapping[str, int],
} ) -> MultiModalDataDict:
``` target_width, target_height = \
self.info.get_image_size_with_most_features()
num_images = mm_counts.get("image", 0)
return {
"image":
self._get_dummy_images(width=target_width,
height=target_height,
num_images=num_images)
}
```
## 4. Specify processing details ## 4. Specify processing details
@ -455,6 +479,7 @@ return a schema of the tensors outputted by the HF processor that are related to
The output of `CLIPImageProcessor` is a simple tensor with shape The output of `CLIPImageProcessor` is a simple tensor with shape
`(num_images, num_channels, image_height, image_width)`: `(num_images, num_channels, image_height, image_width)`:
```python ```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/image_processing_clip.py#L339-L345 # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/image_processing_clip.py#L339-L345
images = [ images = [
@ -505,40 +530,49 @@ return a schema of the tensors outputted by the HF processor that are related to
In order to support the use of [MultiModalFieldConfig.batched][] like in LLaVA, In order to support the use of [MultiModalFieldConfig.batched][] like in LLaVA,
we remove the extra batch dimension by overriding [BaseMultiModalProcessor._call_hf_processor][]: we remove the extra batch dimension by overriding [BaseMultiModalProcessor._call_hf_processor][]:
```python ??? Code
def _call_hf_processor(
self,
prompt: str,
mm_data: Mapping[str, object],
mm_kwargs: Mapping[str, object],
) -> BatchFeature:
processed_outputs = super()._call_hf_processor(
prompt=prompt,
mm_data=mm_data,
mm_kwargs=mm_kwargs,
)
image_patches = processed_outputs.get("image_patches") ```python
if image_patches is not None: def _call_hf_processor(
images = mm_data["images"] self,
assert isinstance(images, list) prompt: str,
mm_data: Mapping[str, object],
mm_kwargs: Mapping[str, object],
tok_kwargs: Mapping[str, object],
) -> BatchFeature:
processed_outputs = super()._call_hf_processor(
prompt=prompt,
mm_data=mm_data,
mm_kwargs=mm_kwargs,
tok_kwargs=tok_kwargs,
)
# Original output: (1, num_images, Pn, Px * Py * C) image_patches = processed_outputs.get("image_patches")
# New output: (num_images, Pn, Px * Py * C) if image_patches is not None:
assert (isinstance(image_patches, list) images = mm_data["images"]
and len(image_patches) == 1) assert isinstance(images, list)
assert (isinstance(image_patches[0], torch.Tensor)
and len(image_patches[0]) == len(images))
processed_outputs["image_patches"] = image_patches[0] # Original output: (1, num_images, Pn, Px * Py * C)
# New output: (num_images, Pn, Px * Py * C)
assert (isinstance(image_patches, list)
and len(image_patches) == 1)
assert (isinstance(image_patches[0], torch.Tensor)
and len(image_patches[0]) == len(images))
return processed_outputs processed_outputs["image_patches"] = image_patches[0]
```
return processed_outputs
```
!!! note !!! note
Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling
for text-only inputs to prevent unnecessary warnings from HF processor. for text-only inputs to prevent unnecessary warnings from HF processor.
!!! note
The `_call_hf_processor` method specifies both `mm_kwargs` and `tok_kwargs` for
processing. `mm_kwargs` is used both to initialize and to call the Hugging Face
processor, whereas `tok_kwargs` is only used to call the Hugging Face processor.
This lets us override [_get_mm_fields_config][vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config] as follows: This lets us override [_get_mm_fields_config][vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config] as follows:
```python ```python
@ -573,35 +607,37 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`). It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`).
Based on this, we override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] as follows: Based on this, we override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] as follows:
```python ??? Code
def _get_prompt_updates(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
) -> Sequence[PromptUpdate]:
hf_config = self.info.get_hf_config()
image_token_id = hf_config.image_token_index
def get_replacement(item_idx: int): ```python
images = mm_items.get_items("image", ImageProcessorItems) def _get_prompt_updates(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
) -> Sequence[PromptUpdate]:
hf_config = self.info.get_hf_config()
image_token_id = hf_config.image_token_index
image_size = images.get_image_size(item_idx) def get_replacement(item_idx: int):
num_image_tokens = self.info.get_num_image_tokens( images = mm_items.get_items("image", ImageProcessorItems)
image_width=image_size.width,
image_height=image_size.height,
)
return [image_token_id] * num_image_tokens image_size = images.get_image_size(item_idx)
num_image_tokens = self.info.get_num_image_tokens(
image_width=image_size.width,
image_height=image_size.height,
)
return [ return [image_token_id] * num_image_tokens
PromptReplacement(
modality="image", return [
target=[image_token_id], PromptReplacement(
replacement=get_replacement, modality="image",
), target=[image_token_id],
] replacement=get_replacement,
``` ),
]
```
=== "Handling additional tokens: Fuyu" === "Handling additional tokens: Fuyu"
@ -616,117 +652,90 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
We define a helper function to return `ncols` and `nrows` directly: We define a helper function to return `ncols` and `nrows` directly:
```python ??? Code
def get_image_feature_grid_size(
self,
*,
image_width: int,
image_height: int,
) -> tuple[int, int]:
image_processor = self.get_image_processor()
target_width = image_processor.size["width"]
target_height = image_processor.size["height"]
patch_width = image_processor.patch_size["width"]
patch_height = image_processor.patch_size["height"]
if not (image_width <= target_width and image_height <= target_height): ```python
height_scale_factor = target_height / image_height def get_image_feature_grid_size(
width_scale_factor = target_width / image_width self,
optimal_scale_factor = min(height_scale_factor, width_scale_factor) *,
image_width: int,
image_height: int,
) -> tuple[int, int]:
image_processor = self.get_image_processor()
target_width = image_processor.size["width"]
target_height = image_processor.size["height"]
patch_width = image_processor.patch_size["width"]
patch_height = image_processor.patch_size["height"]
image_height = int(image_height * optimal_scale_factor) if not (image_width <= target_width and image_height <= target_height):
image_width = int(image_width * optimal_scale_factor) height_scale_factor = target_height / image_height
width_scale_factor = target_width / image_width
optimal_scale_factor = min(height_scale_factor, width_scale_factor)
ncols = math.ceil(image_width / patch_width) image_height = int(image_height * optimal_scale_factor)
nrows = math.ceil(image_height / patch_height) image_width = int(image_width * optimal_scale_factor)
return ncols, nrows
``` ncols = math.ceil(image_width / patch_width)
nrows = math.ceil(image_height / patch_height)
return ncols, nrows
```
Based on this, we can initially define our replacement tokens as: Based on this, we can initially define our replacement tokens as:
```python ??? Code
def get_replacement(item_idx: int):
images = mm_items.get_items("image", ImageProcessorItems)
image_size = images.get_image_size(item_idx)
ncols, nrows = self.info.get_image_feature_grid_size( ```python
image_width=image_size.width, def get_replacement(item_idx: int):
image_height=image_size.height, images = mm_items.get_items("image", ImageProcessorItems)
) image_size = images.get_image_size(item_idx)
# `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|` ncols, nrows = self.info.get_image_feature_grid_size(
# `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|` image_width=image_size.width,
return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows image_height=image_size.height,
``` )
# `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|`
# `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|`
return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
```
However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called, However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called,
a BOS token (`<s>`) is also added to the prompt: a BOS token (`<s>`) is also added to the prompt:
```python ??? Code
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435
model_image_input = self.image_processor.preprocess_with_tokenizer_info( ```python
image_input=tensor_batch_images, # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435
image_present=image_present, model_image_input = self.image_processor.preprocess_with_tokenizer_info(
image_unpadded_h=image_unpadded_heights, image_input=tensor_batch_images,
image_unpadded_w=image_unpadded_widths, image_present=image_present,
image_placeholder_id=image_placeholder_id, image_unpadded_h=image_unpadded_heights,
image_newline_id=image_newline_id, image_unpadded_w=image_unpadded_widths,
variable_sized=True, image_placeholder_id=image_placeholder_id,
) image_newline_id=image_newline_id,
prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch( variable_sized=True,
tokenizer=self.tokenizer, )
prompts=prompts, prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
scale_factors=scale_factors, tokenizer=self.tokenizer,
max_tokens_to_generate=self.max_tokens_to_generate, prompts=prompts,
max_position_embeddings=self.max_position_embeddings, scale_factors=scale_factors,
add_BOS=True, max_tokens_to_generate=self.max_tokens_to_generate,
add_beginning_of_answer_token=True, max_position_embeddings=self.max_position_embeddings,
) add_BOS=True,
``` add_beginning_of_answer_token=True,
)
```
To assign the vision embeddings to only the image tokens, instead of a string To assign the vision embeddings to only the image tokens, instead of a string
you can return an instance of [PromptUpdateDetails][vllm.multimodal.processing.PromptUpdateDetails]: you can return an instance of [PromptUpdateDetails][vllm.multimodal.processing.PromptUpdateDetails]:
```python ??? Code
hf_config = self.info.get_hf_config()
bos_token_id = hf_config.bos_token_id # `<s>`
assert isinstance(bos_token_id, int)
def get_replacement_fuyu(item_idx: int): ```python
images = mm_items.get_items("image", ImageProcessorItems)
image_size = images.get_image_size(item_idx)
ncols, nrows = self.info.get_image_feature_grid_size(
image_width=image_size.width,
image_height=image_size.height,
)
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows
return PromptUpdateDetails.select_token_id(
image_tokens + [bos_token_id],
embed_token_id=_IMAGE_TOKEN_ID,
)
```
Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt,
we can search for it to conduct the replacement at the start of the string:
```python
def _get_prompt_updates(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
) -> Sequence[PromptUpdate]:
hf_config = self.info.get_hf_config() hf_config = self.info.get_hf_config()
bos_token_id = hf_config.bos_token_id bos_token_id = hf_config.bos_token_id # `<s>`
assert isinstance(bos_token_id, int) assert isinstance(bos_token_id, int)
tokenizer = self.info.get_tokenizer()
eot_token_id = tokenizer.bos_token_id
assert isinstance(eot_token_id, int)
def get_replacement_fuyu(item_idx: int): def get_replacement_fuyu(item_idx: int):
images = mm_items.get_items("image", ImageProcessorItems) images = mm_items.get_items("image", ImageProcessorItems)
image_size = images.get_image_size(item_idx) image_size = images.get_image_size(item_idx)
@ -742,15 +751,52 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
image_tokens + [bos_token_id], image_tokens + [bos_token_id],
embed_token_id=_IMAGE_TOKEN_ID, embed_token_id=_IMAGE_TOKEN_ID,
) )
```
return [ Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt,
PromptReplacement( we can search for it to conduct the replacement at the start of the string:
modality="image",
target=[eot_token_id], ??? Code
replacement=get_replacement_fuyu,
) ```python
] def _get_prompt_updates(
``` self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
) -> Sequence[PromptUpdate]:
hf_config = self.info.get_hf_config()
bos_token_id = hf_config.bos_token_id
assert isinstance(bos_token_id, int)
tokenizer = self.info.get_tokenizer()
eot_token_id = tokenizer.bos_token_id
assert isinstance(eot_token_id, int)
def get_replacement_fuyu(item_idx: int):
images = mm_items.get_items("image", ImageProcessorItems)
image_size = images.get_image_size(item_idx)
ncols, nrows = self.info.get_image_feature_grid_size(
image_width=image_size.width,
image_height=image_size.height,
)
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows
return PromptUpdateDetails.select_token_id(
image_tokens + [bos_token_id],
embed_token_id=_IMAGE_TOKEN_ID,
)
return [
PromptReplacement(
modality="image",
target=[eot_token_id],
replacement=get_replacement_fuyu,
)
]
```
## 5. Register processor-related classes ## 5. Register processor-related classes

View File

@ -1,5 +1,5 @@
--- ---
title: Registering a Model to vLLM title: Registering a Model
--- ---
[](){ #new-model-registration } [](){ #new-model-registration }

View File

@ -1,5 +1,5 @@
--- ---
title: Writing Unit Tests title: Unit Testing
--- ---
[](){ #new-model-tests } [](){ #new-model-tests }

View File

@ -30,13 +30,21 @@ Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example
#### OpenAI Server #### OpenAI Server
```bash ```bash
VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B VLLM_TORCH_PROFILER_DIR=./vllm_profile \
python -m vllm.entrypoints.openai.api_server \
--model meta-llama/Meta-Llama-3-70B
``` ```
benchmark_serving.py: benchmark_serving.py:
```bash ```bash
python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Meta-Llama-3-70B --dataset-name sharegpt --dataset-path sharegpt.json --profile --num-prompts 2 python benchmarks/benchmark_serving.py \
--backend vllm \
--model meta-llama/Meta-Llama-3-70B \
--dataset-name sharegpt \
--dataset-path sharegpt.json \
--profile \
--num-prompts 2
``` ```
## Profile with NVIDIA Nsight Systems ## Profile with NVIDIA Nsight Systems
@ -64,7 +72,16 @@ For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fo
The following is an example using the `benchmarks/benchmark_latency.py` script: The following is an example using the `benchmarks/benchmark_latency.py` script:
```bash ```bash
nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node python benchmarks/benchmark_latency.py --model meta-llama/Llama-3.1-8B-Instruct --num-iters-warmup 5 --num-iters 1 --batch-size 16 --input-len 512 --output-len 8 nsys profile -o report.nsys-rep \
--trace-fork-before-exec=true \
--cuda-graph-trace=node \
python benchmarks/benchmark_latency.py \
--model meta-llama/Llama-3.1-8B-Instruct \
--num-iters-warmup 5 \
--num-iters 1 \
--batch-size 16 \
--input-len 512 \
--output-len 8
``` ```
#### OpenAI Server #### OpenAI Server
@ -73,10 +90,21 @@ To profile the server, you will want to prepend your `vllm serve` command with `
```bash ```bash
# server # server
nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node --delay 30 --duration 60 vllm serve meta-llama/Llama-3.1-8B-Instruct nsys profile -o report.nsys-rep \
--trace-fork-before-exec=true \
--cuda-graph-trace=node \
--delay 30 \
--duration 60 \
vllm serve meta-llama/Llama-3.1-8B-Instruct
# client # client
python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 1 --dataset-name random --random-input 1024 --random-output 512 python benchmarks/benchmark_serving.py \
--backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \
--num-prompts 1 \
--dataset-name random \
--random-input 1024 \
--random-output 512
``` ```
In practice, you should set the `--duration` argument to a large value. Whenever you want the server to stop profiling, run: In practice, you should set the `--duration` argument to a large value. Whenever you want the server to stop profiling, run:
@ -97,26 +125,26 @@ to manually kill the profiler and generate your `nsys-rep` report.
You can view these profiles either as summaries in the CLI, using `nsys stats [profile-file]`, or in the GUI by installing Nsight [locally following the directions here](https://developer.nvidia.com/nsight-systems/get-started). You can view these profiles either as summaries in the CLI, using `nsys stats [profile-file]`, or in the GUI by installing Nsight [locally following the directions here](https://developer.nvidia.com/nsight-systems/get-started).
CLI example: ??? CLI example
```bash ```bash
nsys stats report1.nsys-rep nsys stats report1.nsys-rep
... ...
** CUDA GPU Kernel Summary (cuda_gpu_kern_sum): ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- ----------- ----------- -------- --------- ----------- ---------------------------------------------------------------------------------------------------- -------- --------------- --------- ----------- ----------- -------- --------- ----------- ----------------------------------------------------------------------------------------------------
46.3 10,327,352,338 17,505 589,965.9 144,383.0 27,040 3,126,460 944,263.8 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of… 46.3 10,327,352,338 17,505 589,965.9 144,383.0 27,040 3,126,460 944,263.8 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of…
14.8 3,305,114,764 5,152 641,520.7 293,408.0 287,296 2,822,716 867,124.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of… 14.8 3,305,114,764 5,152 641,520.7 293,408.0 287,296 2,822,716 867,124.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of…
12.1 2,692,284,876 14,280 188,535.4 83,904.0 19,328 2,862,237 497,999.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x128x64_warpgroupsize1x1x1_execute_segment_k_off… 12.1 2,692,284,876 14,280 188,535.4 83,904.0 19,328 2,862,237 497,999.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x128x64_warpgroupsize1x1x1_execute_segment_k_off…
9.5 2,116,600,578 33,920 62,399.8 21,504.0 15,326 2,532,285 290,954.1 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x64x64_warpgroupsize1x1x1_execute_segment_k_off_… 9.5 2,116,600,578 33,920 62,399.8 21,504.0 15,326 2,532,285 290,954.1 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x64x64_warpgroupsize1x1x1_execute_segment_k_off_…
5.0 1,119,749,165 18,912 59,208.4 9,056.0 6,784 2,578,366 271,581.7 void vllm::act_and_mul_kernel<c10::BFloat16, &vllm::silu_kernel<c10::BFloat16>, (bool)1>(T1 *, cons… 5.0 1,119,749,165 18,912 59,208.4 9,056.0 6,784 2,578,366 271,581.7 void vllm::act_and_mul_kernel<c10::BFloat16, &vllm::silu_kernel<c10::BFloat16>, (bool)1>(T1 *, cons…
4.1 916,662,515 21,312 43,011.6 19,776.0 8,928 2,586,205 199,790.1 void cutlass::device_kernel<flash::enable_sm90_or_later<flash::FlashAttnFwdSm90<flash::CollectiveMa 4.1 916,662,515 21,312 43,011.6 19,776.0 8,928 2,586,205 199,790.1 void cutlass::device_kernel<flash::enable_sm90_or_later<flash::FlashAttnFwdSm90<flash::CollectiveMa
2.6 587,283,113 37,824 15,526.7 3,008.0 2,719 2,517,756 139,091.1 std::enable_if<T2>(int)0&&vllm::_typeConvert<T1>::exists, void>::type vllm::fused_add_rms_norm_kern… 2.6 587,283,113 37,824 15,526.7 3,008.0 2,719 2,517,756 139,091.1 std::enable_if<T2>(int)0&&vllm::_typeConvert<T1>::exists, void>::type vllm::fused_add_rms_norm_kern…
1.9 418,362,605 18,912 22,121.5 3,871.0 3,328 2,523,870 175,248.2 void vllm::rotary_embedding_kernel<c10::BFloat16, (bool)1>(const long *, T1 *, T1 *, const T1 *, in… 1.9 418,362,605 18,912 22,121.5 3,871.0 3,328 2,523,870 175,248.2 void vllm::rotary_embedding_kernel<c10::BFloat16, (bool)1>(const long *, T1 *, T1 *, const T1 *, in…
0.7 167,083,069 18,880 8,849.7 2,240.0 1,471 2,499,996 101,436.1 void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0 0.7 167,083,069 18,880 8,849.7 2,240.0 1,471 2,499,996 101,436.1 void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0
... ...
``` ```
GUI example: GUI example:

View File

@ -34,6 +34,7 @@ you may contact the following individuals:
- Simon Mo - simon.mo@hey.com - Simon Mo - simon.mo@hey.com
- Russell Bryant - rbryant@redhat.com - Russell Bryant - rbryant@redhat.com
- Huzaifa Sidhpurwala - huzaifas@redhat.com
## Slack Discussion ## Slack Discussion

View File

@ -10,7 +10,7 @@ title: Using Docker
vLLM offers an official Docker image for deployment. vLLM offers an official Docker image for deployment.
The image can be used to run an OpenAI-compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags). The image can be used to run an OpenAI-compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags).
```console ```bash
docker run --runtime nvidia --gpus all \ docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \ -v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \ --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
@ -22,7 +22,7 @@ docker run --runtime nvidia --gpus all \
This image can also be used with other container engines such as [Podman](https://podman.io/). This image can also be used with other container engines such as [Podman](https://podman.io/).
```console ```bash
podman run --gpus all \ podman run --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \ -v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
@ -71,7 +71,7 @@ You can add any other [engine-args][engine-args] you need after the image tag (`
You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM: You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM:
```console ```bash
# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
DOCKER_BUILDKIT=1 docker build . \ DOCKER_BUILDKIT=1 docker build . \
--target vllm-openai \ --target vllm-openai \
@ -97,26 +97,28 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `--
flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits. flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits.
Keep an eye on memory usage with parallel jobs as it can be substantial (see example below). Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
```console ??? Command
# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
python3 use_existing_torch.py ```bash
DOCKER_BUILDKIT=1 docker build . \ # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
--file docker/Dockerfile \ python3 use_existing_torch.py
--target vllm-openai \ DOCKER_BUILDKIT=1 docker build . \
--platform "linux/arm64" \ --file docker/Dockerfile \
-t vllm/vllm-gh200-openai:latest \ --target vllm-openai \
--build-arg max_jobs=66 \ --platform "linux/arm64" \
--build-arg nvcc_threads=2 \ -t vllm/vllm-gh200-openai:latest \
--build-arg torch_cuda_arch_list="9.0 10.0+PTX" \ --build-arg max_jobs=66 \
--build-arg vllm_fa_cmake_gpu_arches="90-real" --build-arg nvcc_threads=2 \
``` --build-arg torch_cuda_arch_list="9.0 10.0+PTX" \
--build-arg vllm_fa_cmake_gpu_arches="90-real"
```
!!! note !!! note
If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution. If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution.
Run the following command on your host machine to register QEMU user static handlers: Run the following command on your host machine to register QEMU user static handlers:
```console ```bash
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
``` ```
@ -126,7 +128,7 @@ DOCKER_BUILDKIT=1 docker build . \
To run vLLM with the custom-built Docker image: To run vLLM with the custom-built Docker image:
```console ```bash
docker run --runtime nvidia --gpus all \ docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \ -v ~/.cache/huggingface:/root/.cache/huggingface \
-p 8000:8000 \ -p 8000:8000 \

View File

@ -15,7 +15,7 @@ It allows you to deploy a large language model (LLM) server with vLLM as the bac
- Start the vLLM server with the supported chat completion model, e.g. - Start the vLLM server with the supported chat completion model, e.g.
```console ```bash
vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096 vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096
``` ```

View File

@ -11,7 +11,7 @@ title: AutoGen
- Setup [AutoGen](https://microsoft.github.io/autogen/0.2/docs/installation/) environment - Setup [AutoGen](https://microsoft.github.io/autogen/0.2/docs/installation/) environment
```console ```bash
pip install vllm pip install vllm
# Install AgentChat and OpenAI client from Extensions # Install AgentChat and OpenAI client from Extensions
@ -23,58 +23,60 @@ pip install -U "autogen-agentchat" "autogen-ext[openai]"
- Start the vLLM server with the supported chat completion model, e.g. - Start the vLLM server with the supported chat completion model, e.g.
```console ```bash
python -m vllm.entrypoints.openai.api_server \ python -m vllm.entrypoints.openai.api_server \
--model mistralai/Mistral-7B-Instruct-v0.2 --model mistralai/Mistral-7B-Instruct-v0.2
``` ```
- Call it with AutoGen: - Call it with AutoGen:
```python ??? Code
import asyncio
from autogen_core.models import UserMessage ```python
from autogen_ext.models.openai import OpenAIChatCompletionClient import asyncio
from autogen_core.models import ModelFamily from autogen_core.models import UserMessage
from autogen_ext.models.openai import OpenAIChatCompletionClient
from autogen_core.models import ModelFamily
async def main() -> None: async def main() -> None:
# Create a model client # Create a model client
model_client = OpenAIChatCompletionClient( model_client = OpenAIChatCompletionClient(
model="mistralai/Mistral-7B-Instruct-v0.2", model="mistralai/Mistral-7B-Instruct-v0.2",
base_url="http://{your-vllm-host-ip}:{your-vllm-host-port}/v1", base_url="http://{your-vllm-host-ip}:{your-vllm-host-port}/v1",
api_key="EMPTY", api_key="EMPTY",
model_info={ model_info={
"vision": False, "vision": False,
"function_calling": False, "function_calling": False,
"json_output": False, "json_output": False,
"family": ModelFamily.MISTRAL, "family": ModelFamily.MISTRAL,
"structured_output": True, "structured_output": True,
}, },
) )
messages = [UserMessage(content="Write a very short story about a dragon.", source="user")] messages = [UserMessage(content="Write a very short story about a dragon.", source="user")]
# Create a stream. # Create a stream.
stream = model_client.create_stream(messages=messages) stream = model_client.create_stream(messages=messages)
# Iterate over the stream and print the responses. # Iterate over the stream and print the responses.
print("Streamed responses:") print("Streamed responses:")
async for response in stream: async for response in stream:
if isinstance(response, str): if isinstance(response, str):
# A partial response is a string. # A partial response is a string.
print(response, flush=True, end="") print(response, flush=True, end="")
else: else:
# The last response is a CreateResult object with the complete message. # The last response is a CreateResult object with the complete message.
print("\n\n------------\n") print("\n\n------------\n")
print("The complete response:", flush=True) print("The complete response:", flush=True)
print(response.content, flush=True) print(response.content, flush=True)
# Close the client when done. # Close the client when done.
await model_client.close() await model_client.close()
asyncio.run(main()) asyncio.run(main())
``` ```
For details, see the tutorial: For details, see the tutorial:

View File

@ -11,14 +11,14 @@ vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebr
To install the Cerebrium client, run: To install the Cerebrium client, run:
```console ```bash
pip install cerebrium pip install cerebrium
cerebrium login cerebrium login
``` ```
Next, create your Cerebrium project, run: Next, create your Cerebrium project, run:
```console ```bash
cerebrium init vllm-project cerebrium init vllm-project
``` ```
@ -34,75 +34,81 @@ vllm = "latest"
Next, to handle inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your `main.py`: Next, to handle inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your `main.py`:
```python ??? Code
from vllm import LLM, SamplingParams
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1") ```python
from vllm import LLM, SamplingParams
def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95): llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1")
sampling_params = SamplingParams(temperature=temperature, top_p=top_p) def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95):
outputs = llm.generate(prompts, sampling_params)
# Print the outputs. sampling_params = SamplingParams(temperature=temperature, top_p=top_p)
results = [] outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
results.append({"prompt": prompt, "generated_text": generated_text})
return {"results": results} # Print the outputs.
``` results = []
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
results.append({"prompt": prompt, "generated_text": generated_text})
return {"results": results}
```
Then, run the following code to deploy it to the cloud: Then, run the following code to deploy it to the cloud:
```console ```bash
cerebrium deploy cerebrium deploy
``` ```
If successful, you should receive a curl command that you can use to run inference. Just remember to end the URL with the name of the function you are calling (in our case, `/run`): If successful, you should receive a curl command that you can use to run inference. Just remember to end the URL with the name of the function you are calling (in our case, `/run`):
```python ??? Command
curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
-H 'Content-Type: application/json' \ ```python
-H 'Authorization: <JWT TOKEN>' \ curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
--data '{ -H 'Content-Type: application/json' \
"prompts": [ -H 'Authorization: <JWT TOKEN>' \
"Hello, my name is", --data '{
"The president of the United States is", "prompts": [
"The capital of France is", "Hello, my name is",
"The future of AI is" "The president of the United States is",
] "The capital of France is",
}' "The future of AI is"
``` ]
}'
```
You should get a response like: You should get a response like:
```python ??? Response
{
"run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262", ```python
"result": { {
"result": [ "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
{ "result": {
"prompt": "Hello, my name is", "result": [
"generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of" {
}, "prompt": "Hello, my name is",
{ "generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of"
"prompt": "The president of the United States is", },
"generated_text": " elected every four years. This is a democratic system.\n\n5. What" {
}, "prompt": "The president of the United States is",
{ "generated_text": " elected every four years. This is a democratic system.\n\n5. What"
"prompt": "The capital of France is", },
"generated_text": " Paris.\n" {
}, "prompt": "The capital of France is",
{ "generated_text": " Paris.\n"
"prompt": "The future of AI is", },
"generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective." {
} "prompt": "The future of AI is",
] "generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective."
}, }
"run_time_ms": 152.53663063049316 ]
} },
``` "run_time_ms": 152.53663063049316
}
```
You now have an autoscaling endpoint where you only pay for the compute you use! You now have an autoscaling endpoint where you only pay for the compute you use!

View File

@ -15,7 +15,7 @@ It allows you to deploy a large language model (LLM) server with vLLM as the bac
- Start the vLLM server with the supported chat completion model, e.g. - Start the vLLM server with the supported chat completion model, e.g.
```console ```bash
vllm serve qwen/Qwen1.5-0.5B-Chat vllm serve qwen/Qwen1.5-0.5B-Chat
``` ```

View File

@ -18,13 +18,13 @@ This guide walks you through deploying Dify using a vLLM backend.
- Start the vLLM server with the supported chat completion model, e.g. - Start the vLLM server with the supported chat completion model, e.g.
```console ```bash
vllm serve Qwen/Qwen1.5-7B-Chat vllm serve Qwen/Qwen1.5-7B-Chat
``` ```
- Start the Dify server with docker compose ([details](https://github.com/langgenius/dify?tab=readme-ov-file#quick-start)): - Start the Dify server with docker compose ([details](https://github.com/langgenius/dify?tab=readme-ov-file#quick-start)):
```console ```bash
git clone https://github.com/langgenius/dify.git git clone https://github.com/langgenius/dify.git
cd dify cd dify
cd docker cd docker

View File

@ -11,14 +11,14 @@ vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/),
To install dstack client, run: To install dstack client, run:
```console ```bash
pip install "dstack[all]" pip install "dstack[all]"
dstack server dstack server
``` ```
Next, to configure your dstack project, run: Next, to configure your dstack project, run:
```console ```bash
mkdir -p vllm-dstack mkdir -p vllm-dstack
cd vllm-dstack cd vllm-dstack
dstack init dstack init
@ -26,75 +26,81 @@ dstack init
Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`: Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:
```yaml ??? Config
type: service
python: "3.11" ```yaml
env: type: service
- MODEL=NousResearch/Llama-2-7b-chat-hf
port: 8000 python: "3.11"
resources: env:
gpu: 24GB - MODEL=NousResearch/Llama-2-7b-chat-hf
commands: port: 8000
- pip install vllm resources:
- vllm serve $MODEL --port 8000 gpu: 24GB
model: commands:
format: openai - pip install vllm
type: chat - vllm serve $MODEL --port 8000
name: NousResearch/Llama-2-7b-chat-hf model:
``` format: openai
type: chat
name: NousResearch/Llama-2-7b-chat-hf
```
Then, run the following CLI for provisioning: Then, run the following CLI for provisioning:
```console ??? Command
$ dstack run . -f serve.dstack.yml
⠸ Getting run plan... ```console
Configuration serve.dstack.yml $ dstack run . -f serve.dstack.yml
Project deep-diver-main
User deep-diver
Min resources 2..xCPU, 8GB.., 1xGPU (24GB)
Max price -
Max duration -
Spot policy auto
Retry policy no
# BACKEND REGION INSTANCE RESOURCES SPOT PRICE ⠸ Getting run plan...
1 gcp us-central1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 Configuration serve.dstack.yml
2 gcp us-east1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 Project deep-diver-main
3 gcp us-west1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 User deep-diver
... Min resources 2..xCPU, 8GB.., 1xGPU (24GB)
Shown 3 of 193 offers, $5.876 max Max price -
Max duration -
Spot policy auto
Retry policy no
Continue? [y/n]: y # BACKEND REGION INSTANCE RESOURCES SPOT PRICE
⠙ Submitting run... 1 gcp us-central1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804
⠏ Launching spicy-treefrog-1 (pulling) 2 gcp us-east1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804
spicy-treefrog-1 provisioning completed (running) 3 gcp us-west1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804
Service is published at ... ...
``` Shown 3 of 193 offers, $5.876 max
Continue? [y/n]: y
⠙ Submitting run...
⠏ Launching spicy-treefrog-1 (pulling)
spicy-treefrog-1 provisioning completed (running)
Service is published at ...
```
After the provisioning, you can interact with the model by using the OpenAI SDK: After the provisioning, you can interact with the model by using the OpenAI SDK:
```python ??? Code
from openai import OpenAI
client = OpenAI( ```python
base_url="https://gateway.<gateway domain>", from openai import OpenAI
api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
)
completion = client.chat.completions.create( client = OpenAI(
model="NousResearch/Llama-2-7b-chat-hf", base_url="https://gateway.<gateway domain>",
messages=[ api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
{ )
"role": "user",
"content": "Compose a poem that explains the concept of recursion in programming.",
}
]
)
print(completion.choices[0].message.content) completion = client.chat.completions.create(
``` model="NousResearch/Llama-2-7b-chat-hf",
messages=[
{
"role": "user",
"content": "Compose a poem that explains the concept of recursion in programming.",
}
]
)
print(completion.choices[0].message.content)
```
!!! note !!! note
dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision a dstack `Task` instead of a `Service`. The `Task` is for development purposes only. For more hands-on material on how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm). dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision a dstack `Task` instead of a `Service`. The `Task` is for development purposes only. For more hands-on material on how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm).

View File

@ -13,7 +13,7 @@ It allows you to deploy a large language model (LLM) server with vLLM as the bac
- Setup vLLM and Haystack environment - Setup vLLM and Haystack environment
```console ```bash
pip install vllm haystack-ai pip install vllm haystack-ai
``` ```
@ -21,35 +21,35 @@ pip install vllm haystack-ai
- Start the vLLM server with the supported chat completion model, e.g. - Start the vLLM server with the supported chat completion model, e.g.
```console ```bash
vllm serve mistralai/Mistral-7B-Instruct-v0.1 vllm serve mistralai/Mistral-7B-Instruct-v0.1
``` ```
- Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server. - Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server.
```python ??? Code
from haystack.components.generators.chat import OpenAIChatGenerator
from haystack.dataclasses import ChatMessage
from haystack.utils import Secret
generator = OpenAIChatGenerator( ```python
# for compatibility with the OpenAI API, a placeholder api_key is needed from haystack.components.generators.chat import OpenAIChatGenerator
api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"), from haystack.dataclasses import ChatMessage
model="mistralai/Mistral-7B-Instruct-v0.1", from haystack.utils import Secret
api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
generation_kwargs = {"max_tokens": 512}
)
response = generator.run( generator = OpenAIChatGenerator(
messages=[ChatMessage.from_user("Hi. Can you help me plan my next trip to Italy?")] # for compatibility with the OpenAI API, a placeholder api_key is needed
) api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
model="mistralai/Mistral-7B-Instruct-v0.1",
api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
generation_kwargs = {"max_tokens": 512}
)
print("-"*30) response = generator.run(
print(response) messages=[ChatMessage.from_user("Hi. Can you help me plan my next trip to Italy?")]
print("-"*30) )
```
Output e.g.: print("-"*30)
print(response)
print("-"*30)
```
```console ```console
------------------------------ ------------------------------

View File

@ -5,9 +5,9 @@ title: Helm
A Helm chart to deploy vLLM for Kubernetes A Helm chart to deploy vLLM for Kubernetes
Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLM Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values. Helm is a package manager for Kubernetes. It helps automate the deployment of vLLM applications on Kubernetes. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values.
This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm installation and documentation on architecture and values file. This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for Helm installation and documentation on architecture and values file.
## Prerequisites ## Prerequisites
@ -16,21 +16,27 @@ Before you begin, ensure that you have the following:
- A running Kubernetes cluster - A running Kubernetes cluster
- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin) - NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin)
- Available GPU resources in your cluster - Available GPU resources in your cluster
- S3 with the model which will be deployed - An S3 with the model which will be deployed
## Installing the chart ## Installing the chart
To install the chart with the release name `test-vllm`: To install the chart with the release name `test-vllm`:
```console ```bash
helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3bucketname=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY helm upgrade --install --create-namespace \
--namespace=ns-vllm test-vllm . \
-f values.yaml \
--set secrets.s3endpoint=$ACCESS_POINT \
--set secrets.s3bucketname=$BUCKET \
--set secrets.s3accesskeyid=$ACCESS_KEY \
--set secrets.s3accesskey=$SECRET_KEY
``` ```
## Uninstalling the Chart ## Uninstalling the chart
To uninstall the `test-vllm` deployment: To uninstall the `test-vllm` deployment:
```console ```bash
helm uninstall test-vllm --namespace=ns-vllm helm uninstall test-vllm --namespace=ns-vllm
``` ```
@ -39,57 +45,59 @@ chart **including persistent volumes** and deletes the release.
## Architecture ## Architecture
![](../../assets/deployment/architecture_helm_deployment.png) ![helm deployment architecture](../../assets/deployment/architecture_helm_deployment.png)
## Values ## Values
| Key | Type | Default | Description | The following table describes configurable parameters of the chart in `values.yaml`:
|--------------------------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------|
| autoscaling | object | {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} | Autoscaling configuration | | Key | Type | Default | Description |
| autoscaling.enabled | bool | false | Enable autoscaling | |-----|------|---------|-------------|
| autoscaling.maxReplicas | int | 100 | Maximum replicas | | autoscaling | object | {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} | Autoscaling configuration |
| autoscaling.minReplicas | int | 1 | Minimum replicas | | autoscaling.enabled | bool | false | Enable autoscaling |
| autoscaling.targetCPUUtilizationPercentage | int | 80 | Target CPU utilization for autoscaling | | autoscaling.maxReplicas | int | 100 | Maximum replicas |
| configs | object | {} | Configmap | | autoscaling.minReplicas | int | 1 | Minimum replicas |
| containerPort | int | 8000 | Container port | | autoscaling.targetCPUUtilizationPercentage | int | 80 | Target CPU utilization for autoscaling |
| customObjects | list | [] | Custom Objects configuration | | configs | object | {} | Configmap |
| deploymentStrategy | object | {} | Deployment strategy configuration | | containerPort | int | 8000 | Container port |
| externalConfigs | list | [] | External configuration | | customObjects | list | [] | Custom Objects configuration |
| extraContainers | list | [] | Additional containers configuration | | deploymentStrategy | object | {} | Deployment strategy configuration |
| extraInit | object | {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} | Additional configuration for the init container | | externalConfigs | list | [] | External configuration |
| extraInit.pvcStorage | string | "50Gi" | Storage size of the s3 | | extraContainers | list | [] | Additional containers configuration |
| extraInit.s3modelpath | string | "relative_s3_model_path/opt-125m" | Path of the model on the s3 which hosts model weights and config files | | extraInit | object | {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} | Additional configuration for the init container |
| extraInit.awsEc2MetadataDisabled | boolean | true | Disables the use of the Amazon EC2 instance metadata service | | extraInit.pvcStorage | string | "1Gi" | Storage size of the s3 |
| extraPorts | list | [] | Additional ports configuration | | extraInit.s3modelpath | string | "relative_s3_model_path/opt-125m" | Path of the model on the s3 which hosts model weights and config files |
| gpuModels | list | ["TYPE_GPU_USED"] | Type of gpu used | | extraInit.awsEc2MetadataDisabled | boolean | true | Disables the use of the Amazon EC2 instance metadata service |
| image | object | {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} | Image configuration | | extraPorts | list | [] | Additional ports configuration |
| image.command | list | ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] | Container launch command | | gpuModels | list | ["TYPE_GPU_USED"] | Type of gpu used |
| image.repository | string | "vllm/vllm-openai" | Image repository | | image | object | {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} | Image configuration |
| image.tag | string | "latest" | Image tag | | image.command | list | ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] | Container launch command |
| livenessProbe | object | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} | Liveness probe configuration | | image.repository | string | "vllm/vllm-openai" | Image repository |
| livenessProbe.failureThreshold | int | 3 | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive | | image.tag | string | "latest" | Image tag |
| livenessProbe.httpGet | object | {"path":"/health","port":8000} | Configuration of the Kubelet http request on the server | | livenessProbe | object | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} | Liveness probe configuration |
| livenessProbe.httpGet.path | string | "/health" | Path to access on the HTTP server | | livenessProbe.failureThreshold | int | 3 | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive |
| livenessProbe.httpGet.port | int | 8000 | Name or number of the port to access on the container, on which the server is listening | | livenessProbe.httpGet | object | {"path":"/health","port":8000} | Configuration of the kubelet http request on the server |
| livenessProbe.initialDelaySeconds | int | 15 | Number of seconds after the container has started before liveness probe is initiated | | livenessProbe.httpGet.path | string | "/health" | Path to access on the HTTP server |
| livenessProbe.periodSeconds | int | 10 | How often (in seconds) to perform the liveness probe | | livenessProbe.httpGet.port | int | 8000 | Name or number of the port to access on the container, on which the server is listening |
| maxUnavailablePodDisruptionBudget | string | "" | Disruption Budget Configuration | | livenessProbe.initialDelaySeconds | int | 15 | Number of seconds after the container has started before liveness probe is initiated |
| readinessProbe | object | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} | Readiness probe configuration | | livenessProbe.periodSeconds | int | 10 | How often (in seconds) to perform the liveness probe |
| readinessProbe.failureThreshold | int | 3 | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready | | maxUnavailablePodDisruptionBudget | string | "" | Disruption Budget Configuration |
| readinessProbe.httpGet | object | {"path":"/health","port":8000} | Configuration of the Kubelet http request on the server | | readinessProbe | object | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} | Readiness probe configuration |
| readinessProbe.httpGet.path | string | "/health" | Path to access on the HTTP server | | readinessProbe.failureThreshold | int | 3 | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready |
| readinessProbe.httpGet.port | int | 8000 | Name or number of the port to access on the container, on which the server is listening | | readinessProbe.httpGet | object | {"path":"/health","port":8000} | Configuration of the kubelet http request on the server |
| readinessProbe.initialDelaySeconds | int | 5 | Number of seconds after the container has started before readiness probe is initiated | | readinessProbe.httpGet.path | string | "/health" | Path to access on the HTTP server |
| readinessProbe.periodSeconds | int | 5 | How often (in seconds) to perform the readiness probe | | readinessProbe.httpGet.port | int | 8000 | Name or number of the port to access on the container, on which the server is listening |
| replicaCount | int | 1 | Number of replicas | | readinessProbe.initialDelaySeconds | int | 5 | Number of seconds after the container has started before readiness probe is initiated |
| resources | object | {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} | Resource configuration | | readinessProbe.periodSeconds | int | 5 | How often (in seconds) to perform the readiness probe |
| resources.limits."nvidia.com/gpu" | int | 1 | Number of gpus used | | replicaCount | int | 1 | Number of replicas |
| resources.limits.cpu | int | 4 | Number of CPUs | | resources | object | {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} | Resource configuration |
| resources.limits.memory | string | "16Gi" | CPU memory configuration | | resources.limits."nvidia.com/gpu" | int | 1 | Number of GPUs used |
| resources.requests."nvidia.com/gpu" | int | 1 | Number of gpus used | | resources.limits.cpu | int | 4 | Number of CPUs |
| resources.requests.cpu | int | 4 | Number of CPUs | | resources.limits.memory | string | "16Gi" | CPU memory configuration |
| resources.requests.memory | string | "16Gi" | CPU memory configuration | | resources.requests."nvidia.com/gpu" | int | 1 | Number of GPUs used |
| secrets | object | {} | Secrets configuration | | resources.requests.cpu | int | 4 | Number of CPUs |
| serviceName | string | Service name | | | resources.requests.memory | string | "16Gi" | CPU memory configuration |
| servicePort | int | 80 | Service port | | secrets | object | {} | Secrets configuration |
| labels.environment | string | test | Environment name | | serviceName | string | "" | Service name |
| servicePort | int | 80 | Service port |
| labels.environment | string | test | Environment name |

View File

@ -18,7 +18,7 @@ And LiteLLM supports all models on VLLM.
- Setup vLLM and litellm environment - Setup vLLM and litellm environment
```console ```bash
pip install vllm litellm pip install vllm litellm
``` ```
@ -28,33 +28,35 @@ pip install vllm litellm
- Start the vLLM server with the supported chat completion model, e.g. - Start the vLLM server with the supported chat completion model, e.g.
```console ```bash
vllm serve qwen/Qwen1.5-0.5B-Chat vllm serve qwen/Qwen1.5-0.5B-Chat
``` ```
- Call it with litellm: - Call it with litellm:
```python ??? Code
import litellm
messages = [{ "content": "Hello, how are you?","role": "user"}] ```python
import litellm
# hosted_vllm is prefix key word and necessary messages = [{ "content": "Hello, how are you?","role": "user"}]
response = litellm.completion(
model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
messages=messages,
api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
temperature=0.2,
max_tokens=80)
print(response) # hosted_vllm is prefix key word and necessary
``` response = litellm.completion(
model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
messages=messages,
api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
temperature=0.2,
max_tokens=80)
print(response)
```
### Embeddings ### Embeddings
- Start the vLLM server with the supported embedding model, e.g. - Start the vLLM server with the supported embedding model, e.g.
```console ```bash
vllm serve BAAI/bge-base-en-v1.5 vllm serve BAAI/bge-base-en-v1.5
``` ```

View File

@ -17,99 +17,101 @@ vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kuber
Deploy the following yaml file `lws.yaml` Deploy the following yaml file `lws.yaml`
```yaml ??? Yaml
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet ```yaml
metadata: apiVersion: leaderworkerset.x-k8s.io/v1
name: vllm kind: LeaderWorkerSet
spec: metadata:
replicas: 2 name: vllm
leaderWorkerTemplate: spec:
size: 2 replicas: 2
restartPolicy: RecreateGroupOnPodRestart leaderWorkerTemplate:
leaderTemplate: size: 2
metadata: restartPolicy: RecreateGroupOnPodRestart
labels: leaderTemplate:
role: leader metadata:
spec: labels:
containers: role: leader
- name: vllm-leader spec:
image: docker.io/vllm/vllm-openai:latest containers:
env: - name: vllm-leader
- name: HUGGING_FACE_HUB_TOKEN image: docker.io/vllm/vllm-openai:latest
value: <your-hf-token> env:
command: - name: HUGGING_FACE_HUB_TOKEN
- sh value: <your-hf-token>
- -c command:
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); - sh
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2" - -c
resources: - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
limits: python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
nvidia.com/gpu: "8" resources:
memory: 1124Gi limits:
ephemeral-storage: 800Gi nvidia.com/gpu: "8"
requests: memory: 1124Gi
ephemeral-storage: 800Gi ephemeral-storage: 800Gi
cpu: 125 requests:
ports: ephemeral-storage: 800Gi
- containerPort: 8080 cpu: 125
readinessProbe: ports:
tcpSocket: - containerPort: 8080
port: 8080 readinessProbe:
initialDelaySeconds: 15 tcpSocket:
periodSeconds: 10 port: 8080
volumeMounts: initialDelaySeconds: 15
- mountPath: /dev/shm periodSeconds: 10
name: dshm volumeMounts:
volumes: - mountPath: /dev/shm
- name: dshm name: dshm
emptyDir: volumes:
medium: Memory - name: dshm
sizeLimit: 15Gi emptyDir:
workerTemplate: medium: Memory
spec: sizeLimit: 15Gi
containers: workerTemplate:
- name: vllm-worker spec:
image: docker.io/vllm/vllm-openai:latest containers:
command: - name: vllm-worker
- sh image: docker.io/vllm/vllm-openai:latest
- -c command:
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)" - sh
resources: - -c
limits: - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
nvidia.com/gpu: "8" resources:
memory: 1124Gi limits:
ephemeral-storage: 800Gi nvidia.com/gpu: "8"
requests: memory: 1124Gi
ephemeral-storage: 800Gi ephemeral-storage: 800Gi
cpu: 125 requests:
env: ephemeral-storage: 800Gi
- name: HUGGING_FACE_HUB_TOKEN cpu: 125
value: <your-hf-token> env:
volumeMounts: - name: HUGGING_FACE_HUB_TOKEN
- mountPath: /dev/shm value: <your-hf-token>
name: dshm volumeMounts:
volumes: - mountPath: /dev/shm
- name: dshm name: dshm
emptyDir: volumes:
medium: Memory - name: dshm
sizeLimit: 15Gi emptyDir:
--- medium: Memory
apiVersion: v1 sizeLimit: 15Gi
kind: Service ---
metadata: apiVersion: v1
name: vllm-leader kind: Service
spec: metadata:
ports: name: vllm-leader
- name: http spec:
port: 8080 ports:
protocol: TCP - name: http
targetPort: 8080 port: 8080
selector: protocol: TCP
leaderworkerset.sigs.k8s.io/name: vllm targetPort: 8080
role: leader selector:
type: ClusterIP leaderworkerset.sigs.k8s.io/name: vllm
``` role: leader
type: ClusterIP
```
```bash ```bash
kubectl apply -f lws.yaml kubectl apply -f lws.yaml
@ -175,25 +177,27 @@ curl http://localhost:8080/v1/completions \
The output should be similar to the following The output should be similar to the following
```text ??? Output
{
"id": "cmpl-1bb34faba88b43f9862cfbfb2200949d", ```text
"object": "text_completion",
"created": 1715138766,
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
"choices": [
{ {
"index": 0, "id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
"text": " top destination for foodies, with", "object": "text_completion",
"logprobs": null, "created": 1715138766,
"finish_reason": "length", "model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
"stop_reason": null "choices": [
{
"index": 0,
"text": " top destination for foodies, with",
"logprobs": null,
"finish_reason": "length",
"stop_reason": null
}
],
"usage": {
"prompt_tokens": 5,
"total_tokens": 12,
"completion_tokens": 7
}
} }
], ```
"usage": {
"prompt_tokens": 5,
"total_tokens": 12,
"completion_tokens": 7
}
}
```

View File

@ -7,13 +7,13 @@ title: Open WebUI
2. Start the vLLM server with the supported chat completion model, e.g. 2. Start the vLLM server with the supported chat completion model, e.g.
```console ```bash
vllm serve qwen/Qwen1.5-0.5B-Chat vllm serve qwen/Qwen1.5-0.5B-Chat
``` ```
1. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vllm serve host and vllm serve port): 1. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vllm serve host and vllm serve port):
```console ```bash
docker run -d -p 3000:8080 \ docker run -d -p 3000:8080 \
--name open-webui \ --name open-webui \
-v open-webui:/app/backend/data \ -v open-webui:/app/backend/data \

View File

@ -15,7 +15,7 @@ Here are the integrations:
- Setup vLLM and langchain environment - Setup vLLM and langchain environment
```console ```bash
pip install -U vllm \ pip install -U vllm \
langchain_milvus langchain_openai \ langchain_milvus langchain_openai \
langchain_community beautifulsoup4 \ langchain_community beautifulsoup4 \
@ -26,14 +26,14 @@ pip install -U vllm \
- Start the vLLM server with the supported embedding model, e.g. - Start the vLLM server with the supported embedding model, e.g.
```console ```bash
# Start embedding service (port 8000) # Start embedding service (port 8000)
vllm serve ssmits/Qwen2-7B-Instruct-embed-base vllm serve ssmits/Qwen2-7B-Instruct-embed-base
``` ```
- Start the vLLM server with the supported chat completion model, e.g. - Start the vLLM server with the supported chat completion model, e.g.
```console ```bash
# Start chat service (port 8001) # Start chat service (port 8001)
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001 vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
``` ```
@ -52,7 +52,7 @@ python retrieval_augmented_generation_with_langchain.py
- Setup vLLM and llamaindex environment - Setup vLLM and llamaindex environment
```console ```bash
pip install vllm \ pip install vllm \
llama-index llama-index-readers-web \ llama-index llama-index-readers-web \
llama-index-llms-openai-like \ llama-index-llms-openai-like \
@ -64,14 +64,14 @@ pip install vllm \
- Start the vLLM server with the supported embedding model, e.g. - Start the vLLM server with the supported embedding model, e.g.
```console ```bash
# Start embedding service (port 8000) # Start embedding service (port 8000)
vllm serve ssmits/Qwen2-7B-Instruct-embed-base vllm serve ssmits/Qwen2-7B-Instruct-embed-base
``` ```
- Start the vLLM server with the supported chat completion model, e.g. - Start the vLLM server with the supported chat completion model, e.g.
```console ```bash
# Start chat service (port 8001) # Start chat service (port 8001)
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001 vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
``` ```

View File

@ -15,7 +15,7 @@ vLLM can be **run and scaled to multiple service replicas on clouds and Kubernet
- Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)). - Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)).
- Check that `sky check` shows clouds or Kubernetes are enabled. - Check that `sky check` shows clouds or Kubernetes are enabled.
```console ```bash
pip install skypilot-nightly pip install skypilot-nightly
sky check sky check
``` ```
@ -24,52 +24,54 @@ sky check
See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml). See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml).
```yaml ??? Yaml
resources:
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
use_spot: True
disk_size: 512 # Ensure model checkpoints can fit.
disk_tier: best
ports: 8081 # Expose to internet traffic.
envs: ```yaml
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct resources:
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass. accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
use_spot: True
disk_size: 512 # Ensure model checkpoints can fit.
disk_tier: best
ports: 8081 # Expose to internet traffic.
setup: | envs:
conda create -n vllm python=3.10 -y MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
conda activate vllm HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
pip install vllm==0.4.0.post1 setup: |
# Install Gradio for web UI. conda create -n vllm python=3.10 -y
pip install gradio openai conda activate vllm
pip install flash-attn==2.5.7
run: | pip install vllm==0.4.0.post1
conda activate vllm # Install Gradio for web UI.
echo 'Starting vllm api server...' pip install gradio openai
python -u -m vllm.entrypoints.openai.api_server \ pip install flash-attn==2.5.7
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log &
echo 'Waiting for vllm api server to start...' run: |
while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done conda activate vllm
echo 'Starting vllm api server...'
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log &
echo 'Starting gradio server...' echo 'Waiting for vllm api server to start...'
git clone https://github.com/vllm-project/vllm.git || true while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
-m $MODEL_NAME \ echo 'Starting gradio server...'
--port 8811 \ git clone https://github.com/vllm-project/vllm.git || true
--model-url http://localhost:8081/v1 \ python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
--stop-token-ids 128009,128001 -m $MODEL_NAME \
``` --port 8811 \
--model-url http://localhost:8081/v1 \
--stop-token-ids 128009,128001
```
Start serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...): Start serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...):
```console ```bash
HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN
``` ```
@ -81,7 +83,7 @@ Check the output of the command. There will be a shareable gradio link (like the
**Optional**: Serve the 70B model instead of the default 8B and use more GPU: **Optional**: Serve the 70B model instead of the default 8B and use more GPU:
```console ```bash
HF_TOKEN="your-huggingface-token" \ HF_TOKEN="your-huggingface-token" \
sky launch serving.yaml \ sky launch serving.yaml \
--gpus A100:8 \ --gpus A100:8 \
@ -93,72 +95,71 @@ HF_TOKEN="your-huggingface-token" \
SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a services section to the YAML file. SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a services section to the YAML file.
```yaml ??? Yaml
service:
replicas: 2
# An actual request for readiness probe.
readiness_probe:
path: /v1/chat/completions
post_data:
model: $MODEL_NAME
messages:
- role: user
content: Hello! What is your name?
max_completion_tokens: 1
```
<details> ```yaml
<summary>Click to see the full recipe YAML</summary> service:
replicas: 2
```yaml # An actual request for readiness probe.
service: readiness_probe:
replicas: 2 path: /v1/chat/completions
# An actual request for readiness probe. post_data:
readiness_probe: model: $MODEL_NAME
path: /v1/chat/completions messages:
post_data: - role: user
model: $MODEL_NAME content: Hello! What is your name?
messages:
- role: user
content: Hello! What is your name?
max_completion_tokens: 1 max_completion_tokens: 1
```
resources: ??? Yaml
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
use_spot: True
disk_size: 512 # Ensure model checkpoints can fit.
disk_tier: best
ports: 8081 # Expose to internet traffic.
envs: ```yaml
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct service:
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass. replicas: 2
# An actual request for readiness probe.
readiness_probe:
path: /v1/chat/completions
post_data:
model: $MODEL_NAME
messages:
- role: user
content: Hello! What is your name?
max_completion_tokens: 1
setup: | resources:
conda create -n vllm python=3.10 -y accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
conda activate vllm use_spot: True
disk_size: 512 # Ensure model checkpoints can fit.
disk_tier: best
ports: 8081 # Expose to internet traffic.
pip install vllm==0.4.0.post1 envs:
# Install Gradio for web UI. MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
pip install gradio openai HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
pip install flash-attn==2.5.7
run: | setup: |
conda activate vllm conda create -n vllm python=3.10 -y
echo 'Starting vllm api server...' conda activate vllm
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log
```
</details> pip install vllm==0.4.0.post1
# Install Gradio for web UI.
pip install gradio openai
pip install flash-attn==2.5.7
run: |
conda activate vllm
echo 'Starting vllm api server...'
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log
```
Start serving the Llama-3 8B model on multiple replicas: Start serving the Llama-3 8B model on multiple replicas:
```console ```bash
HF_TOKEN="your-huggingface-token" \ HF_TOKEN="your-huggingface-token" \
sky serve up -n vllm serving.yaml \ sky serve up -n vllm serving.yaml \
--env HF_TOKEN --env HF_TOKEN
@ -166,12 +167,11 @@ HF_TOKEN="your-huggingface-token" \
Wait until the service is ready: Wait until the service is ready:
```console ```bash
watch -n10 sky serve status vllm watch -n10 sky serve status vllm
``` ```
<details> Example outputs:
<summary>Example outputs:</summary>
```console ```console
Services Services
@ -184,29 +184,29 @@ vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) R
vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4
``` ```
</details>
After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: After the service is READY, you can find a single endpoint for the service and access the service with the endpoint:
```console ??? Commands
ENDPOINT=$(sky serve status --endpoint 8081 vllm)
curl -L http://$ENDPOINT/v1/chat/completions \ ```bash
-H "Content-Type: application/json" \ ENDPOINT=$(sky serve status --endpoint 8081 vllm)
-d '{ curl -L http://$ENDPOINT/v1/chat/completions \
"model": "meta-llama/Meta-Llama-3-8B-Instruct", -H "Content-Type: application/json" \
"messages": [ -d '{
{ "model": "meta-llama/Meta-Llama-3-8B-Instruct",
"role": "system", "messages": [
"content": "You are a helpful assistant." {
}, "role": "system",
{ "content": "You are a helpful assistant."
"role": "user", },
"content": "Who are you?" {
} "role": "user",
], "content": "Who are you?"
"stop_token_ids": [128009, 128001] }
}' ],
``` "stop_token_ids": [128009, 128001]
}'
```
To enable autoscaling, you could replace the `replicas` with the following configs in `service`: To enable autoscaling, you could replace the `replicas` with the following configs in `service`:
@ -220,67 +220,64 @@ service:
This will scale the service up when the QPS exceeds 2 for each replica. This will scale the service up when the QPS exceeds 2 for each replica.
<details> ??? Yaml
<summary>Click to see the full recipe YAML</summary>
```yaml ```yaml
service: service:
replica_policy: replica_policy:
min_replicas: 2 min_replicas: 2
max_replicas: 4 max_replicas: 4
target_qps_per_replica: 2 target_qps_per_replica: 2
# An actual request for readiness probe. # An actual request for readiness probe.
readiness_probe: readiness_probe:
path: /v1/chat/completions path: /v1/chat/completions
post_data: post_data:
model: $MODEL_NAME model: $MODEL_NAME
messages: messages:
- role: user - role: user
content: Hello! What is your name? content: Hello! What is your name?
max_completion_tokens: 1 max_completion_tokens: 1
resources: resources:
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
use_spot: True use_spot: True
disk_size: 512 # Ensure model checkpoints can fit. disk_size: 512 # Ensure model checkpoints can fit.
disk_tier: best disk_tier: best
ports: 8081 # Expose to internet traffic. ports: 8081 # Expose to internet traffic.
envs: envs:
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass. HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
setup: | setup: |
conda create -n vllm python=3.10 -y conda create -n vllm python=3.10 -y
conda activate vllm conda activate vllm
pip install vllm==0.4.0.post1 pip install vllm==0.4.0.post1
# Install Gradio for web UI. # Install Gradio for web UI.
pip install gradio openai pip install gradio openai
pip install flash-attn==2.5.7 pip install flash-attn==2.5.7
run: | run: |
conda activate vllm conda activate vllm
echo 'Starting vllm api server...' echo 'Starting vllm api server...'
python -u -m vllm.entrypoints.openai.api_server \ python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \ --port 8081 \
--model $MODEL_NAME \ --model $MODEL_NAME \
--trust-remote-code \ --trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log 2>&1 | tee api_server.log
``` ```
</details>
To update the service with the new config: To update the service with the new config:
```console ```bash
HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN
``` ```
To stop the service: To stop the service:
```console ```bash
sky serve down vllm sky serve down vllm
``` ```
@ -288,42 +285,39 @@ sky serve down vllm
It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests sent to the GUI will be load-balanced across replicas. It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests sent to the GUI will be load-balanced across replicas.
<details> ??? Yaml
<summary>Click to see the full GUI YAML</summary>
```yaml ```yaml
envs: envs:
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm.
resources: resources:
cpus: 2 cpus: 2
setup: | setup: |
conda create -n vllm python=3.10 -y conda create -n vllm python=3.10 -y
conda activate vllm conda activate vllm
# Install Gradio for web UI. # Install Gradio for web UI.
pip install gradio openai pip install gradio openai
run: | run: |
conda activate vllm conda activate vllm
export PATH=$PATH:/sbin export PATH=$PATH:/sbin
echo 'Starting gradio server...' echo 'Starting gradio server...'
git clone https://github.com/vllm-project/vllm.git || true git clone https://github.com/vllm-project/vllm.git || true
python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \ python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
-m $MODEL_NAME \ -m $MODEL_NAME \
--port 8811 \ --port 8811 \
--model-url http://$ENDPOINT/v1 \ --model-url http://$ENDPOINT/v1 \
--stop-token-ids 128009,128001 | tee ~/gradio.log --stop-token-ids 128009,128001 | tee ~/gradio.log
``` ```
</details>
1. Start the chat web UI: 1. Start the chat web UI:
```console ```bash
sky launch \ sky launch \
-c gui ./gui.yaml \ -c gui ./gui.yaml \
--env ENDPOINT=$(sky serve status --endpoint vllm) --env ENDPOINT=$(sky serve status --endpoint vllm)

View File

@ -15,13 +15,13 @@ It can be quickly integrated with vLLM as a backend API server, enabling powerfu
- Start the vLLM server with the supported chat completion model, e.g. - Start the vLLM server with the supported chat completion model, e.g.
```console ```bash
vllm serve qwen/Qwen1.5-0.5B-Chat vllm serve qwen/Qwen1.5-0.5B-Chat
``` ```
- Install streamlit and openai: - Install streamlit and openai:
```console ```bash
pip install streamlit openai pip install streamlit openai
``` ```
@ -29,7 +29,7 @@ pip install streamlit openai
- Start the streamlit web UI and start to chat: - Start the streamlit web UI and start to chat:
```console ```bash
streamlit run streamlit_openai_chatbot_webserver.py streamlit run streamlit_openai_chatbot_webserver.py
# or specify the VLLM_API_BASE or VLLM_API_KEY # or specify the VLLM_API_BASE or VLLM_API_KEY

View File

@ -7,7 +7,7 @@ vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-sta
To install Llama Stack, run To install Llama Stack, run
```console ```bash
pip install llama-stack -q pip install llama-stack -q
``` ```

View File

@ -60,22 +60,22 @@ And then you can send out a query to the OpenAI-compatible API to check the avai
curl -o- http://localhost:30080/models curl -o- http://localhost:30080/models
``` ```
Expected output: ??? Output
```json ```json
{
"object": "list",
"data": [
{ {
"id": "facebook/opt-125m", "object": "list",
"object": "model", "data": [
"created": 1737428424, {
"owned_by": "vllm", "id": "facebook/opt-125m",
"root": null "object": "model",
"created": 1737428424,
"owned_by": "vllm",
"root": null
}
]
} }
] ```
}
```
To send an actual chatting request, you can issue a curl request to the OpenAI `/completion` endpoint: To send an actual chatting request, you can issue a curl request to the OpenAI `/completion` endpoint:
@ -89,23 +89,23 @@ curl -X POST http://localhost:30080/completions \
}' }'
``` ```
Expected output: ??? Output
```json ```json
{
"id": "completion-id",
"object": "text_completion",
"created": 1737428424,
"model": "facebook/opt-125m",
"choices": [
{ {
"text": " there was a brave knight who...", "id": "completion-id",
"index": 0, "object": "text_completion",
"finish_reason": "length" "created": 1737428424,
"model": "facebook/opt-125m",
"choices": [
{
"text": " there was a brave knight who...",
"index": 0,
"finish_reason": "length"
}
]
} }
] ```
}
```
### Uninstall ### Uninstall
@ -121,23 +121,25 @@ sudo helm uninstall vllm
The core vLLM production stack configuration is managed with YAML. Here is the example configuration used in the installation above: The core vLLM production stack configuration is managed with YAML. Here is the example configuration used in the installation above:
```yaml ??? Yaml
servingEngineSpec:
runtimeClassName: ""
modelSpec:
- name: "opt125m"
repository: "vllm/vllm-openai"
tag: "latest"
modelURL: "facebook/opt-125m"
replicaCount: 1 ```yaml
servingEngineSpec:
runtimeClassName: ""
modelSpec:
- name: "opt125m"
repository: "vllm/vllm-openai"
tag: "latest"
modelURL: "facebook/opt-125m"
requestCPU: 6 replicaCount: 1
requestMemory: "16Gi"
requestGPU: 1
pvcStorage: "10Gi" requestCPU: 6
``` requestMemory: "16Gi"
requestGPU: 1
pvcStorage: "10Gi"
```
In this YAML configuration: In this YAML configuration:
* **`modelSpec`** includes: * **`modelSpec`** includes:

View File

@ -29,89 +29,93 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
First, create a Kubernetes PVC and Secret for downloading and storing Hugging Face model: First, create a Kubernetes PVC and Secret for downloading and storing Hugging Face model:
```bash ??? Config
cat <<EOF |kubectl apply -f -
apiVersion: v1 ```bash
kind: PersistentVolumeClaim cat <<EOF |kubectl apply -f -
metadata: apiVersion: v1
name: vllm-models kind: PersistentVolumeClaim
spec: metadata:
accessModes: name: vllm-models
- ReadWriteOnce spec:
volumeMode: Filesystem accessModes:
resources: - ReadWriteOnce
requests: volumeMode: Filesystem
storage: 50Gi resources:
--- requests:
apiVersion: v1 storage: 50Gi
kind: Secret ---
metadata: apiVersion: v1
name: hf-token-secret kind: Secret
type: Opaque metadata:
data: name: hf-token-secret
token: $(HF_TOKEN) type: Opaque
EOF data:
``` token: $(HF_TOKEN)
EOF
```
Next, start the vLLM server as a Kubernetes Deployment and Service: Next, start the vLLM server as a Kubernetes Deployment and Service:
```bash ??? Config
cat <<EOF |kubectl apply -f -
apiVersion: apps/v1 ```bash
kind: Deployment cat <<EOF |kubectl apply -f -
metadata: apiVersion: apps/v1
name: vllm-server kind: Deployment
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: vllm
template:
metadata: metadata:
labels: name: vllm-server
app.kubernetes.io/name: vllm
spec: spec:
containers: replicas: 1
- name: vllm selector:
image: vllm/vllm-openai:latest matchLabels:
command: ["/bin/sh", "-c"] app.kubernetes.io/name: vllm
args: [ template:
"vllm serve meta-llama/Llama-3.2-1B-Instruct" metadata:
] labels:
env: app.kubernetes.io/name: vllm
- name: HUGGING_FACE_HUB_TOKEN spec:
valueFrom: containers:
secretKeyRef: - name: vllm
name: hf-token-secret image: vllm/vllm-openai:latest
key: token command: ["/bin/sh", "-c"]
ports: args: [
- containerPort: 8000 "vllm serve meta-llama/Llama-3.2-1B-Instruct"
volumeMounts: ]
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
ports:
- containerPort: 8000
volumeMounts:
- name: llama-storage
mountPath: /root/.cache/huggingface
volumes:
- name: llama-storage - name: llama-storage
mountPath: /root/.cache/huggingface persistentVolumeClaim:
volumes: claimName: vllm-models
- name: llama-storage ---
persistentVolumeClaim: apiVersion: v1
claimName: vllm-models kind: Service
--- metadata:
apiVersion: v1 name: vllm-server
kind: Service spec:
metadata: selector:
name: vllm-server app.kubernetes.io/name: vllm
spec: ports:
selector: - protocol: TCP
app.kubernetes.io/name: vllm port: 8000
ports: targetPort: 8000
- protocol: TCP type: ClusterIP
port: 8000 EOF
targetPort: 8000 ```
type: ClusterIP
EOF
```
We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model): We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model):
```console ```bash
kubectl logs -l app.kubernetes.io/name=vllm kubectl logs -l app.kubernetes.io/name=vllm
... ...
INFO: Started server process [1] INFO: Started server process [1]
@ -128,6 +132,9 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
PVC is used to store the model cache and it is optional, you can use hostPath or other storage options PVC is used to store the model cache and it is optional, you can use hostPath or other storage options
<details>
<summary>Yaml</summary>
```yaml ```yaml
apiVersion: v1 apiVersion: v1
kind: PersistentVolumeClaim kind: PersistentVolumeClaim
@ -144,6 +151,8 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
volumeMode: Filesystem volumeMode: Filesystem
``` ```
</details>
Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models
```yaml ```yaml
@ -156,13 +165,16 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
stringData: stringData:
token: "REPLACE_WITH_TOKEN" token: "REPLACE_WITH_TOKEN"
``` ```
Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model. Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model.
Here are two examples for using NVIDIA GPU and AMD GPU. Here are two examples for using NVIDIA GPU and AMD GPU.
NVIDIA GPU: NVIDIA GPU:
<details>
<summary>Yaml</summary>
```yaml ```yaml
apiVersion: apps/v1 apiVersion: apps/v1
kind: Deployment kind: Deployment
@ -233,10 +245,15 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
periodSeconds: 5 periodSeconds: 5
``` ```
</details>
AMD GPU: AMD GPU:
You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X. You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X.
<details>
<summary>Yaml</summary>
```yaml ```yaml
apiVersion: apps/v1 apiVersion: apps/v1
kind: Deployment kind: Deployment
@ -305,12 +322,17 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
mountPath: /dev/shm mountPath: /dev/shm
``` ```
</details>
You can get the full example with steps and sample yaml files from <https://github.com/ROCm/k8s-device-plugin/tree/master/example/vllm-serve>. You can get the full example with steps and sample yaml files from <https://github.com/ROCm/k8s-device-plugin/tree/master/example/vllm-serve>.
2. Create a Kubernetes Service for vLLM 2. Create a Kubernetes Service for vLLM
Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: Next, create a Kubernetes Service file to expose the `mistral-7b` deployment:
<details>
<summary>Yaml</summary>
```yaml ```yaml
apiVersion: v1 apiVersion: v1
kind: Service kind: Service
@ -330,18 +352,20 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
type: ClusterIP type: ClusterIP
``` ```
</details>
3. Deploy and Test 3. Deploy and Test
Apply the deployment and service configurations using `kubectl apply -f <filename>`: Apply the deployment and service configurations using `kubectl apply -f <filename>`:
```console ```bash
kubectl apply -f deployment.yaml kubectl apply -f deployment.yaml
kubectl apply -f service.yaml kubectl apply -f service.yaml
``` ```
To test the deployment, run the following `curl` command: To test the deployment, run the following `curl` command:
```console ```bash
curl http://mistral-7b.default.svc.cluster.local/v1/completions \ curl http://mistral-7b.default.svc.cluster.local/v1/completions \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-d '{ -d '{

View File

@ -11,13 +11,13 @@ This document shows how to launch multiple vLLM serving containers and use Nginx
This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory. This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory.
```console ```bash
export vllm_root=`pwd` export vllm_root=`pwd`
``` ```
Create a file named `Dockerfile.nginx`: Create a file named `Dockerfile.nginx`:
```console ```dockerfile
FROM nginx:latest FROM nginx:latest
RUN rm /etc/nginx/conf.d/default.conf RUN rm /etc/nginx/conf.d/default.conf
EXPOSE 80 EXPOSE 80
@ -26,7 +26,7 @@ CMD ["nginx", "-g", "daemon off;"]
Build the container: Build the container:
```console ```bash
docker build . -f Dockerfile.nginx --tag nginx-lb docker build . -f Dockerfile.nginx --tag nginx-lb
``` ```
@ -36,36 +36,38 @@ docker build . -f Dockerfile.nginx --tag nginx-lb
Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`. Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`.
```console ??? Config
upstream backend {
least_conn; ```console
server vllm0:8000 max_fails=3 fail_timeout=10000s; upstream backend {
server vllm1:8000 max_fails=3 fail_timeout=10000s; least_conn;
} server vllm0:8000 max_fails=3 fail_timeout=10000s;
server { server vllm1:8000 max_fails=3 fail_timeout=10000s;
listen 80;
location / {
proxy_pass http://backend;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
} }
} server {
``` listen 80;
location / {
proxy_pass http://backend;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}
}
```
[](){ #nginxloadbalancer-nginx-vllm-container } [](){ #nginxloadbalancer-nginx-vllm-container }
## Build vLLM Container ## Build vLLM Container
```console ```bash
cd $vllm_root cd $vllm_root
docker build -f docker/Dockerfile . --tag vllm docker build -f docker/Dockerfile . --tag vllm
``` ```
If you are behind proxy, you can pass the proxy settings to the docker build command as shown below: If you are behind proxy, you can pass the proxy settings to the docker build command as shown below:
```console ```bash
cd $vllm_root cd $vllm_root
docker build \ docker build \
-f docker/Dockerfile . \ -f docker/Dockerfile . \
@ -78,7 +80,7 @@ docker build \
## Create Docker Network ## Create Docker Network
```console ```bash
docker network create vllm_nginx docker network create vllm_nginx
``` ```
@ -93,30 +95,32 @@ Notes:
- The below example assumes GPU backend used. If you are using CPU backend, remove `--gpus device=ID`, add `VLLM_CPU_KVCACHE_SPACE` and `VLLM_CPU_OMP_THREADS_BIND` environment variables to the docker run command. - The below example assumes GPU backend used. If you are using CPU backend, remove `--gpus device=ID`, add `VLLM_CPU_KVCACHE_SPACE` and `VLLM_CPU_OMP_THREADS_BIND` environment variables to the docker run command.
- Adjust the model name that you want to use in your vLLM servers if you don't want to use `Llama-2-7b-chat-hf`. - Adjust the model name that you want to use in your vLLM servers if you don't want to use `Llama-2-7b-chat-hf`.
```console ??? Commands
mkdir -p ~/.cache/huggingface/hub/
hf_cache_dir=~/.cache/huggingface/ ```console
docker run \ mkdir -p ~/.cache/huggingface/hub/
-itd \ hf_cache_dir=~/.cache/huggingface/
--ipc host \ docker run \
--network vllm_nginx \ -itd \
--gpus device=0 \ --ipc host \
--shm-size=10.24gb \ --network vllm_nginx \
-v $hf_cache_dir:/root/.cache/huggingface/ \ --gpus device=0 \
-p 8081:8000 \ --shm-size=10.24gb \
--name vllm0 vllm \ -v $hf_cache_dir:/root/.cache/huggingface/ \
--model meta-llama/Llama-2-7b-chat-hf -p 8081:8000 \
docker run \ --name vllm0 vllm \
-itd \ --model meta-llama/Llama-2-7b-chat-hf
--ipc host \ docker run \
--network vllm_nginx \ -itd \
--gpus device=1 \ --ipc host \
--shm-size=10.24gb \ --network vllm_nginx \
-v $hf_cache_dir:/root/.cache/huggingface/ \ --gpus device=1 \
-p 8082:8000 \ --shm-size=10.24gb \
--name vllm1 vllm \ -v $hf_cache_dir:/root/.cache/huggingface/ \
--model meta-llama/Llama-2-7b-chat-hf -p 8082:8000 \
``` --name vllm1 vllm \
--model meta-llama/Llama-2-7b-chat-hf
```
!!! note !!! note
If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`. If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.
@ -125,7 +129,7 @@ docker run \
## Launch Nginx ## Launch Nginx
```console ```bash
docker run \ docker run \
-itd \ -itd \
-p 8000:80 \ -p 8000:80 \
@ -138,7 +142,7 @@ docker run \
## Verify That vLLM Servers Are Ready ## Verify That vLLM Servers Are Ready
```console ```bash
docker logs vllm0 | grep Uvicorn docker logs vllm0 | grep Uvicorn
docker logs vllm1 | grep Uvicorn docker logs vllm1 | grep Uvicorn
``` ```

View File

@ -22,31 +22,33 @@ server.
Here is a sample of `LLM` class usage: Here is a sample of `LLM` class usage:
```python ??? Code
from vllm import LLM, SamplingParams
# Define a list of input prompts ```python
prompts = [ from vllm import LLM, SamplingParams
"Hello, my name is",
"The capital of France is",
"The largest ocean is",
]
# Define sampling parameters # Define a list of input prompts
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) prompts = [
"Hello, my name is",
"The capital of France is",
"The largest ocean is",
]
# Initialize the LLM engine with the OPT-125M model # Define sampling parameters
llm = LLM(model="facebook/opt-125m") sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Generate outputs for the input prompts # Initialize the LLM engine with the OPT-125M model
outputs = llm.generate(prompts, sampling_params) llm = LLM(model="facebook/opt-125m")
# Print the generated outputs # Generate outputs for the input prompts
for output in outputs: outputs = llm.generate(prompts, sampling_params)
prompt = output.prompt
generated_text = output.outputs[0].text # Print the generated outputs
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") for output in outputs:
``` prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs. More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs.
@ -178,32 +180,34 @@ vision-language model.
To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one: To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one:
```python ??? Code
class MyOldModel(nn.Module):
def __init__(
self,
config,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
lora_config: Optional[LoRAConfig] = None,
prefix: str = "",
) -> None:
...
from vllm.config import VllmConfig ```python
class MyNewModel(MyOldModel): class MyOldModel(nn.Module):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(
config = vllm_config.model_config.hf_config self,
cache_config = vllm_config.cache_config config,
quant_config = vllm_config.quant_config cache_config: Optional[CacheConfig] = None,
lora_config = vllm_config.lora_config quant_config: Optional[QuantizationConfig] = None,
super().__init__(config, cache_config, quant_config, lora_config, prefix) lora_config: Optional[LoRAConfig] = None,
prefix: str = "",
) -> None:
...
if __version__ >= "0.6.4": from vllm.config import VllmConfig
MyModel = MyNewModel class MyNewModel(MyOldModel):
else: def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
MyModel = MyOldModel config = vllm_config.model_config.hf_config
``` cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
super().__init__(config, cache_config, quant_config, lora_config, prefix)
if __version__ >= "0.6.4":
MyModel = MyNewModel
else:
MyModel = MyOldModel
```
This way, the model can work with both old and new versions of vLLM. This way, the model can work with both old and new versions of vLLM.

View File

@ -448,27 +448,29 @@ elements of the entire head for all context tokens. However, overall,
all results for output have been calculated but are just stored in all results for output have been calculated but are just stored in
different thread register memory. different thread register memory.
```cpp ??? Code
float* out_smem = reinterpret_cast<float*>(shared_mem);
for (int i = NUM_WARPS; i > 1; i /= 2) {
// Upper warps write to shared memory.
...
float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
...
dst[row_idx] = accs[i];
}
// Lower warps update the output. ```cpp
const float* src = &out_smem[warp_idx * HEAD_SIZE]; float* out_smem = reinterpret_cast<float*>(shared_mem);
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { for (int i = NUM_WARPS; i > 1; i /= 2) {
// Upper warps write to shared memory.
... ...
accs[i] += src[row_idx]; float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
} for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
...
dst[row_idx] = accs[i];
}
// Write out the accs. // Lower warps update the output.
} const float* src = &out_smem[warp_idx * HEAD_SIZE];
``` for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
...
accs[i] += src[row_idx];
}
// Write out the accs.
}
```
## Output ## Output

View File

@ -13,28 +13,30 @@ Plugins are user-registered code that vLLM executes. Given vLLM's architecture (
vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin: vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin:
```python ??? Code
# inside `setup.py` file
from setuptools import setup
setup(name='vllm_add_dummy_model', ```python
version='0.1', # inside `setup.py` file
packages=['vllm_add_dummy_model'], from setuptools import setup
entry_points={
'vllm.general_plugins':
["register_dummy_model = vllm_add_dummy_model:register"]
})
# inside `vllm_add_dummy_model.py` file setup(name='vllm_add_dummy_model',
def register(): version='0.1',
from vllm import ModelRegistry packages=['vllm_add_dummy_model'],
entry_points={
'vllm.general_plugins':
["register_dummy_model = vllm_add_dummy_model:register"]
})
if "MyLlava" not in ModelRegistry.get_supported_archs(): # inside `vllm_add_dummy_model.py` file
ModelRegistry.register_model( def register():
"MyLlava", from vllm import ModelRegistry
"vllm_add_dummy_model.my_llava:MyLlava",
) if "MyLlava" not in ModelRegistry.get_supported_archs():
``` ModelRegistry.register_model(
"MyLlava",
"vllm_add_dummy_model.my_llava:MyLlava",
)
```
For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html). For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).

View File

@ -0,0 +1,357 @@
An implementation of xPyD with dynamic scaling based on point-to-point communication, partly inspired by Dynamo.
# Detailed Design
## Overall Process
As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow:
1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface.
2. The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** through either round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**.
3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**.
4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`.
5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**.
6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**.
7. After completing **Decode**, the D instance returns the result to the **Proxy/Router**, which then forwards it to the **client**.
![image1](https://github.com/user-attachments/assets/fb01bde6-755b-49f7-ad45-48a94b1e10a7)
## Proxy/Router (Demo)
A simple HTTP service acts as the entry point for client requests and starts a background thread to listen for P/D instances reporting their HTTP IP and PORT, as well as ZMQ IP and PORT. It maintains a dictionary of `http_addr -> zmq_addr`. The `http_addr` is the IP:PORT for the vLLM instance's request, while the `zmq_addr` is the address for KV cache handshake and metadata reception.
The Proxy/Router is responsible for selecting 1P1D based on the characteristics of the client request, such as the prompt, and generating a corresponding `request_id`, for example:
```
cmpl-___prefill_addr_10.0.1.2:21001___decode_addr_10.0.1.3:22001_93923d63113b4b338973f24d19d4bf11-0
```
Currently, to quickly verify whether xPyD can work, a round-robin selection of 1P1D is used. In the future, it is planned to use a trie combined with the load status of instances to select appropriate P and D.
Each P/D instance periodically sends a heartbeat packet to the Proxy/Router (currently every 3 seconds) to register (i.e., report `http_addr -> zmq_addr`) and keep the connection alive. If an instance crashes and fails to send a ping for a certain period of time, the Proxy/Router will remove the timed-out instance (this feature has not yet been developed).
## KV Cache Transfer Methods
There are three methods for KVcache transfer: PUT, GET, and PUT_ASYNC. These methods can be specified using the `--kv-transfer-config` and `kv_connector_extra_config` parameters, specifically through the `send_type` field. Both PUT and PUT_ASYNC involve the P instance actively sending KVcache to the D instance. The difference is that PUT is a synchronous transfer method that blocks the main process, while PUT_ASYNC is an asynchronous transfer method. PUT_ASYNC uses a dedicated thread for sending KVcache, which means it does not block the main process. In contrast, the GET method involves the P instance saving the KVcache to the memory buffer after computing the prefill. The D instance then actively retrieves the computed KVcache from the P instance once it has allocated space for the KVcache.
Experimental results have shown that the performance of these methods, from highest to lowest, is as follows: PUT_ASYNC → GET → PUT.
## P2P Communication via ZMQ & NCCL
As long as the address of the counterpart is known, point-to-point KV cache transfer (using NCCL) can be performed, without being constrained by rank and world size. This supports dynamic scaling (expansion and contraction) of instances with PD disaggregation, which means that adding or removing P/D instances does not require a full system restart.
Each P/D instance only needs to create a single `P2pNcclEngine` instance. This instance maintains a ZMQ Server, which runs a dedicated thread to listen on the `zmq_addr` address and receive control flow requests from other instances. These requests include requests to establish an NCCL connection and requests to send KVcache metadata (such as tensor shapes and data types). However, it does not actually transmit the KVcache data itself.
When a P instance and a D instance transmit KVcache for the first time, they need to establish a ZMQ connection and an NCCL group. For subsequent KVcache transmissions, this ZMQ connection and NCCL group are reused. The NCCL group consists of only two ranks, meaning the world size is equal to 2. This design is intended to support dynamic scaling, which means that adding or removing P/D instances does not require a full system restart. As long as the address of the counterpart is known, point-to-point KVcache transmission can be performed, without being restricted by rank or world size.
## NCCL Group Topology
Currently, only symmetric TP (Tensor Parallelism) methods are supported for KVcache transmission. Asymmetric TP and PP (Pipeline Parallelism) methods will be supported in the future. Figure 2 illustrates the 1P2D setup, where each instance has a TP (Tensor Parallelism) degree of 2. There are a total of 7 NCCL groups: three vLLM instances each have one NCCL group with TP=2. Additionally, the 0th GPU card of the P instance establishes an NCCL group with the 0th GPU card of each D instance. Similarly, the 1st GPU card of the P instance establishes an NCCL group with the 1st GPU card of each D instance.
![image2](https://github.com/user-attachments/assets/837e61d6-365e-4cbf-8640-6dd7ab295b36)
Each NCCL group occupies a certain amount of GPU memory buffer for communication, the size of which is primarily influenced by the `NCCL_MAX_NCHANNELS` environment variable. When `NCCL_MAX_NCHANNELS=16`, an NCCL group typically occupies 100MB, while when `NCCL_MAX_NCHANNELS=8`, it usually takes up 52MB. For large-scale xPyD configurations—such as DeepSeek's 96P144D—this implementation is currently not feasible. Moving forward, we are considering using RDMA for point-to-point communication and are also keeping an eye on UCCL.
## GPU Memory Buffer and Tensor Memory Pool
The trade-off in the size of the memory buffer is as follows: For P instances, the memory buffer is not required in PUT and PUT_ASYNC modes, but it is necessary in GET mode. For D instances, a memory buffer is needed in all three modes. The memory buffer for D instances should not be too large. Similarly, for P instances in GET mode, the memory buffer should also not be too large. The memory buffer of D instances is used to temporarily store KVcache sent by P instances. If it is too large, it will reduce the KVcache space available for normal inference by D instances, thereby decreasing the inference batch size and ultimately leading to a reduction in output throughput. The size of the memory buffer is configured by the parameter `kv_buffer_size`, measured in bytes, and is typically set to 5%–10% of the memory size.
If the `--max-num-seqs` parameter for P instances is set to a large value, due to the large batch size, P instances will generate a large amount of KVcache simultaneously. This may exceed the capacity of the memory buffer of D instances, resulting in KVcache loss. Once KVcache is lost, D instances need to recompute Prefill, which is equivalent to performing Prefill twice. Consequently, the time-to-first-token (TTFT) will significantly increase, leading to degraded performance.
To address the above issues, I have designed and developed a local Tensor memory pool for storing KVcache, inspired by the buddy system used in Linux memory modules. Since the memory is sufficiently large, typically in the TB range on servers, there is no need to consider prefix caching or using block-based designs to reuse memory, thereby saving space. When the memory buffer is insufficient, KVcache can be directly stored in the Tensor memory pool, and D instances can subsequently retrieve KVcache from it. The read and write speed is that of PCIe, with PCIe 4.0 having a speed of approximately 21 GB/s, which is usually faster than the Prefill speed. Otherwise, solutions like Mooncake and lmcache would not be necessary. The Tensor memory pool acts as a flood diversion area, typically unused except during sudden traffic surges. In the worst-case scenario, my solution performs no worse than the normal situation with a Cache store.
# Install vLLM
??? Commands
```shell
# Enter the home directory or your working directory.
cd /home
# Download the installation package, and I will update the commit-id in time. You can directly copy the command.
wget https://vllm-wheels.s3.us-west-2.amazonaws.com/9112b443a042d8d815880b8780633882ad32b183/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
# Download the code repository.
git clone -b xpyd-v1 https://github.com/Abatom/vllm.git
cd vllm
# Set the installation package path.
export VLLM_PRECOMPILED_WHEEL_LOCATION=/home/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
# installation
pip install -e . -v
```
# Run xPyD
## Instructions
- The following examples are run on an A800 (80GB) device, using the Meta-Llama-3.1-8B-Instruct model.
- Pay attention to the setting of the `kv_buffer_size` (in bytes). The empirical value is 10% of the GPU memory size. This is related to the kvcache size. If it is too small, the GPU memory buffer for temporarily storing the received kvcache will overflow, causing the kvcache to be stored in the tensor memory pool, which increases latency. If it is too large, the kvcache available for inference will be reduced, leading to a smaller batch size and decreased throughput.
- For Prefill instances, when using non-GET mode, the `kv_buffer_size` can be set to 1, as Prefill currently does not need to receive kvcache. However, when using GET mode, a larger `kv_buffer_size` is required because it needs to store the kvcache sent to the D instance.
- You may need to modify the `kv_buffer_size` and `port` in the following commands (if there is a conflict).
- `PUT_ASYNC` offers the best performance and should be prioritized.
- The `--port` must be consistent with the `http_port` in the `--kv-transfer-config`.
- The `disagg_prefill_proxy_xpyd.py` script will use port 10001 (for receiving client requests) and port 30001 (for receiving service discovery from P and D instances).
- The node running the proxy must have `quart` installed.
- Supports multiple nodes; you just need to modify the `proxy_ip` and `proxy_port` in `--kv-transfer-config`.
- In the following examples, it is assumed that **the proxy's IP is 10.0.1.1**.
## Run 1P3D
### Proxy (e.g. 10.0.1.1)
```shell
cd {your vllm directory}/examples/online_serving/disagg_xpyd/
python3 disagg_prefill_proxy_xpyd.py &
```
### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1)
??? Command
```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20005 \
--tensor-parallel-size 1 \
--seed 1024 \
--served-model-name base_model \
--dtype float16 \
--max-model-len 10000 \
--max-num-batched-tokens 10000 \
--max-num-seqs 256 \
--trust-remote-code \
--gpu-memory-utilization 0.9 \
--disable-log-request \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
### Decode1 (e.g. 10.0.1.3 or 10.0.1.1)
??? Command
```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20009 \
--tensor-parallel-size 1 \
--seed 1024 \
--served-model-name base_model \
--dtype float16 \
--max-model-len 10000 \
--max-num-batched-tokens 10000 \
--max-num-seqs 256 \
--trust-remote-code \
--gpu-memory-utilization 0.7 \
--disable-log-request \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
### Decode2 (e.g. 10.0.1.4 or 10.0.1.1)
??? Command
```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20003 \
--tensor-parallel-size 1 \
--seed 1024 \
--served-model-name base_model \
--dtype float16 \
--max-model-len 10000 \
--max-num-batched-tokens 10000 \
--max-num-seqs 256 \
--trust-remote-code \
--gpu-memory-utilization 0.7 \
--disable-log-request \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
### Decode3 (e.g. 10.0.1.5 or 10.0.1.1)
??? Command
```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20008 \
--tensor-parallel-size 1 \
--seed 1024 \
--served-model-name base_model \
--dtype float16 \
--max-model-len 10000 \
--max-num-batched-tokens 10000 \
--max-num-seqs 256 \
--trust-remote-code \
--gpu-memory-utilization 0.7 \
--disable-log-request \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
## Run 3P1D
### Proxy (e.g. 10.0.1.1)
```shell
cd {your vllm directory}/examples/online_serving/disagg_xpyd/
python3 disagg_prefill_proxy_xpyd.py &
```
### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1)
??? Command
```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20005 \
--tensor-parallel-size 1 \
--seed 1024 \
--served-model-name base_model \
--dtype float16 \
--max-model-len 10000 \
--max-num-batched-tokens 10000 \
--max-num-seqs 256 \
--trust-remote-code \
--gpu-memory-utilization 0.9 \
--disable-log-request \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
### Prefill2 (e.g. 10.0.1.3 or 10.0.1.1)
??? Command
```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20009 \
--tensor-parallel-size 1 \
--seed 1024 \
--served-model-name base_model \
--dtype float16 \
--max-model-len 10000 \
--max-num-batched-tokens 10000 \
--max-num-seqs 256 \
--trust-remote-code \
--gpu-memory-utilization 0.9 \
--disable-log-request \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
### Prefill3 (e.g. 10.0.1.4 or 10.0.1.1)
??? Command
```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20003 \
--tensor-parallel-size 1 \
--seed 1024 \
--served-model-name base_model \
--dtype float16 \
--max-model-len 10000 \
--max-num-batched-tokens 10000 \
--max-num-seqs 256 \
--trust-remote-code \
--gpu-memory-utilization 0.9 \
--disable-log-request \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
### Decode1 (e.g. 10.0.1.5 or 10.0.1.1)
??? Command
```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20008 \
--tensor-parallel-size 1 \
--seed 1024 \
--served-model-name base_model \
--dtype float16 \
--max-model-len 10000 \
--max-num-batched-tokens 10000 \
--max-num-seqs 256 \
--trust-remote-code \
--gpu-memory-utilization 0.7 \
--disable-log-request \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
# Single request
```shell
curl -X POST -s http://10.0.1.1:10001/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "base_model",
"prompt": "San Francisco is a",
"max_tokens": 10,
"temperature": 0
}'
```
# Benchmark
??? Command
```shell
python3 benchmark_serving.py \
--backend vllm \
--model base_model \
--tokenizer meta-llama/Llama-3.1-8B-Instruct \
--dataset-name "random" \
--host 10.0.1.1 \
--port 10001 \
--random-input-len 1024 \
--random-output-len 1024 \
--ignore-eos \
--burstiness 100 \
--percentile-metrics "ttft,tpot,itl,e2el" \
--metric-percentiles "90,95,99" \
--seed $(date +%s) \
--trust-remote-code \
--request-rate 3 \
--num-prompts 1000
```
# Shut down
```shell
pgrep python | xargs kill -9 && pkill -f python
```
# Test data
## **Scenario 1**: 1K input & 1K output tokens, E2E P99 latency ~20s
- **1P5D (6×A800) vs vLLM (1×A800)**:
- Throughput ↑7.2% (1085 → 6979/6)
- ITL (P99) ↓81.3% (120ms → 22.9ms)
- TTFT (P99) ↑26.8% (175ms → 222ms)
- TPOT: No change
- **1P6D (7×A800) vs vLLM (1×A800)**:
- Throughput ↑9.6% (1085 → 8329/7)
- ITL (P99) ↓81.0% (120ms → 22.7ms)
- TTFT (P99) ↑210% (175ms → 543ms)
- TPOT: No change
## **Scenario 2**: 1K input & 200 output tokens, E2E P99 latency ~4s
- **1P1D (2×A800) vs vLLM (1×A800)**:
- Throughput ↑37.4% (537 → 1476/2)
- ITL (P99) ↓81.8% (127ms → 23.1ms)
- TTFT (P99) ↑41.8% (160ms → 227ms)
- TPOT: No change
![testdata](https://github.com/user-attachments/assets/f791bfc7-9f3d-4e5c-9171-a42f9f4da627)

View File

@ -117,8 +117,8 @@ There are two design points to highlight:
1. We allocate all KVCacheBlock when initializing the KV cache manager to be a block pool. This avoids Python object creation overheads and can easily track all blocks all the time. 1. We allocate all KVCacheBlock when initializing the KV cache manager to be a block pool. This avoids Python object creation overheads and can easily track all blocks all the time.
2. We introduce doubly linked list pointers directly in the KVCacheBlock, so that we could construct a free queue directly. This gives us two benefits: 2. We introduce doubly linked list pointers directly in the KVCacheBlock, so that we could construct a free queue directly. This gives us two benefits:
1. We could have O(1) complexity moving elements in the middle to the tail. 1. We could have O(1) complexity moving elements in the middle to the tail.
2. We could avoid introducing another Python queue (e.g., `deque`) which has a wrapper to the elements. 2. We could avoid introducing another Python queue (e.g., `deque`) which has a wrapper to the elements.
As a result, we will have the following components when the KV cache manager is initialized: As a result, we will have the following components when the KV cache manager is initialized:
@ -135,19 +135,19 @@ As a result, we will have the following components when the KV cache manager is
**New request:** Workflow for the scheduler to schedule a new request with KV cache block allocation: **New request:** Workflow for the scheduler to schedule a new request with KV cache block allocation:
1. The scheduler calls `kv_cache_manager.get_computed_blocks()` to get a sequence of blocks that have already been computed. This is done by hashing the prompt tokens in the request and looking up Cache Blocks. 1. The scheduler calls `kv_cache_manager.get_computed_blocks()` to get a sequence of blocks that have already been computed. This is done by hashing the prompt tokens in the request and looking up cache blocks.
2. The scheduler calls `kv_cache_manager.allocate_slots()`. It does the following steps: 2. The scheduler calls `kv_cache_manager.allocate_slots()`. It does the following steps:
1. Compute the number of new required blocks, and return if there are no sufficient blocks to allocate. 1. Compute the number of new required blocks, and return if there are no sufficient blocks to allocate.
2. “Touch” the computed blocks. It increases the reference count of the computed block by one, and removes the block from the free queue if the block wasn’t used by other requests. This is to avoid these computed blocks being evicted. See the example in the next section for illustration. 2. “Touch” the computed blocks. It increases the reference count of the computed block by one, and removes the block from the free queue if the block wasn’t used by other requests. This is to avoid these computed blocks being evicted. See the example in the next section for illustration.
3. Allocate new blocks by popping the heads of the free queue. If the head block is a cached block, this also “evicts” the block so that no other requests can reuse it anymore from now on. 3. Allocate new blocks by popping the heads of the free queue. If the head block is a cached block, this also “evicts” the block so that no other requests can reuse it anymore from now on.
4. If an allocated block is already full of tokens, we immediately add it to the Cache Block, so that the block can be reused by other requests in the same batch. 4. If an allocated block is already full of tokens, we immediately add it to the cache block, so that the block can be reused by other requests in the same batch.
**Running request:** Workflow for the scheduler to schedule a running request with KV cache block allocation: **Running request:** Workflow for the scheduler to schedule a running request with KV cache block allocation:
1. The scheduler calls `kv_cache_manager.allocate_slots()`. It does the following steps: 1. The scheduler calls `kv_cache_manager.allocate_slots()`. It does the following steps:
1. Compute the number of new required blocks, and return if there are no sufficient blocks to allocate. 1. Compute the number of new required blocks, and return if there are no sufficient blocks to allocate.
2. Allocate new blocks by popping the heads of the free queue. If the head block is a cached block, this also “evicts” the block so that no other requests can reuse it anymore from now on. 2. Allocate new blocks by popping the heads of the free queue. If the head block is a cached block, this also “evicts” the block so that no other requests can reuse it anymore from now on.
3. Append token IDs to the slots in existing blocks as well as the new blocks. If a block is full, we add it to the Cache Block to cache it. 3. Append token IDs to the slots in existing blocks as well as the new blocks. If a block is full, we add it to the cache block to cache it.
**Duplicated blocks** **Duplicated blocks**
Assuming block size is 4 and you send a request (Request 1\) with prompt ABCDEF and decoding length 3: Assuming block size is 4 and you send a request (Request 1\) with prompt ABCDEF and decoding length 3:
@ -199,7 +199,7 @@ When a request is finished, we free all its blocks if no other requests are usin
When the head block (least recently used block) of the free queue is cached, we have to evict the block to prevent it from being used by other requests. Specifically, eviction involves the following steps: When the head block (least recently used block) of the free queue is cached, we have to evict the block to prevent it from being used by other requests. Specifically, eviction involves the following steps:
1. Pop the block from the head of the free queue. This is the LRU block to be evicted. 1. Pop the block from the head of the free queue. This is the LRU block to be evicted.
2. Remove the block ID from the Cache Block. 2. Remove the block ID from the cache block.
3. Remove the block hash. 3. Remove the block hash.
## Example ## Example

View File

@ -28,27 +28,29 @@ A unique aspect of vLLM's `torch.compile` integration, is that we guarantee all
In the very verbose logs, we can see: In the very verbose logs, we can see:
``` ??? Logs
DEBUG 03-07 03:06:52 [decorators.py:203] Start compiling function <code object forward at 0x7f08acf40c90, file "xxx/vllm/model_executor/models/llama.py", line 339>
DEBUG 03-07 03:06:54 [backends.py:370] Traced files (to be considered for compilation cache): ```text
DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/_dynamo/polyfills/builtins.py DEBUG 03-07 03:06:52 [decorators.py:203] Start compiling function <code object forward at 0x7f08acf40c90, file "xxx/vllm/model_executor/models/llama.py", line 339>
DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/nn/modules/container.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/nn/modules/module.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/attention/layer.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/distributed/communication_op.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/distributed/parallel_state.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/custom_op.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/activation.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/layernorm.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/linear.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/rotary_embedding.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/vocab_parallel_embedding.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/models/llama.py
DEBUG 03-07 03:07:07 [backends.py:462] Computation graph saved to ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/computation_graph.py DEBUG 03-07 03:06:54 [backends.py:370] Traced files (to be considered for compilation cache):
DEBUG 03-07 03:07:07 [wrapper.py:105] Dynamo transformed code saved to ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/transformed_code.py DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/_dynamo/polyfills/builtins.py
``` DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/nn/modules/container.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/nn/modules/module.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/attention/layer.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/distributed/communication_op.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/distributed/parallel_state.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/custom_op.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/activation.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/layernorm.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/linear.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/rotary_embedding.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/vocab_parallel_embedding.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/models/llama.py
DEBUG 03-07 03:07:07 [backends.py:462] Computation graph saved to ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/computation_graph.py
DEBUG 03-07 03:07:07 [wrapper.py:105] Dynamo transformed code saved to ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/transformed_code.py
```
This is about the Python code compilation, i.e. graph capture by Dynamo. It tries to trace the function with code `xxx/vllm/model_executor/models/llama.py:339`, which is the `forward` function of the model we compile. During the forward pass, there are also other functions called and inlined by Dynamo, as shown by the logs, including some PyTorch functions from `xxx/torch/nn/modules/module.py` (used by PyTorch `nn.Module`, because module attribute access will trigger a function call), some communication / attention / activation functions from vLLM. All the traced files will be considered when we decide the cache directory to use. This way, any code change in the above files will trigger compilation cache miss, and therefore recompilation. This is about the Python code compilation, i.e. graph capture by Dynamo. It tries to trace the function with code `xxx/vllm/model_executor/models/llama.py:339`, which is the `forward` function of the model we compile. During the forward pass, there are also other functions called and inlined by Dynamo, as shown by the logs, including some PyTorch functions from `xxx/torch/nn/modules/module.py` (used by PyTorch `nn.Module`, because module attribute access will trigger a function call), some communication / attention / activation functions from vLLM. All the traced files will be considered when we decide the cache directory to use. This way, any code change in the above files will trigger compilation cache miss, and therefore recompilation.
@ -99,28 +101,31 @@ This time, Inductor compilation is completely bypassed, and we will load from di
The above example just uses Inductor to compile for a general shape (i.e. symbolic shape). We can also use Inductor to compile for some of the specific shapes, for example: The above example just uses Inductor to compile for a general shape (i.e. symbolic shape). We can also use Inductor to compile for some of the specific shapes, for example:
``` ```bash
vllm serve meta-llama/Llama-3.2-1B --compilation_config '{"compile_sizes": [1, 2, 4, 8]}' vllm serve meta-llama/Llama-3.2-1B \
--compilation_config '{"compile_sizes": [1, 2, 4, 8]}'
``` ```
Then it will also compile a specific kernel just for batch size `1, 2, 4, 8`. At this time, all of the shapes in the computation graph are static and known, and we will turn on auto-tuning to tune for max performance. This can be slow when you run it for the first time, but the next time you run it, we can directly bypass the tuning and run the tuned kernel. Then it will also compile a specific kernel just for batch size `1, 2, 4, 8`. At this time, all of the shapes in the computation graph are static and known, and we will turn on auto-tuning to tune for max performance. This can be slow when you run it for the first time, but the next time you run it, we can directly bypass the tuning and run the tuned kernel.
When all the shapes are known, `torch.compile` can compare different configs, and often find some better configs to run the kernel. For example, we can see the following log: When all the shapes are known, `torch.compile` can compare different configs, and often find some better configs to run the kernel. For example, we can see the following log:
``` ??? Logs
AUTOTUNE mm(8x2048, 2048x3072)
triton_mm_4 0.0130 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 ```
triton_mm_8 0.0134 ms 97.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 AUTOTUNE mm(8x2048, 2048x3072)
triton_mm_12 0.0148 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 triton_mm_4 0.0130 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
mm 0.0160 ms 81.6% triton_mm_8 0.0134 ms 97.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
triton_mm_16 0.0165 ms 78.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 triton_mm_12 0.0148 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4
triton_mm_3 0.0199 ms 65.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 mm 0.0160 ms 81.6%
triton_mm_1 0.0203 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2 triton_mm_16 0.0165 ms 78.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
triton_mm_7 0.0203 ms 64.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_3 0.0199 ms 65.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
triton_mm_2 0.0208 ms 62.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 triton_mm_1 0.0203 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2
triton_mm_11 0.0215 ms 60.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_7 0.0203 ms 64.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
SingleProcess AUTOTUNE benchmarking takes 2.0428 seconds and 7.5727 seconds precompiling triton_mm_2 0.0208 ms 62.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
``` triton_mm_11 0.0215 ms 60.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
SingleProcess AUTOTUNE benchmarking takes 2.0428 seconds and 7.5727 seconds precompiling
```
It means, for a matrix multiplication with shape `8x2048x3072`, `torch.compile` tries triton template with various configs, and it is much faster than the default code (which dispatches to cublas library). It means, for a matrix multiplication with shape `8x2048x3072`, `torch.compile` tries triton template with various configs, and it is much faster than the default code (which dispatches to cublas library).
@ -136,8 +141,9 @@ The cudagraphs are captured and managed by the compiler backend, and replayed wh
By default, vLLM will try to determine a set of sizes to capture cudagraph. You can also override it using the config `cudagraph_capture_sizes`: By default, vLLM will try to determine a set of sizes to capture cudagraph. You can also override it using the config `cudagraph_capture_sizes`:
``` ```bash
vllm serve meta-llama/Llama-3.2-1B --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}' vllm serve meta-llama/Llama-3.2-1B \
--compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}'
``` ```
Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture. Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture.

View File

@ -29,24 +29,26 @@ We can now submit the prompts and call `llm.generate` with the `lora_request` pa
of `LoRARequest` is a human identifiable name, the second parameter is a globally unique ID for the adapter and of `LoRARequest` is a human identifiable name, the second parameter is a globally unique ID for the adapter and
the third parameter is the path to the LoRA adapter. the third parameter is the path to the LoRA adapter.
```python ??? Code
sampling_params = SamplingParams(
temperature=0,
max_tokens=256,
stop=["[/assistant]"]
)
prompts = [ ```python
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", sampling_params = SamplingParams(
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", temperature=0,
] max_tokens=256,
stop=["[/assistant]"]
)
outputs = llm.generate( prompts = [
prompts, "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
sampling_params, "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
lora_request=LoRARequest("sql_adapter", 1, sql_lora_path) ]
)
``` outputs = llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
)
```
Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
@ -68,24 +70,26 @@ The server entrypoint accepts all other LoRA configuration parameters (`max_lora
etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along
with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it.): with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it.):
```bash ??? Command
curl localhost:8000/v1/models | jq .
{ ```bash
"object": "list", curl localhost:8000/v1/models | jq .
"data": [ {
{ "object": "list",
"id": "meta-llama/Llama-2-7b-hf", "data": [
"object": "model", {
... "id": "meta-llama/Llama-2-7b-hf",
}, "object": "model",
{ ...
"id": "sql-lora", },
"object": "model", {
... "id": "sql-lora",
} "object": "model",
] ...
} }
``` ]
}
```
Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be
processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other
@ -168,36 +172,36 @@ Alternatively, follow these example steps to implement your own plugin:
1. Implement the LoRAResolver interface. 1. Implement the LoRAResolver interface.
Example of a simple S3 LoRAResolver implementation: ??? Example of a simple S3 LoRAResolver implementation
```python ```python
import os import os
import s3fs import s3fs
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver from vllm.lora.resolver import LoRAResolver
class S3LoRAResolver(LoRAResolver): class S3LoRAResolver(LoRAResolver):
def __init__(self): def __init__(self):
self.s3 = s3fs.S3FileSystem() self.s3 = s3fs.S3FileSystem()
self.s3_path_format = os.getenv("S3_PATH_TEMPLATE") self.s3_path_format = os.getenv("S3_PATH_TEMPLATE")
self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE") self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE")
async def resolve_lora(self, base_model_name, lora_name): async def resolve_lora(self, base_model_name, lora_name):
s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name) s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name) local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
# Download the LoRA from S3 to the local path # Download the LoRA from S3 to the local path
await self.s3._get( await self.s3._get(
s3_path, local_path, recursive=True, maxdepth=1 s3_path, local_path, recursive=True, maxdepth=1
) )
lora_request = LoRARequest( lora_request = LoRARequest(
lora_name=lora_name, lora_name=lora_name,
lora_path=local_path, lora_path=local_path,
lora_int_id=abs(hash(lora_name)) lora_int_id=abs(hash(lora_name))
) )
return lora_request return lora_request
``` ```
2. Register `LoRAResolver` plugin. 2. Register `LoRAResolver` plugin.
@ -234,38 +238,40 @@ The new format of `--lora-modules` is mainly to support the display of parent mo
- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter. - The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter.
- The `root` field points to the artifact location of the LoRA adapter. - The `root` field points to the artifact location of the LoRA adapter.
```bash ??? Command output
$ curl http://localhost:8000/v1/models
{ ```bash
"object": "list", $ curl http://localhost:8000/v1/models
"data": [
{ {
"id": "meta-llama/Llama-2-7b-hf", "object": "list",
"object": "model", "data": [
"created": 1715644056,
"owned_by": "vllm",
"root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
"parent": null,
"permission": [
{ {
..... "id": "meta-llama/Llama-2-7b-hf",
"object": "model",
"created": 1715644056,
"owned_by": "vllm",
"root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
"parent": null,
"permission": [
{
.....
}
]
},
{
"id": "sql-lora",
"object": "model",
"created": 1715644056,
"owned_by": "vllm",
"root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/",
"parent": "meta-llama/Llama-2-7b-hf",
"permission": [
{
....
}
]
} }
] ]
}, }
{ ```
"id": "sql-lora",
"object": "model",
"created": 1715644056,
"owned_by": "vllm",
"root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/",
"parent": "meta-llama/Llama-2-7b-hf",
"permission": [
{
....
}
]
}
]
}
```

View File

@ -20,111 +20,117 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:
You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples: You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
```python ??? Code
from vllm import LLM
llm = LLM(model="llava-hf/llava-1.5-7b-hf") ```python
from vllm import LLM
# Refer to the HuggingFace repo for the correct format to use llm = LLM(model="llava-hf/llava-1.5-7b-hf")
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
# Load the image using PIL.Image # Refer to the HuggingFace repo for the correct format to use
image = PIL.Image.open(...) prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
# Single prompt inference # Load the image using PIL.Image
outputs = llm.generate({ image = PIL.Image.open(...)
"prompt": prompt,
"multi_modal_data": {"image": image},
})
for o in outputs: # Single prompt inference
generated_text = o.outputs[0].text outputs = llm.generate({
print(generated_text) "prompt": prompt,
"multi_modal_data": {"image": image},
})
# Batch inference for o in outputs:
image_1 = PIL.Image.open(...) generated_text = o.outputs[0].text
image_2 = PIL.Image.open(...) print(generated_text)
outputs = llm.generate(
[
{
"prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
"multi_modal_data": {"image": image_1},
},
{
"prompt": "USER: <image>\nWhat's the color of this image?\nASSISTANT:",
"multi_modal_data": {"image": image_2},
}
]
)
for o in outputs: # Batch inference
generated_text = o.outputs[0].text image_1 = PIL.Image.open(...)
print(generated_text) image_2 = PIL.Image.open(...)
``` outputs = llm.generate(
[
{
"prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
"multi_modal_data": {"image": image_1},
},
{
"prompt": "USER: <image>\nWhat's the color of this image?\nASSISTANT:",
"multi_modal_data": {"image": image_2},
}
]
)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
```
Full example: <gh-file:examples/offline_inference/vision_language.py> Full example: <gh-file:examples/offline_inference/vision_language.py>
To substitute multiple images inside the same text prompt, you can pass in a list of images instead: To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
```python ??? Code
from vllm import LLM
llm = LLM( ```python
model="microsoft/Phi-3.5-vision-instruct", from vllm import LLM
trust_remote_code=True, # Required to load Phi-3.5-vision
max_model_len=4096, # Otherwise, it may not fit in smaller GPUs
limit_mm_per_prompt={"image": 2}, # The maximum number to accept
)
# Refer to the HuggingFace repo for the correct format to use llm = LLM(
prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n" model="microsoft/Phi-3.5-vision-instruct",
trust_remote_code=True, # Required to load Phi-3.5-vision
max_model_len=4096, # Otherwise, it may not fit in smaller GPUs
limit_mm_per_prompt={"image": 2}, # The maximum number to accept
)
# Load the images using PIL.Image # Refer to the HuggingFace repo for the correct format to use
image1 = PIL.Image.open(...) prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"
image2 = PIL.Image.open(...)
outputs = llm.generate({ # Load the images using PIL.Image
"prompt": prompt, image1 = PIL.Image.open(...)
"multi_modal_data": { image2 = PIL.Image.open(...)
"image": [image1, image2]
},
})
for o in outputs: outputs = llm.generate({
generated_text = o.outputs[0].text "prompt": prompt,
print(generated_text) "multi_modal_data": {
``` "image": [image1, image2]
},
})
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
```
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py> Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos: Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
```python ??? Code
from vllm import LLM
# Specify the maximum number of frames per video to be 4. This can be changed. ```python
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) from vllm import LLM
# Create the request payload. # Specify the maximum number of frames per video to be 4. This can be changed.
video_frames = ... # load your video making sure it only has the number of frames specified earlier. llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
message = {
"role": "user",
"content": [
{"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
],
}
for i in range(len(video_frames)):
base64_image = encode_image(video_frames[i]) # base64 encoding.
new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
message["content"].append(new_image)
# Perform inference and log output. # Create the request payload.
outputs = llm.chat([message]) video_frames = ... # load your video making sure it only has the number of frames specified earlier.
message = {
"role": "user",
"content": [
{"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
],
}
for i in range(len(video_frames)):
base64_image = encode_image(video_frames[i]) # base64 encoding.
new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
message["content"].append(new_image)
for o in outputs: # Perform inference and log output.
generated_text = o.outputs[0].text outputs = llm.chat([message])
print(generated_text)
``` for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
```
### Video Inputs ### Video Inputs
@ -144,68 +150,72 @@ Full example: <gh-file:examples/offline_inference/audio_language.py>
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model, To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary. pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
```python ??? Code
from vllm import LLM
# Inference with image embeddings as input ```python
llm = LLM(model="llava-hf/llava-1.5-7b-hf") from vllm import LLM
# Refer to the HuggingFace repo for the correct format to use # Inference with image embeddings as input
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:" llm = LLM(model="llava-hf/llava-1.5-7b-hf")
# Embeddings for single image # Refer to the HuggingFace repo for the correct format to use
# torch.Tensor of shape (1, image_feature_size, hidden_size of LM) prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
image_embeds = torch.load(...)
outputs = llm.generate({ # Embeddings for single image
"prompt": prompt, # torch.Tensor of shape (1, image_feature_size, hidden_size of LM)
"multi_modal_data": {"image": image_embeds}, image_embeds = torch.load(...)
})
for o in outputs: outputs = llm.generate({
generated_text = o.outputs[0].text "prompt": prompt,
print(generated_text) "multi_modal_data": {"image": image_embeds},
``` })
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
```
For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings: For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings:
```python ??? Code
# Construct the prompt based on your model
prompt = ...
# Embeddings for multiple images ```python
# torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM) # Construct the prompt based on your model
image_embeds = torch.load(...) prompt = ...
# Qwen2-VL # Embeddings for multiple images
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) # torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM)
mm_data = { image_embeds = torch.load(...)
"image": {
"image_embeds": image_embeds, # Qwen2-VL
# image_grid_thw is needed to calculate positional encoding. llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
"image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3), mm_data = {
"image": {
"image_embeds": image_embeds,
# image_grid_thw is needed to calculate positional encoding.
"image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3),
}
} }
}
# MiniCPM-V # MiniCPM-V
llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4}) llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
mm_data = { mm_data = {
"image": { "image": {
"image_embeds": image_embeds, "image_embeds": image_embeds,
# image_sizes is needed to calculate details of the sliced image. # image_sizes is needed to calculate details of the sliced image.
"image_sizes": [image.size for image in images], # list of image sizes "image_sizes": [image.size for image in images], # list of image sizes
}
} }
}
outputs = llm.generate({ outputs = llm.generate({
"prompt": prompt, "prompt": prompt,
"multi_modal_data": mm_data, "multi_modal_data": mm_data,
}) })
for o in outputs: for o in outputs:
generated_text = o.outputs[0].text generated_text = o.outputs[0].text
print(generated_text) print(generated_text)
``` ```
## Online Serving ## Online Serving
@ -235,51 +245,53 @@ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
Then, you can use the OpenAI client as follows: Then, you can use the OpenAI client as follows:
```python ??? Code
from openai import OpenAI
openai_api_key = "EMPTY" ```python
openai_api_base = "http://localhost:8000/v1" from openai import OpenAI
client = OpenAI( openai_api_key = "EMPTY"
api_key=openai_api_key, openai_api_base = "http://localhost:8000/v1"
base_url=openai_api_base,
)
# Single-image input inference client = OpenAI(
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" api_key=openai_api_key,
base_url=openai_api_base,
)
chat_response = client.chat.completions.create( # Single-image input inference
model="microsoft/Phi-3.5-vision-instruct", image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
messages=[{
"role": "user",
"content": [
# NOTE: The prompt formatting with the image token `<image>` is not needed
# since the prompt will be processed automatically by the API server.
{"type": "text", "text": "Whats in this image?"},
{"type": "image_url", "image_url": {"url": image_url}},
],
}],
)
print("Chat completion output:", chat_response.choices[0].message.content)
# Multi-image input inference chat_response = client.chat.completions.create(
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" model="microsoft/Phi-3.5-vision-instruct",
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" messages=[{
"role": "user",
"content": [
# NOTE: The prompt formatting with the image token `<image>` is not needed
# since the prompt will be processed automatically by the API server.
{"type": "text", "text": "Whats in this image?"},
{"type": "image_url", "image_url": {"url": image_url}},
],
}],
)
print("Chat completion output:", chat_response.choices[0].message.content)
chat_response = client.chat.completions.create( # Multi-image input inference
model="microsoft/Phi-3.5-vision-instruct", image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
messages=[{ image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
"role": "user",
"content": [ chat_response = client.chat.completions.create(
{"type": "text", "text": "What are the animals in these images?"}, model="microsoft/Phi-3.5-vision-instruct",
{"type": "image_url", "image_url": {"url": image_url_duck}}, messages=[{
{"type": "image_url", "image_url": {"url": image_url_lion}}, "role": "user",
], "content": [
}], {"type": "text", "text": "What are the animals in these images?"},
) {"type": "image_url", "image_url": {"url": image_url_duck}},
print("Chat completion output:", chat_response.choices[0].message.content) {"type": "image_url", "image_url": {"url": image_url_lion}},
``` ],
}],
)
print("Chat completion output:", chat_response.choices[0].message.content)
```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py> Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
@ -295,7 +307,7 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for
By default, the timeout for fetching images through HTTP URL is `5` seconds. By default, the timeout for fetching images through HTTP URL is `5` seconds.
You can override this by setting the environment variable: You can override this by setting the environment variable:
```console ```bash
export VLLM_IMAGE_FETCH_TIMEOUT=<timeout> export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
``` ```
@ -311,44 +323,46 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model
Then, you can use the OpenAI client as follows: Then, you can use the OpenAI client as follows:
```python ??? Code
from openai import OpenAI
openai_api_key = "EMPTY" ```python
openai_api_base = "http://localhost:8000/v1" from openai import OpenAI
client = OpenAI( openai_api_key = "EMPTY"
api_key=openai_api_key, openai_api_base = "http://localhost:8000/v1"
base_url=openai_api_base,
)
video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4" client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
## Use video url in the payload video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
chat_completion_from_url = client.chat.completions.create(
messages=[{ ## Use video url in the payload
"role": chat_completion_from_url = client.chat.completions.create(
"user", messages=[{
"content": [ "role":
{ "user",
"type": "text", "content": [
"text": "What's in this video?" {
}, "type": "text",
{ "text": "What's in this video?"
"type": "video_url",
"video_url": {
"url": video_url
}, },
}, {
], "type": "video_url",
}], "video_url": {
model=model, "url": video_url
max_completion_tokens=64, },
) },
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from video url:", result) print("Chat completion output from video url:", result)
``` ```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py> Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
@ -356,7 +370,7 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for
By default, the timeout for fetching videos through HTTP URL is `30` seconds. By default, the timeout for fetching videos through HTTP URL is `30` seconds.
You can override this by setting the environment variable: You can override this by setting the environment variable:
```console ```bash
export VLLM_VIDEO_FETCH_TIMEOUT=<timeout> export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
``` ```
@ -373,84 +387,88 @@ vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b
Then, you can use the OpenAI client as follows: Then, you can use the OpenAI client as follows:
```python ??? Code
import base64
import requests
from openai import OpenAI
from vllm.assets.audio import AudioAsset
def encode_base64_content_from_url(content_url: str) -> str: ```python
"""Encode a content retrieved from a remote url to base64 format.""" import base64
import requests
from openai import OpenAI
from vllm.assets.audio import AudioAsset
with requests.get(content_url) as response: def encode_base64_content_from_url(content_url: str) -> str:
response.raise_for_status() """Encode a content retrieved from a remote url to base64 format."""
result = base64.b64encode(response.content).decode('utf-8')
return result with requests.get(content_url) as response:
response.raise_for_status()
result = base64.b64encode(response.content).decode('utf-8')
openai_api_key = "EMPTY" return result
openai_api_base = "http://localhost:8000/v1"
client = OpenAI( openai_api_key = "EMPTY"
api_key=openai_api_key, openai_api_base = "http://localhost:8000/v1"
base_url=openai_api_base,
)
# Any format supported by librosa is supported client = OpenAI(
audio_url = AudioAsset("winning_call").url api_key=openai_api_key,
audio_base64 = encode_base64_content_from_url(audio_url) base_url=openai_api_base,
)
chat_completion_from_base64 = client.chat.completions.create( # Any format supported by librosa is supported
messages=[{ audio_url = AudioAsset("winning_call").url
"role": "user", audio_base64 = encode_base64_content_from_url(audio_url)
"content": [
{ chat_completion_from_base64 = client.chat.completions.create(
"type": "text", messages=[{
"text": "What's in this audio?" "role": "user",
}, "content": [
{ {
"type": "input_audio", "type": "text",
"input_audio": { "text": "What's in this audio?"
"data": audio_base64,
"format": "wav"
}, },
}, {
], "type": "input_audio",
}], "input_audio": {
model=model, "data": audio_base64,
max_completion_tokens=64, "format": "wav"
) },
},
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_base64.choices[0].message.content result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from input audio:", result) print("Chat completion output from input audio:", result)
``` ```
Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input: Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input:
```python ??? Code
chat_completion_from_url = client.chat.completions.create(
messages=[{
"role": "user",
"content": [
{
"type": "text",
"text": "What's in this audio?"
},
{
"type": "audio_url",
"audio_url": {
"url": audio_url
},
},
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content ```python
print("Chat completion output from audio url:", result) chat_completion_from_url = client.chat.completions.create(
``` messages=[{
"role": "user",
"content": [
{
"type": "text",
"text": "What's in this audio?"
},
{
"type": "audio_url",
"audio_url": {
"url": audio_url
},
},
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from audio url:", result)
```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py> Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
@ -458,7 +476,7 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for
By default, the timeout for fetching audio through HTTP URL is `10` seconds. By default, the timeout for fetching audio through HTTP URL is `10` seconds.
You can override this by setting the environment variable: You can override this by setting the environment variable:
```console ```bash
export VLLM_AUDIO_FETCH_TIMEOUT=<timeout> export VLLM_AUDIO_FETCH_TIMEOUT=<timeout>
``` ```
@ -470,61 +488,63 @@ pass a tensor of shape to the corresponding field of the multi-modal dictionary.
For image embeddings, you can pass the base64-encoded tensor to the `image_embeds` field. For image embeddings, you can pass the base64-encoded tensor to the `image_embeds` field.
The following example demonstrates how to pass image embeddings to the OpenAI server: The following example demonstrates how to pass image embeddings to the OpenAI server:
```python ??? Code
image_embedding = torch.load(...)
grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct
buffer = io.BytesIO() ```python
torch.save(image_embedding, buffer) image_embedding = torch.load(...)
buffer.seek(0) grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct
binary_data = buffer.read()
base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')
client = OpenAI( buffer = io.BytesIO()
# defaults to os.environ.get("OPENAI_API_KEY") torch.save(image_embedding, buffer)
api_key=openai_api_key, buffer.seek(0)
base_url=openai_api_base, binary_data = buffer.read()
) base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')
# Basic usage - this is equivalent to the LLaVA example for offline inference client = OpenAI(
model = "llava-hf/llava-1.5-7b-hf" # defaults to os.environ.get("OPENAI_API_KEY")
embeds = { api_key=openai_api_key,
"type": "image_embeds", base_url=openai_api_base,
"image_embeds": f"{base64_image_embedding}" )
}
# Pass additional parameters (available to Qwen2-VL and MiniCPM-V) # Basic usage - this is equivalent to the LLaVA example for offline inference
model = "Qwen/Qwen2-VL-2B-Instruct" model = "llava-hf/llava-1.5-7b-hf"
embeds = { embeds = {
"type": "image_embeds", "type": "image_embeds",
"image_embeds": { "image_embeds": f"{base64_image_embedding}"
"image_embeds": f"{base64_image_embedding}" , # Required }
"image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct
}, # Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
} model = "Qwen/Qwen2-VL-2B-Instruct"
model = "openbmb/MiniCPM-V-2_6" embeds = {
embeds = { "type": "image_embeds",
"type": "image_embeds", "image_embeds": {
"image_embeds": { "image_embeds": f"{base64_image_embedding}" , # Required
"image_embeds": f"{base64_image_embedding}" , # Required "image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct
"image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6
},
}
chat_completion = client.chat.completions.create(
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
{
"type": "text",
"text": "What's in this image?",
}, },
embeds, }
], model = "openbmb/MiniCPM-V-2_6"
}, embeds = {
], "type": "image_embeds",
model=model, "image_embeds": {
) "image_embeds": f"{base64_image_embedding}" , # Required
``` "image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6
},
}
chat_completion = client.chat.completions.create(
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
{
"type": "text",
"text": "What's in this image?",
},
embeds,
],
},
],
model=model,
)
```
!!! note !!! note
Only one message can contain `{"type": "image_embeds"}`. Only one message can contain `{"type": "image_embeds"}`.

View File

@ -9,39 +9,41 @@ The main benefits are lower latency and memory usage.
You can quantize your own models by installing AutoAWQ or picking one of the [6500+ models on Hugging Face](https://huggingface.co/models?search=awq). You can quantize your own models by installing AutoAWQ or picking one of the [6500+ models on Hugging Face](https://huggingface.co/models?search=awq).
```console ```bash
pip install autoawq pip install autoawq
``` ```
After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
```python ??? Code
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
model_path = 'mistralai/Mistral-7B-Instruct-v0.2' ```python
quant_path = 'mistral-instruct-v0.2-awq' from awq import AutoAWQForCausalLM
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } from transformers import AutoTokenizer
# Load model model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
model = AutoAWQForCausalLM.from_pretrained( quant_path = 'mistral-instruct-v0.2-awq'
model_path, **{"low_cpu_mem_usage": True, "use_cache": False} quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Quantize # Load model
model.quantize(tokenizer, quant_config=quant_config) model = AutoAWQForCausalLM.from_pretrained(
model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Save quantized model # Quantize
model.save_quantized(quant_path) model.quantize(tokenizer, quant_config=quant_config)
tokenizer.save_pretrained(quant_path)
print(f'Model is quantized and saved at "{quant_path}"') # Save quantized model
``` model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
print(f'Model is quantized and saved at "{quant_path}"')
```
To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
```console ```bash
python examples/offline_inference/llm_engine_example.py \ python examples/offline_inference/llm_engine_example.py \
--model TheBloke/Llama-2-7b-Chat-AWQ \ --model TheBloke/Llama-2-7b-Chat-AWQ \
--quantization awq --quantization awq
@ -49,27 +51,29 @@ python examples/offline_inference/llm_engine_example.py \
AWQ models are also supported directly through the LLM entrypoint: AWQ models are also supported directly through the LLM entrypoint:
```python ??? Code
from vllm import LLM, SamplingParams
# Sample prompts. ```python
prompts = [ from vllm import LLM, SamplingParams
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM. # Sample prompts.
llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ") prompts = [
# Generate texts from the prompts. The output is a list of RequestOutput objects "Hello, my name is",
# that contain the prompt, generated text, and other information. "The president of the United States is",
outputs = llm.generate(prompts, sampling_params) "The capital of France is",
# Print the outputs. "The future of AI is",
for output in outputs: ]
prompt = output.prompt # Create a sampling params object.
generated_text = output.outputs[0].text sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
``` # Create an LLM.
llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

View File

@ -12,7 +12,7 @@ vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more effic
Below are the steps to utilize BitBLAS with vLLM. Below are the steps to utilize BitBLAS with vLLM.
```console ```bash
pip install "bitblas>=0.1.0" pip install "bitblas>=0.1.0"
``` ```
@ -43,17 +43,19 @@ llm = LLM(
## Read gptq format checkpoint ## Read gptq format checkpoint
```python ??? Code
from vllm import LLM
import torch
# "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint. ```python
model_id = "hxbgsyxh/llama-13b-4bit-g-1" from vllm import LLM
llm = LLM( import torch
model=model_id,
dtype=torch.float16, # "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
trust_remote_code=True, model_id = "hxbgsyxh/llama-13b-4bit-g-1"
quantization="bitblas", llm = LLM(
max_model_len=1024 model=model_id,
) dtype=torch.float16,
``` trust_remote_code=True,
quantization="bitblas",
max_model_len=1024
)
```

View File

@ -9,7 +9,7 @@ Compared to other quantization methods, BitsAndBytes eliminates the need for cal
Below are the steps to utilize BitsAndBytes with vLLM. Below are the steps to utilize BitsAndBytes with vLLM.
```console ```bash
pip install "bitsandbytes>=0.45.3" pip install "bitsandbytes>=0.45.3"
``` ```
@ -54,6 +54,6 @@ llm = LLM(
Append the following to your model arguments for 4bit inflight quantization: Append the following to your model arguments for 4bit inflight quantization:
```console ```bash
--quantization bitsandbytes --quantization bitsandbytes
``` ```

View File

@ -23,7 +23,7 @@ The FP8 types typically supported in hardware have two distinct representations,
To produce performant FP8 quantized models with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: To produce performant FP8 quantized models with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
```console ```bash
pip install llmcompressor pip install llmcompressor
``` ```
@ -58,28 +58,30 @@ For FP8 quantization, we can recover accuracy with simple RTN quantization. We r
Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow. Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow.
```python ??? Code
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
# Configure the simple PTQ quantization ```python
recipe = QuantizationModifier( from llmcompressor.transformers import oneshot
targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]) from llmcompressor.modifiers.quantization import QuantizationModifier
# Apply the quantization algorithm. # Configure the simple PTQ quantization
oneshot(model=model, recipe=recipe) recipe = QuantizationModifier(
targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
# Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic # Apply the quantization algorithm.
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" oneshot(model=model, recipe=recipe)
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR) # Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic
``` SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
```
### 3. Evaluating Accuracy ### 3. Evaluating Accuracy
Install `vllm` and `lm-evaluation-harness` for evaluation: Install `vllm` and `lm-evaluation-harness` for evaluation:
```console ```bash
pip install vllm lm-eval==0.4.4 pip install vllm lm-eval==0.4.4
``` ```
@ -97,9 +99,9 @@ Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`):
!!! note !!! note
Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations. Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations.
```console ```bash
$ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic
$ lm_eval \ lm_eval \
--model vllm \ --model vllm \
--model_args pretrained=$MODEL,add_bos_token=True \ --model_args pretrained=$MODEL,add_bos_token=True \
--tasks gsm8k --num_fewshot 5 --batch_size auto --limit 250 --tasks gsm8k --num_fewshot 5 --batch_size auto --limit 250

View File

@ -11,7 +11,7 @@ title: GGUF
To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command: To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command:
```console ```bash
wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
# We recommend using the tokenizer from base model to avoid slow and buggy tokenizer conversion. # We recommend using the tokenizer from base model to avoid slow and buggy tokenizer conversion.
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
@ -20,7 +20,7 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs: You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs:
```console ```bash
# We recommend using the tokenizer from base model to avoid slow and buggy tokenizer conversion. # We recommend using the tokenizer from base model to avoid slow and buggy tokenizer conversion.
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
@ -32,7 +32,7 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
GGUF assumes that huggingface can convert the metadata to a config file. In case huggingface doesn't support your model, you can manually create a config and pass it as `hf-config-path`. GGUF assumes that huggingface can convert the metadata to a config file. In case huggingface doesn't support your model, you can manually create a config and pass it as `hf-config-path`.
```console ```bash
# If your model is not supported by huggingface you can manually provide a huggingface compatible config path # If your model is not supported by huggingface you can manually provide a huggingface compatible config path
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
@ -41,42 +41,44 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
You can also use the GGUF model directly through the LLM entrypoint: You can also use the GGUF model directly through the LLM entrypoint:
```python ??? Code
from vllm import LLM, SamplingParams
# In this script, we demonstrate how to pass input to the chat method: ```python
conversation = [ from vllm import LLM, SamplingParams
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": "Hello"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "Write an essay about the importance of higher education.",
},
]
# Create a sampling params object. # In this script, we demonstrate how to pass input to the chat method:
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) conversation = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": "Hello"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "Write an essay about the importance of higher education.",
},
]
# Create an LLM. # Create a sampling params object.
llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.chat(conversation, sampling_params)
# Print the outputs. # Create an LLM.
for output in outputs: llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
prompt = output.prompt tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
generated_text = output.outputs[0].text # Generate texts from the prompts. The output is a list of RequestOutput objects
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") # that contain the prompt, generated text, and other information.
``` outputs = llm.chat(conversation, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

View File

@ -21,7 +21,7 @@ for more details on this and other advanced features.
You can quantize your own models by installing [GPTQModel](https://github.com/ModelCloud/GPTQModel) or picking one of the [5000+ models on Huggingface](https://huggingface.co/models?search=gptq). You can quantize your own models by installing [GPTQModel](https://github.com/ModelCloud/GPTQModel) or picking one of the [5000+ models on Huggingface](https://huggingface.co/models?search=gptq).
```console ```bash
pip install -U gptqmodel --no-build-isolation -v pip install -U gptqmodel --no-build-isolation -v
``` ```
@ -31,34 +31,36 @@ After installing GPTQModel, you are ready to quantize a model. Please refer to t
Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`: Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
```python ??? Code
from datasets import load_dataset
from gptqmodel import GPTQModel, QuantizeConfig
model_id = "meta-llama/Llama-3.2-1B-Instruct" ```python
quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit" from datasets import load_dataset
from gptqmodel import GPTQModel, QuantizeConfig
calibration_dataset = load_dataset( model_id = "meta-llama/Llama-3.2-1B-Instruct"
"allenai/c4", quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"
data_files="en/c4-train.00001-of-01024.json.gz",
split="train"
).select(range(1024))["text"]
quant_config = QuantizeConfig(bits=4, group_size=128) calibration_dataset = load_dataset(
"allenai/c4",
data_files="en/c4-train.00001-of-01024.json.gz",
split="train"
).select(range(1024))["text"]
model = GPTQModel.load(model_id, quant_config) quant_config = QuantizeConfig(bits=4, group_size=128)
# increase `batch_size` to match gpu/vram specs to speed up quantization model = GPTQModel.load(model_id, quant_config)
model.quantize(calibration_dataset, batch_size=2)
model.save(quant_path) # increase `batch_size` to match gpu/vram specs to speed up quantization
``` model.quantize(calibration_dataset, batch_size=2)
model.save(quant_path)
```
## Running a quantized model with vLLM ## Running a quantized model with vLLM
To run a GPTQModel quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command: To run a GPTQModel quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command:
```console ```bash
python examples/offline_inference/llm_engine_example.py \ python examples/offline_inference/llm_engine_example.py \
--model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2 --model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2
``` ```
@ -67,32 +69,34 @@ python examples/offline_inference/llm_engine_example.py \
GPTQModel quantized models are also supported directly through the LLM entrypoint: GPTQModel quantized models are also supported directly through the LLM entrypoint:
```python ??? Code
from vllm import LLM, SamplingParams
# Sample prompts. ```python
prompts = [ from vllm import LLM, SamplingParams
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object. # Sample prompts.
sampling_params = SamplingParams(temperature=0.6, top_p=0.9) prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create an LLM. # Create a sampling params object.
llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2") sampling_params = SamplingParams(temperature=0.6, top_p=0.9)
# Generate texts from the prompts. The output is a list of RequestOutput objects # Create an LLM.
# that contain the prompt, generated text, and other information. llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
outputs = llm.generate(prompts, sampling_params)
# Print the outputs. # Generate texts from the prompts. The output is a list of RequestOutput objects
print("-"*50) # that contain the prompt, generated text, and other information.
for output in outputs: outputs = llm.generate(prompts, sampling_params)
prompt = output.prompt
generated_text = output.outputs[0].text # Print the outputs.
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-"*50) print("-"*50)
``` for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-"*50)
```

View File

@ -14,13 +14,13 @@ Please visit the HF collection of [quantized INT4 checkpoints of popular LLMs re
To use INT4 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: To use INT4 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
```console ```bash
pip install llmcompressor pip install llmcompressor
``` ```
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```console ```bash
pip install vllm lm-eval==0.4.4 pip install vllm lm-eval==0.4.4
``` ```
@ -53,51 +53,55 @@ When quantizing weights to INT4, you need sample data to estimate the weight upd
It's best to use calibration data that closely matches your deployment data. It's best to use calibration data that closely matches your deployment data.
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`: For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
```python ??? Code
from datasets import load_dataset
NUM_CALIBRATION_SAMPLES = 512 ```python
MAX_SEQUENCE_LENGTH = 2048 from datasets import load_dataset
# Load and preprocess the dataset NUM_CALIBRATION_SAMPLES = 512
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") MAX_SEQUENCE_LENGTH = 2048
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
def preprocess(example): # Load and preprocess the dataset
return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
ds = ds.map(preprocess) ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
def tokenize(sample): def preprocess(example):
return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
ds = ds.map(tokenize, remove_columns=ds.column_names) ds = ds.map(preprocess)
```
def tokenize(sample):
return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
ds = ds.map(tokenize, remove_columns=ds.column_names)
```
### 3. Applying Quantization ### 3. Applying Quantization
Now, apply the quantization algorithms: Now, apply the quantization algorithms:
```python ??? Code
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
# Configure the quantization algorithms ```python
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]) from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
# Apply quantization # Configure the quantization algorithms
oneshot( recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
model=model,
dataset=ds,
recipe=recipe,
max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128 # Apply quantization
SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128" oneshot(
model.save_pretrained(SAVE_DIR, save_compressed=True) model=model,
tokenizer.save_pretrained(SAVE_DIR) dataset=ds,
``` recipe=recipe,
max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```
This process creates a W4A16 model with weights quantized to 4-bit integers. This process creates a W4A16 model with weights quantized to 4-bit integers.
@ -112,8 +116,8 @@ model = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128")
To evaluate accuracy, you can use `lm_eval`: To evaluate accuracy, you can use `lm_eval`:
```console ```bash
$ lm_eval --model vllm \ lm_eval --model vllm \
--model_args pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128",add_bos_token=true \ --model_args pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128",add_bos_token=true \
--tasks gsm8k \ --tasks gsm8k \
--num_fewshot 5 \ --num_fewshot 5 \
@ -137,34 +141,36 @@ $ lm_eval --model vllm \
The following is an example of an expanded quantization recipe you can tune to your own use case: The following is an example of an expanded quantization recipe you can tune to your own use case:
```python ??? Code
from compressed_tensors.quantization import (
QuantizationArgs, ```python
QuantizationScheme, from compressed_tensors.quantization import (
QuantizationStrategy, QuantizationArgs,
QuantizationType, QuantizationScheme,
) QuantizationStrategy,
recipe = GPTQModifier( QuantizationType,
targets="Linear", )
config_groups={ recipe = GPTQModifier(
"config_group": QuantizationScheme( targets="Linear",
targets=["Linear"], config_groups={
weights=QuantizationArgs( "config_group": QuantizationScheme(
num_bits=4, targets=["Linear"],
type=QuantizationType.INT, weights=QuantizationArgs(
strategy=QuantizationStrategy.GROUP, num_bits=4,
group_size=128, type=QuantizationType.INT,
symmetric=True, strategy=QuantizationStrategy.GROUP,
dynamic=False, group_size=128,
actorder="weight", symmetric=True,
dynamic=False,
actorder="weight",
),
), ),
), },
}, ignore=["lm_head"],
ignore=["lm_head"], update_size=NUM_CALIBRATION_SAMPLES,
update_size=NUM_CALIBRATION_SAMPLES, dampening_frac=0.01
dampening_frac=0.01 )
) ```
```
## Troubleshooting and Support ## Troubleshooting and Support

View File

@ -15,13 +15,13 @@ Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs re
To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
```console ```bash
pip install llmcompressor pip install llmcompressor
``` ```
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```console ```bash
pip install vllm lm-eval==0.4.4 pip install vllm lm-eval==0.4.4
``` ```
@ -54,54 +54,60 @@ When quantizing activations to INT8, you need sample data to estimate the activa
It's best to use calibration data that closely matches your deployment data. It's best to use calibration data that closely matches your deployment data.
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`: For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
```python ??? Code
from datasets import load_dataset
NUM_CALIBRATION_SAMPLES = 512 ```python
MAX_SEQUENCE_LENGTH = 2048 from datasets import load_dataset
# Load and preprocess the dataset NUM_CALIBRATION_SAMPLES = 512
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") MAX_SEQUENCE_LENGTH = 2048
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
def preprocess(example): # Load and preprocess the dataset
return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
ds = ds.map(preprocess) ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
def tokenize(sample): def preprocess(example):
return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
ds = ds.map(tokenize, remove_columns=ds.column_names) ds = ds.map(preprocess)
```
def tokenize(sample):
return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
ds = ds.map(tokenize, remove_columns=ds.column_names)
```
</details>
### 3. Applying Quantization ### 3. Applying Quantization
Now, apply the quantization algorithms: Now, apply the quantization algorithms:
```python ??? Code
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
# Configure the quantization algorithms ```python
recipe = [ from llmcompressor.transformers import oneshot
SmoothQuantModifier(smoothing_strength=0.8), from llmcompressor.modifiers.quantization import GPTQModifier
GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]), from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
]
# Apply quantization # Configure the quantization algorithms
oneshot( recipe = [
model=model, SmoothQuantModifier(smoothing_strength=0.8),
dataset=ds, GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
recipe=recipe, ]
max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token # Apply quantization
SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" oneshot(
model.save_pretrained(SAVE_DIR, save_compressed=True) model=model,
tokenizer.save_pretrained(SAVE_DIR) dataset=ds,
``` recipe=recipe,
max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```
This process creates a W8A8 model with weights and activations quantized to 8-bit integers. This process creates a W8A8 model with weights and activations quantized to 8-bit integers.
@ -116,8 +122,8 @@ model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")
To evaluate accuracy, you can use `lm_eval`: To evaluate accuracy, you can use `lm_eval`:
```console ```bash
$ lm_eval --model vllm \ lm_eval --model vllm \
--model_args pretrained="./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token",add_bos_token=true \ --model_args pretrained="./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token",add_bos_token=true \
--tasks gsm8k \ --tasks gsm8k \
--num_fewshot 5 \ --num_fewshot 5 \

View File

@ -4,7 +4,7 @@ The [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-O
We recommend installing the library with: We recommend installing the library with:
```console ```bash
pip install nvidia-modelopt pip install nvidia-modelopt
``` ```
@ -14,24 +14,26 @@ You can quantize HuggingFace models using the example scripts provided in the Te
Below is an example showing how to quantize a model using modelopt's PTQ API: Below is an example showing how to quantize a model using modelopt's PTQ API:
```python ??? Code
import modelopt.torch.quantization as mtq
from transformers import AutoModelForCausalLM
# Load the model from HuggingFace ```python
model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>") import modelopt.torch.quantization as mtq
from transformers import AutoModelForCausalLM
# Select the quantization config, for example, FP8 # Load the model from HuggingFace
config = mtq.FP8_DEFAULT_CFG model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>")
# Define a forward loop function for calibration # Select the quantization config, for example, FP8
def forward_loop(model): config = mtq.FP8_DEFAULT_CFG
for data in calib_set:
model(data)
# PTQ with in-place replacement of quantized modules # Define a forward loop function for calibration
model = mtq.quantize(model, config, forward_loop) def forward_loop(model):
``` for data in calib_set:
model(data)
# PTQ with in-place replacement of quantized modules
model = mtq.quantize(model, config, forward_loop)
```
After the model is quantized, you can export it to a quantized checkpoint using the export API: After the model is quantized, you can export it to a quantized checkpoint using the export API:
@ -48,31 +50,33 @@ with torch.inference_mode():
The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM: The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM:
```python ??? Code
from vllm import LLM, SamplingParams
def main(): ```python
from vllm import LLM, SamplingParams
model_id = "nvidia/Llama-3.1-8B-Instruct-FP8" def main():
# Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
sampling_params = SamplingParams(temperature=0.8, top_p=0.9) model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
# Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
prompts = [ sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
outputs = llm.generate(prompts, sampling_params) prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
for output in outputs: outputs = llm.generate(prompts, sampling_params)
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
if __name__ == "__main__": for output in outputs:
main() prompt = output.prompt
``` generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
if __name__ == "__main__":
main()
```

Some files were not shown because too many files have changed in this diff Show More