diff --git a/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml new file mode 100644 index 0000000000000..eb4a50fd4fccd --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1 +model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.30 + - name: "exact_match,flexible-extract" + value: 0.465 +limit: 1319 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt index 6057229ac50f3..254d01edf8449 100644 --- a/.buildkite/lm-eval-harness/configs/models-small.txt +++ b/.buildkite/lm-eval-harness/configs/models-small.txt @@ -4,7 +4,7 @@ Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml -Minitron-4B-Base-FP8.yaml +Qwen1.5-MoE-W4A16-compressed-tensors.yaml Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml Qwen2-1.5B-Instruct-FP8W8.yaml Meta-Llama-3-8B-QQQ.yaml diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index 4ae23eff62f37..6015a83e82950 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -16,7 +16,7 @@ import numpy import pytest import yaml -RTOL = 0.05 +RTOL = 0.08 TEST_DATA_FILE = os.environ.get( "LM_EVAL_TEST_DATA_FILE", ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml") diff --git 
a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index 4cd449b141ece..80ebb370ad461 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -10,15 +10,24 @@ set -x set -o pipefail check_gpus() { - # check the number of GPUs and GPU type. - declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + if command -v nvidia-smi; then + # check the number of GPUs and GPU type. + declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + elif command -v amd-smi; then + declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l) + fi + if [[ $gpu_count -gt 0 ]]; then echo "GPU found." else echo "Need at least 1 GPU to run benchmarking." exit 1 fi - declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}') + if command -v nvidia-smi; then + declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}') + elif command -v amd-smi; then + declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}') + fi echo "GPU type is $gpu_type" } @@ -90,9 +99,15 @@ kill_gpu_processes() { # wait until GPU memory usage smaller than 1GB - while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do - sleep 1 - done + if command -v nvidia-smi; then + while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do + sleep 1 + done + elif command -v amd-smi; then + while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do + sleep 1 + done + fi # remove vllm config file rm -rf ~/.config/vllm diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests.json b/.buildkite/nightly-benchmarks/tests/serving-tests.json index 415171e268b08..13fd5aa8db97b 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests.json 
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json @@ -63,10 +63,12 @@ "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "disable_log_requests": "", "tensor_parallel_size": 4, - "swap_space": 16, - "speculative_model": "turboderp/Qwama-0.5B-Instruct", - "num_speculative_tokens": 4, - "speculative_draft_tensor_parallel_size": 1 + "swap_space": 16, + "speculative_config": { + "model": "turboderp/Qwama-0.5B-Instruct", + "num_speculative_tokens": 4, + "draft_tensor_parallel_size": 1 + } }, "client_parameters": { "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 18f582b6e4c94..a21a657c4b05e 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -3,10 +3,10 @@ steps: agents: queue: cpu_queue_postmerge commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/upload-wheels.sh" + - "bash .buildkite/scripts/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" @@ -14,10 +14,10 @@ steps: agents: queue: cpu_queue_postmerge commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ." 
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/upload-wheels.sh" + - "bash .buildkite/scripts/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" @@ -31,10 +31,10 @@ steps: agents: queue: cpu_queue_postmerge commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/upload-wheels.sh" + - "bash .buildkite/scripts/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" @@ -48,7 +48,7 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ." 
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - label: "Build and publish TPU release image" @@ -57,7 +57,7 @@ steps: agents: queue: tpu_queue_postmerge commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ." - "docker push vllm/vllm-tpu:nightly" - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT" plugins: @@ -82,7 +82,22 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." 
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" env: DOCKER_BUILDKIT: "1" + + - block: "Build Neuron release image" + key: block-neuron-release-image-build + depends_on: ~ + + - label: "Build and publish Neuron release image" + depends_on: block-neuron-release-image-build + agents: + queue: neuron-postmerge + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ." + - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)" + env: + DOCKER_BUILDKIT: "1" diff --git a/.buildkite/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh similarity index 79% rename from .buildkite/run-amd-test.sh rename to .buildkite/scripts/hardware_ci/run-amd-test.sh index 0680bae13ddbf..368f30434aa1d 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -98,6 +98,13 @@ if [[ $commands == *" kernels "* ]]; then --ignore=kernels/test_machete_mm.py \ --ignore=kernels/test_mha_attn.py \ --ignore=kernels/test_block_fp8.py \ + --ignore=kernels/test_cutlass_moe.py \ + --ignore=kernels/test_mamba_ssm_ssd.py \ + --ignore=kernels/test_attention.py \ + --ignore=kernels/test_block_int8.py \ + --ignore=kernels/test_fused_quant_layernorm.py \ + --ignore=kernels/test_int8_kernel.py \ + --ignore=kernels/test_triton_moe_ptpc_fp8.py \ --ignore=kernels/test_permute_cols.py" fi @@ -105,19 +112,33 @@ fi if [[ $commands == *" entrypoints/openai "* ]]; then commands=${commands//" entrypoints/openai "/" entrypoints/openai \ 
--ignore=entrypoints/openai/test_audio.py \ - --ignore=entrypoints/openai/test_chat.py \ --ignore=entrypoints/openai/test_shutdown.py \ --ignore=entrypoints/openai/test_completion.py \ --ignore=entrypoints/openai/test_sleep.py \ --ignore=entrypoints/openai/test_models.py \ + --ignore=entrypoints/openai/test_lora_adapters.py \ + --ignore=entrypoints/openai/test_return_tokens_as_ids.py \ + --ignore=entrypoints/openai/test_root_path.py \ + --ignore=entrypoints/openai/test_tokenization.py \ --ignore=entrypoints/openai/test_prompt_validation.py "} fi #ignore certain Entrypoints/llm tests -if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then - commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "} +if [[ $commands == *" entrypoints/llm "* ]]; then + commands=${commands//" entrypoints/llm "/" entrypoints/llm \ + --ignore=entrypoints/llm/test_chat.py \ + --ignore=entrypoints/llm/test_accuracy.py \ + --ignore=entrypoints/llm/test_init.py \ + --ignore=entrypoints/llm/test_generate_multiple_loras.py \ + --ignore=entrypoints/llm/test_prompt_validation.py "} fi +#Obsolete currently +##ignore certain Entrypoints/llm tests +#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then +# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "} +#fi + # --ignore=entrypoints/openai/test_encoder_decoder.py \ # --ignore=entrypoints/openai/test_embedding.py \ # --ignore=entrypoints/openai/test_oot_registration.py @@ -134,9 +155,10 @@ if [[ $commands == *"--shard-id="* ]]; then # assign shard-id for each shard commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "} echo "Shard ${GPU} commands:$commands_gpu" + echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" docker run \ - --device /dev/kfd --device /dev/dri \ - --network host \ + --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ + --network=host \ --shm-size=16gb \ --rm \ -e 
HIP_VISIBLE_DEVICES="${GPU}" \ @@ -163,9 +185,10 @@ if [[ $commands == *"--shard-id="* ]]; then fi done else + echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" docker run \ - --device /dev/kfd --device /dev/dri \ - --network host \ + --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ + --network=host \ --shm-size=16gb \ --rm \ -e HIP_VISIBLE_DEVICES=0 \ diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh new file mode 100755 index 0000000000000..036cfea9431cb --- /dev/null +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# This script build the CPU docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# Setup cleanup +remove_docker_container() { podman rm -f cpu-test-ubi9-ppc || true; podman system prune -f; } +trap remove_docker_container EXIT +remove_docker_container + +# Try building the docker image +podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le . 
+ +# Run the image +podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test-ubi9-ppc cpu-test-ubi9-ppc + +function cpu_tests() { + + # offline inference + podman exec cpu-test-ubi9-ppc bash -c " + set -e + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" + + # Run basic model test + podman exec cpu-test-ubi9-ppc bash -c " + set -e + pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib + pip install sentence-transformers datamodel_code_generator + pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach] + pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5] + pytest -v -s tests/models/encoder_decoder/language -m cpu_model" +} + +# All of CPU tests are expected to be finished less than 40 mins. +export -f cpu_tests +timeout 40m bash -c cpu_tests + diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh similarity index 87% rename from .buildkite/run-cpu-test-ppc64le.sh rename to .buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh index bc06838d804ff..a97fa502e6cfc 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh @@ -10,5 +10,4 @@ trap remove_docker_container EXIT remove_docker_container # Try building the docker image -docker build -t cpu-test -f Dockerfile.ppc64le . - +docker build -t cpu-test -f docker/Dockerfile.s390x . 
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh similarity index 86% rename from .buildkite/run-cpu-test.sh rename to .buildkite/scripts/hardware_ci/run-cpu-test.sh index 05744bb5225b8..40f3df96065d1 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -8,15 +8,19 @@ set -ex CORE_RANGE=${CORE_RANGE:-48-95} NUMA_NODE=${NUMA_NODE:-1} -# Try building the docker image -numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu . -numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu . - # Setup cleanup -remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; } +remove_docker_container() { + set -e; + docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; + docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true; +} trap remove_docker_container EXIT remove_docker_container +# Try building the docker image +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu . + # Run the image, setting --shm-size=4g for tensor parallel. 
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER" @@ -36,8 +40,6 @@ function cpu_tests() { # Run basic model test docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e - pip install -r vllm/requirements/test.txt - pip install -r vllm/requirements/cpu.txt pytest -v -s tests/kernels/test_cache.py -m cpu_model pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model pytest -v -s tests/models/decoder_only/language -m cpu_model diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/scripts/hardware_ci/run-gh200-test.sh similarity index 97% rename from .buildkite/run-gh200-test.sh rename to .buildkite/scripts/hardware_ci/run-gh200-test.sh index 5c004b47778fb..8c64e14606d3b 100644 --- a/.buildkite/run-gh200-test.sh +++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh @@ -9,6 +9,7 @@ python3 use_existing_torch.py # Try building the docker image DOCKER_BUILDKIT=1 docker build . \ + --file docker/Dockerfile \ --target vllm-openai \ --platform "linux/arm64" \ -t gh200-test \ diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/scripts/hardware_ci/run-hpu-test.sh similarity index 94% rename from .buildkite/run-hpu-test.sh rename to .buildkite/scripts/hardware_ci/run-hpu-test.sh index f83eb927aae4e..95b6ac37f1857 100644 --- a/.buildkite/run-hpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh @@ -5,7 +5,7 @@ set -ex # Try building the docker image -docker build -t hpu-test-env -f Dockerfile.hpu . +docker build -t hpu-test-env -f docker/Dockerfile.hpu . 
# Setup cleanup # certain versions of HPU software stack have a bug that can diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/scripts/hardware_ci/run-neuron-test.sh similarity index 97% rename from .buildkite/run-neuron-test.sh rename to .buildkite/scripts/hardware_ci/run-neuron-test.sh index ad5ae6f415748..ec6a080eb499f 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/scripts/hardware_ci/run-neuron-test.sh @@ -35,7 +35,7 @@ else date "+%s" > /tmp/neuron-docker-build-timestamp fi -docker build -t "${image_name}" -f Dockerfile.neuron . +docker build -t "${image_name}" -f docker/Dockerfile.neuron . # Setup cleanup remove_docker_container() { diff --git a/.buildkite/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh similarity index 58% rename from .buildkite/run-tpu-v1-test.sh rename to .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index d557feefba7aa..704bc6b7324da 100755 --- a/.buildkite/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -1,9 +1,9 @@ #!/bin/bash -set -e +set -xue # Build the docker image. -docker build -f Dockerfile.tpu -t vllm-tpu . +docker build -f docker/Dockerfile.tpu -t vllm-tpu . # Set up cleanup. 
remove_docker_container() { docker rm -f tpu-test || true; } @@ -17,12 +17,16 @@ source /etc/environment docker run --privileged --net host --shm-size=16G -it \ -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \ - && python3 -m pip install pytest \ + && python3 -m pip install pytest pytest-asyncio tpu-info \ && python3 -m pip install lm_eval[api]==0.4.4 \ && export VLLM_USE_V1=1 \ && export VLLM_XLA_CHECK_RECOMPILATION=1 \ + && echo HARDWARE \ + && tpu-info \ + && echo TEST_0 \ + && pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \ && echo TEST_1 \ - && pytest /workspace/vllm/tests/tpu/test_compilation.py \ + && pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \ && echo TEST_2 \ && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \ && echo TEST_3 \ @@ -32,9 +36,18 @@ docker run --privileged --net host --shm-size=16G -it \ && echo TEST_5 \ && python3 /workspace/vllm/examples/offline_inference/tpu.py \ && echo TEST_6 \ - && pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py" \ + && pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \ + && echo TEST_7 \ + && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py \ + && echo TEST_8 \ + && pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py \ + && echo TEST_9 \ + && pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py \ + && echo TEST_10 \ + && pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py \ + && echo TEST_11 \ + && pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" \ # TODO: This test fails because it uses RANDOM_SEED sampling # && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ - diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh similarity index 94% rename from .buildkite/run-xpu-test.sh rename to 
.buildkite/scripts/hardware_ci/run-xpu-test.sh index 3a0e6bdb2caaf..f54010c4231f9 100644 --- a/.buildkite/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}" container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" # Try building the docker image -docker build -t ${image_name} -f Dockerfile.xpu . +docker build -t ${image_name} -f docker/Dockerfile.xpu . # Setup cleanup remove_docker_container() { diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/scripts/run-benchmarks.sh similarity index 97% rename from .buildkite/run-benchmarks.sh rename to .buildkite/scripts/run-benchmarks.sh index 1641c1faa9d6a..195a8063fd743 100644 --- a/.buildkite/run-benchmarks.sh +++ b/.buildkite/scripts/run-benchmarks.sh @@ -5,8 +5,8 @@ set -ex set -o pipefail -# cd into parent directory of this file -cd "$(dirname "${BASH_SOURCE[0]}")/.." +# cd 2 levels into the working directory +cd "$(dirname "${BASH_SOURCE[0]}")/../.." (which wget && which curl) || (apt-get update && apt-get install -y wget curl) diff --git a/.buildkite/run-multi-node-test.sh b/.buildkite/scripts/run-multi-node-test.sh similarity index 96% rename from .buildkite/run-multi-node-test.sh rename to .buildkite/scripts/run-multi-node-test.sh index 530bf90a855fe..49aebce786b92 100755 --- a/.buildkite/run-multi-node-test.sh +++ b/.buildkite/scripts/run-multi-node-test.sh @@ -3,7 +3,7 @@ set -euox pipefail if [[ $# -lt 4 ]]; then - echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN" + echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... 
COMMANDN" exit 1 fi diff --git a/.buildkite/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh similarity index 100% rename from .buildkite/upload-wheels.sh rename to .buildkite/scripts/upload-wheels.sh diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index f22b2b0ab6f2f..ec00bc7f108df 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -8,6 +8,7 @@ # Documentation # label(str): the name of the test. emoji allowed. # fast_check(bool): whether to run this on each commit on fastcheck pipeline. +# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline. # fast_check_only(bool): run this test on fastcheck pipeline only # optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run. # command(str): the single command to run for tests. incompatible with commands. @@ -70,6 +71,7 @@ steps: - label: Basic Correctness Test # 30min #mirror_hardwares: [amd] fast_check: true + torch_nightly: true source_file_dependencies: - vllm/ - tests/basic_correctness/test_basic_correctness @@ -104,7 +106,8 @@ steps: - label: Entrypoints Test # 40min working_dir: "/vllm-workspace/tests" fast_check: true - mirror_hardwares: [amd] + torch_nightly: true + #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/entrypoints/llm @@ -118,7 +121,7 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py 
--ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py - pytest -v -s entrypoints/test_chat_utils.py - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests @@ -135,12 +138,14 @@ steps: - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py - tests/examples/offline_inference/data_parallel.py + - tests/v1/test_async_llm_dp.py commands: # test with tp=2 and external_dp=2 - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py # test with internal dp - python3 ../examples/offline_inference/data_parallel.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py @@ -148,11 +153,12 @@ steps: # TODO: create a dedicated test section for multi-GPU example tests # when we have multiple distributed example tests - pushd ../examples/offline_inference - - VLLM_ENABLE_V1_MULTIPROCESSING=0 python3 rlhf.py - - VLLM_ENABLE_V1_MULTIPROCESSING=0 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - python3 rlhf.py + - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - popd - label: Metrics, Tracing Test # 10min + mirror_hardwares: [amd] num_gpus: 2 source_file_dependencies: - vllm/ @@ -160,18 +166,13 @@ steps: - tests/tracing commands: - pytest -v -s metrics - - "pip install \ - 'opentelemetry-sdk>=1.26.0,<1.27.0' \ - 'opentelemetry-api>=1.26.0,<1.27.0' \ - 'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \ - 'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'" - pytest -v -s tracing ##### fast check tests ##### ##### 1 GPU test ##### - label: Regression Test # 5min - mirror_hardwares: [amd] + #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/test_regression @@ -202,12 +203,13 @@ steps: commands: # split the test to avoid interference - pytest -v -s 
v1/core - - pytest -v -s v1/entrypoints - pytest -v -s v1/engine - pytest -v -s v1/entrypoints - pytest -v -s v1/sample - pytest -v -s v1/worker - pytest -v -s v1/structured_output + - pytest -v -s v1/spec_decode + - pytest -v -s v1/test_serial_utils.py - pytest -v -s v1/test_stats.py - pytest -v -s v1/test_utils.py - pytest -v -s v1/test_oracle.py @@ -283,13 +285,21 @@ steps: - pytest -v -s spec_decode/e2e/test_eagle_correctness.py - label: LoRA Test %N # 15min each - mirror_hardwares: [amd] + #mirror_hardwares: [amd] source_file_dependencies: - vllm/lora - tests/lora - command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py + command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py parallelism: 4 +- label: PyTorch Compilation Unit Tests + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/test_pass_manager.py + - pytest -v -s compile/test_fusion.py + - label: PyTorch Fullgraph Smoke Test # 9min source_file_dependencies: - vllm/ @@ -299,7 +309,6 @@ steps: # these tests need to be separated, cannot combine - pytest -v -s compile/piecewise/test_simple.py - pytest -v -s compile/piecewise/test_toy_llama.py - - pytest -v -s compile/test_pass_manager.py - label: PyTorch Fullgraph Test # 18min source_file_dependencies: @@ -308,18 +317,49 @@ steps: commands: - pytest -v -s compile/test_full_graph.py -- label: Kernels Test %N # 1h each - mirror_hardwares: [amd] +- label: Kernels Core Operation Test source_file_dependencies: - csrc/ - - vllm/attention - - tests/kernels + - tests/kernels/core commands: - - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 4 + - pytest -v 
-s kernels/core + +- label: Kernels Attention Test %N + source_file_dependencies: + - csrc/attention/ + - vllm/attention + - vllm/v1/attention + - tests/kernels/attention + commands: + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Quantization Test %N + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/layers/quantization + - tests/kernels/quantization + commands: + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels MoE Test + source_file_dependencies: + - csrc/moe/ + - tests/kernels/moe + - vllm/model_executor/layers/fused_moe/ + commands: + - pytest -v -s kernels/moe + +- label: Kernels Mamba Test + source_file_dependencies: + - csrc/mamba/ + - tests/kernels/mamba + commands: + - pytest -v -s kernels/mamba - label: Tensorizer Test # 11min - mirror_hardwares: [amd] + # mirror_hardwares: [amd] soft_fail: true source_file_dependencies: - vllm/model_executor/model_loader @@ -335,7 +375,14 @@ steps: source_file_dependencies: - benchmarks/ commands: - - bash run-benchmarks.sh + - bash scripts/run-benchmarks.sh + +- label: Benchmarks CLI Test # 10min + source_file_dependencies: + - vllm/ + - tests/benchmarks/ + commands: + - pytest -v -s benchmarks/ - label: Quantization Test # 33min source_file_dependencies: @@ -370,12 +417,14 @@ steps: - label: OpenAI-Compatible Tool Use # 20 min fast_check: false - mirror_hardwares: [ amd ] + #mirror_hardwares: [ amd ] source_file_dependencies: - vllm/ - tests/tool_use + - tests/mistral_tool_use commands: - pytest -v -s tool_use + - pytest -v -s mistral_tool_use ##### models test ##### @@ -387,7 +436,9 @@ steps: - pytest -v -s models/test_transformers.py - pytest -v -s models/test_registry.py # V1 Test: https://github.com/vllm-project/vllm/issues/14531 - - VLLM_USE_V1=0 pytest -v -s 
models/test_initialization.py + - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2' + - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4' + - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2' - label: Language Models Test (Standard) # 32min #mirror_hardwares: [amd] @@ -397,6 +448,8 @@ steps: - tests/models/embedding/language - tests/models/encoder_decoder/language commands: + # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. + - pip install causal-conv1d - pytest -v -s models/decoder_only/language -m 'core_model or quant_model' - pytest -v -s models/embedding/language -m core_model @@ -408,6 +461,8 @@ steps: - tests/models/embedding/language - tests/models/encoder_decoder/language commands: + # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. + - pip install causal-conv1d - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' - pytest -v -s models/embedding/language -m 'not core_model' @@ -424,11 +479,12 @@ steps: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model' - - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model' + - pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model' - pytest -v -s models/embedding/vision_language -m core_model - pytest -v -s models/encoder_decoder/audio_language -m core_model - pytest -v -s models/encoder_decoder/language -m core_model - pytest -v -s models/encoder_decoder/vision_language -m core_model + - pytest -v -s models/decoder_only/vision_language/test_interleaved.py - label: Multi-Modal Models Test (Extended) 1 # 48m optional: true @@ -442,10 +498,7 @@ steps: - pip install 
git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model' - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model' - # HACK - run phi3v tests separately to sidestep this transformers bug - # https://github.com/huggingface/transformers/issues/34307 - - pytest -v -s models/decoder_only/vision_language/test_phi3v.py - - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' + - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model' - pytest -v -s models/embedding/vision_language -m 'not core_model' - pytest -v -s models/encoder_decoder/language -m 'not core_model' - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model' @@ -461,6 +514,7 @@ steps: # This test is used only in PR development phase to test individual models and should never run on main - label: Custom Models Test + mirror_hardwares: [amd] optional: true commands: - echo 'Testing custom models...' 
@@ -472,6 +526,7 @@ steps: ##### multi gpus test ##### - label: Distributed Comm Ops Test # 7min + mirror_hardwares: [amd] working_dir: "/vllm-workspace/tests" num_gpus: 2 source_file_dependencies: @@ -514,8 +569,11 @@ steps: - vllm/worker/worker.py - vllm/worker/model_runner.py - entrypoints/llm/test_collective_rpc.py + - tests/v1/test_async_llm_dp.py + - vllm/v1/engine/ commands: - - VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py + - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' @@ -530,6 +588,7 @@ steps: # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - label: Plugin Tests (2 GPUs) # 40min working_dir: "/vllm-workspace/tests" @@ -596,8 +655,6 @@ steps: # requires multi-GPU testing for validation. - pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_llama_tp.py - - pytest -v -s -x lora/test_minicpmv_tp.py - - pytest -v -s -x lora/test_transfomers_model.py - label: Weight Loading Multiple GPU Test # 33min diff --git a/.github/ISSUE_TEMPLATE/200-installation.yml b/.github/ISSUE_TEMPLATE/200-installation.yml index 590e56c137813..34da4019687b2 100644 --- a/.github/ISSUE_TEMPLATE/200-installation.yml +++ b/.github/ISSUE_TEMPLATE/200-installation.yml @@ -14,7 +14,7 @@ body: description: | Please run the following and paste the output below. 
```sh - wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py # For security purposes, please feel free to check the contents of collect_env.py before running it. python collect_env.py ``` diff --git a/.github/ISSUE_TEMPLATE/300-usage.yml b/.github/ISSUE_TEMPLATE/300-usage.yml index 004798a388a63..c9e4be0e7719f 100644 --- a/.github/ISSUE_TEMPLATE/300-usage.yml +++ b/.github/ISSUE_TEMPLATE/300-usage.yml @@ -14,7 +14,7 @@ body: description: | Please run the following and paste the output below. ```sh - wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py # For security purposes, please feel free to check the contents of collect_env.py before running it. python collect_env.py ``` diff --git a/.github/ISSUE_TEMPLATE/400-bug-report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml index d4113da8b5b81..b96ab40749003 100644 --- a/.github/ISSUE_TEMPLATE/400-bug-report.yml +++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml @@ -14,7 +14,7 @@ body: description: | Please run the following and paste the output below. ```sh - wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py # For security purposes, please feel free to check the contents of collect_env.py before running it. 
python collect_env.py ``` diff --git a/.github/ISSUE_TEMPLATE/600-new-model.yml b/.github/ISSUE_TEMPLATE/600-new-model.yml index 713e76c1a5cec..5f0125ef98096 100644 --- a/.github/ISSUE_TEMPLATE/600-new-model.yml +++ b/.github/ISSUE_TEMPLATE/600-new-model.yml @@ -9,7 +9,7 @@ body: value: > #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). - #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model. + #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model. - type: textarea attributes: label: The model to consider. diff --git a/.github/ISSUE_TEMPLATE/700-performance-discussion.yml b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml index 273f50d59cf76..3d31c11550167 100644 --- a/.github/ISSUE_TEMPLATE/700-performance-discussion.yml +++ b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml @@ -35,7 +35,7 @@ body: description: | Please run the following and paste the output below. ```sh - wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py # For security purposes, please feel free to check the contents of collect_env.py before running it. 
python collect_env.py ``` diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index a20c5baf895c1..7042e81a84daa 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -3,4 +3,4 @@ FILL IN THE PR DESCRIPTION HERE FIX #xxxx (*link existing issues this PR will resolve*) -**BEFORE SUBMITTING, PLEASE READ ** +**BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions) diff --git a/.github/mergify.yml b/.github/mergify.yml index 54f56210b286a..2033722b5f33c 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -19,7 +19,7 @@ pull_request_rules: - files~=\.buildkite/ - files~=^cmake/ - files=CMakeLists.txt - - files~=^Dockerfile + - files~=^docker/Dockerfile - files~=^requirements.*\.txt - files=setup.py actions: @@ -55,11 +55,19 @@ pull_request_rules: description: Automatically apply structured-output label conditions: - or: + - files~=^benchmarks/structured_schemas/ + - files=benchmarks/benchmark_serving_structured_output.py + - files=benchmarks/run_structured_output_benchmark.sh + - files=docs/source/features/structured_outputs.md + - files=examples/offline_inference/structured_outputs.py + - files=examples/online_serving/openai_chat_completion_structured_outputs.py + - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py - files~=^vllm/model_executor/guided_decoding/ - files=tests/model_executor/test_guided_processors.py - files=tests/entrypoints/llm/test_guided_generate.py - - files=benchmarks/benchmark_serving_guided.py - - files=benchmarks/benchmark_guided.py + - files~=^tests/v1/structured_output/ + - files=tests/v1/entrypoints/llm/test_guided_generate.py + - files~=^vllm/v1/structured_output/ actions: label: add: @@ -88,6 +96,36 @@ pull_request_rules: add: - v1 +- name: label-tpu + description: Automatically apply tpu label + # Keep this list in sync with `label-tpu-remove` conditions + conditions: + - or: + - 
files~=tpu.py + - files~=_tpu + - files~=tpu_ + - files~=/tpu/ + - files~=pallas + actions: + label: + add: + - tpu + +- name: label-tpu-remove + description: Automatically remove tpu label + # Keep this list in sync with `label-tpu` conditions + conditions: + - and: + - -files~=tpu.py + - -files~=_tpu + - -files~=tpu_ + - -files~=/tpu/ + - -files~=pallas + actions: + label: + remove: + - tpu + - name: ping author on conflicts and add 'needs-rebase' label conditions: - conflict diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index b199d0867a648..7b1d9f69938c8 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -50,7 +50,7 @@ jobs: uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0 - name: Build the Docker image vllm cpu - run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env . + run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env . - name: Configuration of docker images, network and namespace for the kind cluster run: | diff --git a/.gitignore b/.gitignore index 6f5cbd0733da0..06d2b1e83b7b5 100644 --- a/.gitignore +++ b/.gitignore @@ -203,3 +203,6 @@ benchmarks/**/*.json # Linting actionlint shellcheck*/ + +# Ingore moe/marlin_moe gen code +csrc/moe/marlin_moe_wna16/kernel_* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 484cd171f5f52..f76b24c025ffb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,6 @@ +default_install_hook_types: + - pre-commit + - commit-msg default_stages: - pre-commit # Run locally - manual # Run in CI @@ -8,7 +11,6 @@ repos: hooks: - id: yapf args: [--in-place, --verbose] - additional_dependencies: [toml] # TODO: Remove when yapf is upgraded - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.9.3 hooks: @@ -119,6 +121,12 @@ repos: language: system always_run: true pass_filenames: false + - id: update-dockerfile-graph + name: Update Dockerfile 
dependency graph + entry: tools/update-dockerfile-graph.sh + language: script + files: ^docker/Dockerfile$ + pass_filenames: false # Keep `suggestion` last - id: suggestion name: Suggestion diff --git a/CMakeLists.txt b/CMakeLists.txt index 65d1ddbeee0b2..21464a0560d93 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,7 +34,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0") # Supported AMD GPU architectures. -set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101") +set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201") # # Supported/expected torch versions for CUDA/ROCm. @@ -44,7 +44,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101") # # Note: the CUDA torch version is derived from pyproject.toml and various # requirements.txt files and should be kept consistent. The ROCm torch -# versions are derived from Dockerfile.rocm +# versions are derived from docker/Dockerfile.rocm # set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0") set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0") @@ -230,10 +230,12 @@ set(VLLM_EXT_SRC "csrc/cache_kernels.cu" "csrc/attention/paged_attention_v1.cu" "csrc/attention/paged_attention_v2.cu" + "csrc/attention/merge_attn_states.cu" "csrc/pos_encoding_kernels.cu" "csrc/activation_kernels.cu" "csrc/layernorm_kernels.cu" "csrc/layernorm_quant_kernels.cu" + "csrc/cuda_view.cu" "csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/fp8/common.cu" @@ -241,6 +243,7 @@ set(VLLM_EXT_SRC "csrc/quantization/gguf/gguf_kernel.cu" "csrc/cuda_utils_kernels.cu" "csrc/prepare_inputs/advance_step.cu" + "csrc/custom_all_reduce.cu" "csrc/torch_bindings.cpp") if(VLLM_GPU_LANG STREQUAL "CUDA") @@ -282,7 +285,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/mamba/causal_conv1d/causal_conv1d.cu" "csrc/quantization/aqlm/gemm_kernels.cu" 
"csrc/quantization/awq/gemm_kernels.cu" - "csrc/custom_all_reduce.cu" "csrc/permute_cols.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" "csrc/quantization/fp4/nvfp4_quant_entry.cu" @@ -461,6 +463,33 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") set(FP4_ARCHS) endif() + # + # CUTLASS MoE kernels + + # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works + # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible + # to compile MoE kernels that use its output. + cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) + set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu" + "csrc/quantization/cutlass_w8a8/moe/moe_data.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1") + message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) + message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is " + "not >= 12.3, we recommend upgrading to CUDA 12.3 or later " + "if you intend on running FP8 quantized MoE models on Hopper.") + else() + message(STATUS "Not building grouped_mm_c3x as no compatible archs found " + "in CUDA target architectures") + endif() + endif() + # # Machete kernels @@ -580,21 +609,51 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}") cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}") if (MARLIN_MOE_ARCHS) - set(MARLIN_MOE_SRC - "csrc/moe/marlin_kernels/marlin_moe_kernel.h" - "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h" - "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu" - "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h" - 
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu" - "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h" - "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu" - "csrc/moe/marlin_moe_ops.cu") + # + # For the Marlin MOE kernels we automatically generate sources for various + # preselected input type pairs and schedules. + # Generate sources: + set(MOE_MARLIN_GEN_SCRIPT + ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py) + file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH) + + message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}") + message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}") + + if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} + OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH}) + execute_process( + COMMAND ${CMAKE_COMMAND} -E env + PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH + ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} + RESULT_VARIABLE moe_marlin_generation_result + OUTPUT_VARIABLE moe_marlin_generation_output + OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log + ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log + ) + + if (NOT moe_marlin_generation_result EQUAL 0) + message(FATAL_ERROR "Marlin MOE generation failed." 
+ " Result: \"${moe_marlin_generation_result}\"" + "\nCheck the log for details: " + "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log") + else() + set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH} + CACHE STRING "Last run Marlin MOE generate script hash" FORCE) + message(STATUS "Marlin MOE generation completed successfully.") + endif() + else() + message(STATUS "Marlin MOE generation script has not changed, skipping generation.") + endif() + + file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu") set_gencode_flags_for_srcs( - SRCS "${MARLIN_MOE_SRC}" + SRCS "${MOE_WNAA16_MARLIN_SRC}" CUDA_ARCHS "${MARLIN_MOE_ARCHS}") - list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}") + list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC}) + message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}") else() message(STATUS "Not building Marlin MOE kernels as no compatible archs found" @@ -619,6 +678,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP") # set(VLLM_ROCM_EXT_SRC "csrc/rocm/torch_bindings.cpp" + "csrc/rocm/skinny_gemms.cu" "csrc/rocm/attention.cu") define_gpu_extension_target( diff --git a/Dockerfile.cpu b/Dockerfile.cpu deleted file mode 100644 index a10090529d8a9..0000000000000 --- a/Dockerfile.cpu +++ /dev/null @@ -1,69 +0,0 @@ -# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform. 
- -FROM ubuntu:22.04 AS cpu-test-1 - -ENV CCACHE_DIR=/root/.cache/ccache - -ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache - -RUN --mount=type=cache,target=/var/cache/apt \ - apt-get update -y \ - && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \ - && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ - && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 - -# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html -# intel-openmp provides additional performance improvement vs. openmp -# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects. -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install intel-openmp==2025.0.1 - -ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so" - -RUN echo 'ulimit -c 0' >> ~/.bashrc - -RUN pip install intel_extension_for_pytorch==2.6.0 - -WORKDIR /workspace - -ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" -ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \ - pip install --upgrade pip && \ - pip install -r requirements/build.txt - -FROM cpu-test-1 AS build - -WORKDIR /workspace/vllm - -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \ - --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \ - pip install -v -r requirements/cpu.txt - -COPY . . -ARG GIT_REPO_CHECK=0 -RUN --mount=type=bind,source=.git,target=.git \ - if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi - -# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ... 
-ARG VLLM_CPU_DISABLE_AVX512 -ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} - -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=cache,target=/root/.cache/ccache \ - --mount=type=bind,source=.git,target=.git \ - VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \ - pip install dist/*.whl && \ - rm -rf dist - -WORKDIR /workspace/ - -RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks - -# install development dependencies (for testing) -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -e tests/vllm_test_utils - -ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/README.md b/README.md index f2da0467e5c34..dda3ae6009f55 100644 --- a/README.md +++ b/README.md @@ -10,19 +10,14 @@ Easy, fast, and cheap LLM serving for everyone

-| Documentation | Blog | Paper | Twitter/X | User Forum | Developer Slack | +| Documentation | Blog | Paper | Twitter/X | User Forum | Developer Slack |

--- -[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center! - -[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day) - ---- - *Latest News* 🔥 - +- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). +- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing). - [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing). - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0). - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! 
Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted. @@ -103,7 +98,7 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more. ## Contributing We welcome and value any contributions and collaborations. -Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved. +Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved. ## Sponsors @@ -126,6 +121,7 @@ Compute Resources: - Databricks - DeepInfra - Google Cloud +- Intel - Lambda Lab - Nebius - Novita AI diff --git a/benchmarks/README.md b/benchmarks/README.md index d41de1caa04c0..4a8ab895e18e9 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -41,29 +41,39 @@ become available. synthetic - HuggingFace - 🟡 - 🟡 - Specify your dataset path on HuggingFace + HuggingFace-VisionArena + ✅ + ✅ + lmarena-ai/VisionArena-Chat - VisionArena + HuggingFace-InstructCoder ✅ ✅ - lmarena-ai/vision-arena-bench-v0.1 (a HuggingFace dataset) + likaixin/InstructCoder + + + HuggingFace-AIMO + ✅ + ✅ + AI-MO/aimo-validation-aime , AI-MO/NuminaMath-1.5, AI-MO/NuminaMath-CoT + + + HuggingFace-Other + ✅ + ✅ + lmms-lab/LLaVA-OneVision-Data, Aeala/ShareGPT_Vicuna_unfiltered ✅: supported +🟡: Partial support + 🚧: to be supported -🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats -similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. -If you need support for other dataset formats, please consider contributing. 
- -**Note**: VisionArena’s `dataset-name` should be set to `hf` +**Note**: HuggingFace dataset's `dataset-name` should be set to `hf` --- ## Example - Online Benchmark @@ -71,8 +81,7 @@ If you need support for other dataset formats, please consider contributing. First start serving your model ```bash -MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B" -vllm serve ${MODEL_NAME} --disable-log-requests +vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests ``` Then run the benchmarking script @@ -80,12 +89,13 @@ Then run the benchmarking script ```bash # download dataset # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B" -NUM_PROMPTS=10 -BACKEND="vllm" -DATASET_NAME="sharegpt" -DATASET_PATH="/ShareGPT_V3_unfiltered_cleaned_split.json" -python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS} +python3 vllm/benchmarks/benchmark_serving.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --endpoint /v1/completions \ + --dataset-name sharegpt \ + --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \ + --num-prompts 10 ``` If successful, you will see the following output @@ -122,88 +132,105 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests ``` ```bash -MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct" -NUM_PROMPTS=10 -BACKEND="openai-chat" -DATASET_NAME="hf" -DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1" -DATASET_SPLIT='train' - python3 vllm/benchmarks/benchmark_serving.py \ - --backend "${BACKEND}" \ - --model "${MODEL_NAME}" \ - --endpoint "/v1/chat/completions" \ - --dataset-name "${DATASET_NAME}" \ - --dataset-path "${DATASET_PATH}" \ - --hf-split "${DATASET_SPLIT}" \ - --num-prompts "${NUM_PROMPTS}" + --backend openai-chat \ + --model 
Qwen/Qwen2-VL-7B-Instruct \ + --endpoint /v1/chat/completions \ + --dataset-name hf \ + --dataset-path lmarena-ai/VisionArena-Chat \ + --hf-split train \ + --num-prompts 1000 ``` -### HuggingFaceDataset Examples +### InstructCoder Benchmark with Speculative Decoding -Currently, HuggingFaceDataset only supports dataset formats -similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. If you need support for other dataset -formats, please consider contributing. +``` bash +VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ + --speculative-model "[ngram]" \ + --ngram_prompt_lookup_min 2 \ + --ngram-prompt-lookup-max 5 \ + --num_speculative_tokens 5 +``` + +``` bash +python3 benchmarks/benchmark_serving.py \ + --model meta-llama/Meta-Llama-3-8B-Instruct \ + --dataset-name hf \ + --dataset-path likaixin/InstructCoder \ + --num-prompts 2048 +``` + +### Other HuggingFaceDataset Examples ```bash -# need a model with vision capability here vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests ``` **`lmms-lab/LLaVA-OneVision-Data`** ```bash -MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct" -NUM_PROMPTS=10 -BACKEND="openai-chat" -DATASET_NAME="hf" -DATASET_PATH="lmms-lab/LLaVA-OneVision-Data" -DATASET_SPLIT='train' -DATASET_SUBSET='chart2text(cauldron)' python3 vllm/benchmarks/benchmark_serving.py \ - --backend "${BACKEND}" \ - --model "${MODEL_NAME}" \ - --endpoint "/v1/chat/completions" \ - --dataset-name "${DATASET_NAME}" \ - --dataset-path "${DATASET_PATH}" \ - --hf-split "${DATASET_SPLIT}" \ - --num-prompts "${NUM_PROMPTS}" \ - --hf-subset "${DATASET_SUBSET}" + --backend openai-chat \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --endpoint /v1/chat/completions \ + --dataset-name hf \ + --dataset-path lmms-lab/LLaVA-OneVision-Data \ + --hf-split train \ + --hf-subset "chart2text(cauldron)" \ + --num-prompts 10 ``` **`Aeala/ShareGPT_Vicuna_unfiltered`** ```bash -MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct" -NUM_PROMPTS=10 -BACKEND="openai-chat" 
-DATASET_NAME="hf" -DATASET_PATH="Aeala/ShareGPT_Vicuna_unfiltered" -DATASET_SPLIT='train' python3 vllm/benchmarks/benchmark_serving.py \ - --backend "${BACKEND}" \ - --model "${MODEL_NAME}" \ - --endpoint "/v1/chat/completions" \ - --dataset-name "${DATASET_NAME}" \ - --dataset-path "${DATASET_PATH}" \ - --hf-split "${DATASET_SPLIT}" \ - --num-prompts "${NUM_PROMPTS}" \ + --backend openai-chat \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --endpoint /v1/chat/completions \ + --dataset-name hf \ + --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \ + --hf-split train \ + --num-prompts 10 +``` + +**`AI-MO/aimo-validation-aime`** + +``` bash +python3 vllm/benchmarks/benchmark_serving.py \ + --model Qwen/QwQ-32B \ + --dataset-name hf \ + --dataset-path AI-MO/aimo-validation-aime \ + --num-prompts 10 \ + --seed 42 +``` + +### Running With Sampling Parameters + +When using OpenAI-compatible backends such as `vllm`, optional sampling +parameters can be specified. Example client command: + +```bash +python3 vllm/benchmarks/benchmark_serving.py \ + --backend vllm \ + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --endpoint /v1/completions \ + --dataset-name sharegpt \ + --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \ + --top-k 10 \ + --top-p 0.9 \ + --temperature 0.5 \ + --num-prompts 10 ``` --- ## Example - Offline Throughput Benchmark ```bash -MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B" -NUM_PROMPTS=10 -DATASET_NAME="sonnet" -DATASET_PATH="vllm/benchmarks/sonnet.txt" - python3 vllm/benchmarks/benchmark_throughput.py \ - --model "${MODEL_NAME}" \ - --dataset-name "${DATASET_NAME}" \ - --dataset-path "${DATASET_PATH}" \ - --num-prompts "${NUM_PROMPTS}" + --model NousResearch/Hermes-3-Llama-3.1-8B \ + --dataset-name sonnet \ + --dataset-path vllm/benchmarks/sonnet.txt \ + --num-prompts 10 ``` If successful, you will see the following output @@ -217,19 +244,13 @@ Total num output tokens: 1500 ### VisionArena Benchmark for Vision Language Models ``` bash 
-MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct" -NUM_PROMPTS=10 -DATASET_NAME="hf" -DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1" -DATASET_SPLIT="train" - python3 vllm/benchmarks/benchmark_throughput.py \ - --model "${MODEL_NAME}" \ - --backend "vllm-chat" \ - --dataset-name "${DATASET_NAME}" \ - --dataset-path "${DATASET_PATH}" \ - --num-prompts "${NUM_PROMPTS}" \ - --hf-split "${DATASET_SPLIT}" + --model Qwen/Qwen2-VL-7B-Instruct \ + --backend vllm-chat \ + --dataset-name hf \ + --dataset-path lmarena-ai/VisionArena-Chat \ + --num-prompts 1000 \ + --hf-split train ``` The `num prompt tokens` now includes image token counts @@ -240,29 +261,83 @@ Total num prompt tokens: 14527 Total num output tokens: 1280 ``` +### InstructCoder Benchmark with Speculative Decoding + +``` bash +VLLM_WORKER_MULTIPROC_METHOD=spawn \ +VLLM_USE_V1=1 \ +python3 vllm/benchmarks/benchmark_throughput.py \ + --dataset-name=hf \ + --dataset-path=likaixin/InstructCoder \ + --model=meta-llama/Meta-Llama-3-8B-Instruct \ + --input-len=1000 \ + --output-len=100 \ + --num-prompts=2048 \ + --async-engine \ + --speculative-model="[ngram]" \ + --ngram_prompt_lookup_min=2 \ + --ngram-prompt-lookup-max=5 \ + --num_speculative_tokens=5 +``` + +``` +Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s +Total num prompt tokens: 261136 +Total num output tokens: 204800 +``` + +### Other HuggingFaceDataset Examples + +**`lmms-lab/LLaVA-OneVision-Data`** + +```bash +python3 vllm/benchmarks/benchmark_throughput.py \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --backend vllm-chat \ + --dataset-name hf \ + --dataset-path lmms-lab/LLaVA-OneVision-Data \ + --hf-split train \ + --hf-subset "chart2text(cauldron)" \ + --num-prompts 10 +``` + +**`Aeala/ShareGPT_Vicuna_unfiltered`** + +```bash +python3 vllm/benchmarks/benchmark_throughput.py \ + --model Qwen/Qwen2-VL-7B-Instruct \ + --backend vllm-chat \ + --dataset-name hf \ + --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \ + --hf-split train \ + 
--num-prompts 10 +``` + +**`AI-MO/aimo-validation-aime`** + +```bash +python3 benchmarks/benchmark_throughput.py \ + --model Qwen/QwQ-32B \ + --backend vllm \ + --dataset-name hf \ + --dataset-path AI-MO/aimo-validation-aime \ + --hf-split train \ + --num-prompts 10 +``` + ### Benchmark with LoRA Adapters ``` bash # download dataset # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -MODEL_NAME="meta-llama/Llama-2-7b-hf" -BACKEND="vllm" -DATASET_NAME="sharegpt" -DATASET_PATH="/ShareGPT_V3_unfiltered_cleaned_split.json" -NUM_PROMPTS=10 -MAX_LORAS=2 -MAX_LORA_RANK=8 -ENABLE_LORA="--enable-lora" -LORA_PATH="yard1/llama-2-7b-sql-lora-test" - python3 vllm/benchmarks/benchmark_throughput.py \ - --model "${MODEL_NAME}" \ - --backend "${BACKEND}" \ - --dataset_path "${DATASET_PATH}" \ - --dataset_name "${DATASET_NAME}" \ - --num-prompts "${NUM_PROMPTS}" \ - --max-loras "${MAX_LORAS}" \ - --max-lora-rank "${MAX_LORA_RANK}" \ - ${ENABLE_LORA} \ - --lora-path "${LORA_PATH}" + --model meta-llama/Llama-2-7b-hf \ + --backend vllm \ + --dataset_path /ShareGPT_V3_unfiltered_cleaned_split.json \ + --dataset_name sharegpt \ + --num-prompts 10 \ + --max-loras 2 \ + --max-lora-rank 8 \ + --enable-lora \ + --lora-path yard1/llama-2-7b-sql-lora-test ``` diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 0f13c79ae234b..efd51c79c37cf 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 +import io import json import os import sys @@ -32,6 +33,7 @@ class RequestFuncInput: extra_body: Optional[dict] = None multi_modal_content: Optional[dict] = None ignore_eos: bool = False + language: Optional[str] = None @dataclass @@ -219,7 +221,15 @@ async def async_request_deepspeed_mii( if response.status == 200: parsed_resp = await response.json() output.latency = time.perf_counter() - st - 
output.generated_text = parsed_resp["text"][0] + if "choices" in parsed_resp: + output.generated_text = parsed_resp["choices"][0][ + "text"] + elif "text" in parsed_resp: + output.generated_text = parsed_resp["text"][0] + else: + output.error = ("Unexpected response format: " + "neither 'choices' nor 'text' found") + output.success = False output.success = True else: output.error = response.reason or "" @@ -428,6 +438,110 @@ async def async_request_openai_chat_completions( return output +async def async_request_openai_audio( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + # Lazy import without PlaceholderModule to avoid vllm dep. + import soundfile + api_url = request_func_input.api_url + assert api_url.endswith( + ("transcriptions", "translations" + )), "OpenAI Chat Completions API URL must end with 'transcriptions' " + "or `translations`." + + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: + content = [{"type": "text", "text": request_func_input.prompt}] + payload = { + "model": request_func_input.model_name \ + if request_func_input.model_name else request_func_input.model, + "temperature": 0.0, + "max_completion_tokens": request_func_input.output_len, + "stream": True, + "language": "en", + # Flattened due to multipart/form-data + "stream_include_usage": True, + "stream_continuous_usage_stats": True + } + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + + # Send audio file + def to_bytes(y, sr): + buffer = io.BytesIO() + soundfile.write(buffer, y, sr, format="WAV") + buffer.seek(0) + return buffer + + with to_bytes(*request_func_input.multi_modal_content['audio']) as f: + form = aiohttp.FormData() + form.add_field('file', f, content_type='audio/wav') + for key, value in payload.items(): + form.add_field(key, str(value)) + + output = RequestFuncOutput() + 
output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, + data=form, + headers=headers) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data: ") + if chunk != "[DONE]": + timestamp = time.perf_counter() + data = json.loads(chunk) + + if choices := data.get("choices"): + content = choices[0]["delta"].get( + "content") + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append( + timestamp - most_recent_timestamp) + + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") + + most_recent_timestamp = timestamp + + output.generated_text = generated_text + output.success = True + output.latency = most_recent_timestamp - st + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + def get_model(pretrained_model_name_or_path: str) -> str: if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': from modelscope import snapshot_download @@ -485,7 +599,14 @@ ASYNC_REQUEST_FUNCS = { "deepspeed-mii": async_request_deepspeed_mii, "openai": async_request_openai_completions, "openai-chat": async_request_openai_chat_completions, + "openai-audio": async_request_openai_audio, "tensorrt-llm": async_request_trt_llm, "scalellm": async_request_openai_completions, "sglang": async_request_openai_completions, } + +OPENAI_COMPATIBLE_BACKENDS = [ + k for k, v in ASYNC_REQUEST_FUNCS.items() + if v in (async_request_openai_completions, + 
async_request_openai_chat_completions) +] diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index 0567875f9862f..ccbc6c022f1f9 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -23,7 +23,8 @@ from abc import ABC, abstractmethod from collections.abc import Mapping from dataclasses import dataclass from functools import cache -from typing import Any, Optional, Union +from io import BytesIO +from typing import Any, Callable, Optional, Union import numpy as np import pandas as pd @@ -63,6 +64,7 @@ class SampleRequest: class BenchmarkDataset(ABC): DEFAULT_SEED = 0 + IS_MULTIMODAL = False def __init__( self, @@ -239,21 +241,24 @@ def process_image(image: Any) -> Mapping[str, Any]: """ Process a single image input and return a multimedia content dictionary. - For a PIL.Image.Image input: - - Converts the image to RGB. - - Saves the image as a JPEG in-memory. - - Encodes the JPEG data as a base64 string. - - Returns a dictionary with the image as a base64 data URL. + Supports three input types: - For a string input: - - Treats the string as a URL or file path. - - Prepends "file://" if the string doesn't start with "http://" or - "file://". - - Returns a dictionary with the image URL. + 1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key + containing raw image data. - Loads the bytes as a PIL.Image.Image. + + 2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as + a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns + a dictionary with the image as a base64 data URL. + + 3. String input: - Treats the string as a URL or local file path. - + Prepends "file://" if the string doesn't start with "http://" or + "file://". - Returns a dictionary with the image URL. Raises: - ValueError: If the input is neither a PIL.Image.Image nor a string. + ValueError: If the input is not a supported type. 
""" + if isinstance(image, dict) and 'bytes' in image: + image = Image.open(BytesIO(image['bytes'])) if isinstance(image, Image.Image): image = image.convert("RGB") with io.BytesIO() as image_data: @@ -272,8 +277,8 @@ def process_image(image: Any) -> Mapping[str, Any]: ("http://", "file://")) else f"file://{image}") return {"type": "image_url", "image_url": {"url": image_url}} - raise ValueError( - f"Invalid image input {image}. Must be a PIL.Image.Image or str.") + raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image" + " or str or dictionary with raw image bytes.") # ----------------------------------------------------------------------------- @@ -284,7 +289,7 @@ def process_image(image: Any) -> Mapping[str, Any]: class RandomDataset(BenchmarkDataset): # Default values copied from benchmark_serving.py for the random dataset. DEFAULT_PREFIX_LEN = 0 - DEFAULT_RANGE_RATIO = 1.0 + DEFAULT_RANGE_RATIO = 0.0 DEFAULT_INPUT_LEN = 1024 DEFAULT_OUTPUT_LEN = 128 @@ -304,19 +309,32 @@ class RandomDataset(BenchmarkDataset): output_len: int = DEFAULT_OUTPUT_LEN, **kwargs, ) -> list[SampleRequest]: + # Enforce range_ratio < 1 + assert range_ratio < 1.0, ( + "random_range_ratio must be < 1.0 to ensure a valid sampling range" + ) + vocab_size = tokenizer.vocab_size prefix_token_ids = (np.random.randint( 0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else []) - input_low = int(input_len * range_ratio) - output_low = int(output_len * range_ratio) + # New sampling logic: [X * (1 - b), X * (1 + b)] + input_low = int(input_len * (1 - range_ratio)) + input_high = int(input_len * (1 + range_ratio)) + output_low = int(output_len * (1 - range_ratio)) + output_high = int(output_len * (1 + range_ratio)) + + # Add logging for debugging + logger.info("Sampling input_len from [%s, %s]", input_low, input_high) + logger.info("Sampling output_len from [%s, %s]", output_low, + output_high) input_lens = np.random.randint(input_low, - input_len + 1, + input_high + 1, 
size=num_requests) output_lens = np.random.randint(output_low, - output_len + 1, + output_high + 1, size=num_requests) offsets = np.random.randint(0, vocab_size, size=num_requests) @@ -468,11 +486,11 @@ class SonnetDataset(BenchmarkDataset): # Determine how many poem lines to use. num_input_lines = round((input_len - base_offset) / avg_len) - num_prefix_lines = round((prefix_len - base_offset) / avg_len) + num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0) prefix_lines = self.data[:num_prefix_lines] samples = [] - for _ in range(num_requests): + while len(samples) < num_requests: extra_lines = random.choices(self.data, k=num_input_lines - num_prefix_lines) prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}" @@ -480,13 +498,14 @@ class SonnetDataset(BenchmarkDataset): prompt_formatted = tokenizer.apply_chat_template( msg, add_generation_prompt=True, tokenize=False) prompt_len = len(tokenizer(prompt_formatted).input_ids) - samples.append( - SampleRequest( - prompt=prompt_formatted - if return_prompt_formatted else prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - )) + if prompt_len <= input_len: + samples.append( + SampleRequest( + prompt=prompt_formatted + if return_prompt_formatted else prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + )) return samples @@ -562,48 +581,48 @@ class BurstGPTDataset(BenchmarkDataset): # ----------------------------------------------------------------------------- -# HuggingFace Dataset Implementation +# HuggingFace Dataset Base Implementation # ----------------------------------------------------------------------------- - - class HuggingFaceDataset(BenchmarkDataset): - """ - Dataset class for processing a HuggingFace dataset with conversation data - and optional images. 
- """ + """Base class for datasets hosted on HuggingFace.""" + + SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set() def __init__( self, + dataset_path: str, dataset_split: str, dataset_subset: Optional[str] = None, **kwargs, ) -> None: - super().__init__(**kwargs) + super().__init__(dataset_path=dataset_path, **kwargs) + self.dataset_split = dataset_split self.dataset_subset = dataset_subset - self.load_data() def load_data(self) -> None: - if not self.dataset_path: - raise ValueError("dataset_path must be provided for loading data.") - + """Load data from HuggingFace datasets.""" self.data = load_dataset( self.dataset_path, name=self.dataset_subset, split=self.dataset_split, streaming=True, ) - if self.data.features is None or "conversations" \ - not in self.data.features: - raise ValueError( - "HuggingFaceDataset currently only supports datasets with " - "a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. " - "Please consider contributing if you would like to add " - "support for additional dataset formats.") - # Shuffle and filter examples with at least 2 conversations. 
- self.data = self.data.shuffle(seed=self.random_seed).filter( - lambda x: len(x["conversations"]) >= 2) + self.data = self.data.shuffle(seed=self.random_seed) + + +# ----------------------------------------------------------------------------- +# Conversation Dataset Implementation +# ----------------------------------------------------------------------------- + + +class ConversationDataset(HuggingFaceDataset): + """Dataset for conversation data with multimodal support.""" + SUPPORTED_DATASET_PATHS = { + 'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered' + } + IS_MULTIMODAL = True def sample(self, tokenizer: PreTrainedTokenizerBase, @@ -611,10 +630,13 @@ class HuggingFaceDataset(BenchmarkDataset): output_len: Optional[int] = None, enable_multimodal_chat: bool = False, **kwargs) -> list: + # Filter examples with at least 2 conversations + filtered_data = self.data.filter( + lambda x: len(x["conversations"]) >= 2) sampled_requests = [] dynamic_output = output_len is None - for item in self.data: + for item in filtered_data: if len(sampled_requests) >= num_requests: break conv = item["conversations"] @@ -659,29 +681,13 @@ class VisionArenaDataset(HuggingFaceDataset): """ DEFAULT_OUTPUT_LEN = 128 - VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1" - - def __init__( - self, - **kwargs, - ) -> None: - super().__init__(**kwargs) - if self.dataset_path != self.VISION_ARENA_DATASET_PATH: - raise ValueError(f"Only support Vision Arena dataset.\ - This data path {self.dataset_path} is not valid.") - if self.dataset_subset is None and self.dataset_split != "train": - raise ValueError("Dataset split must be 'train'.") - - self.load_data() - - def load_data(self) -> None: - dataset = load_dataset( - self.dataset_path, - name=self.dataset_subset, - split=self.dataset_split, - streaming=True, - ) - self.data = dataset.shuffle(seed=self.random_seed) + SUPPORTED_DATASET_PATHS = { + "lmarena-ai/VisionArena-Chat": + lambda x: 
x["conversation"][0][0]["content"], + "lmarena-ai/vision-arena-bench-v0.1": + lambda x: x["turns"][0][0]["content"] + } + IS_MULTIMODAL = True def sample( self, @@ -697,7 +703,11 @@ class VisionArenaDataset(HuggingFaceDataset): for item in self.data: if len(sampled_requests) >= num_requests: break - prompt = item["turns"][0][0]["content"] + parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path) + if parser_fn is None: + raise ValueError( + f"Unsupported dataset path: {self.dataset_path}") + prompt = parser_fn(item) mm_content = process_image(item["images"][0]) prompt_len = len(tokenizer(prompt).input_ids) if enable_multimodal_chat: @@ -715,3 +725,173 @@ class VisionArenaDataset(HuggingFaceDataset): )) self.maybe_oversample_requests(sampled_requests, num_requests) return sampled_requests + + +# ----------------------------------------------------------------------------- +# Instruct Coder Dataset Implementation +# ----------------------------------------------------------------------------- + + +class InstructCoderDataset(HuggingFaceDataset): + """ + InstructCoder Dataset. + https://huggingface.co/datasets/likaixin/InstructCoder + + InstructCoder is the dataset designed for general code editing. It consists + of 114,239 instruction-input-output triplets, and covers multiple distinct + code editing scenario. 
+ """ + + DEFAULT_OUTPUT_LEN = 200 # this is the average default output length + SUPPORTED_DATASET_PATHS = { + "likaixin/InstructCoder", + } + + def sample(self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + **kwargs) -> list: + output_len = (output_len + if output_len is not None else self.DEFAULT_OUTPUT_LEN) + sampled_requests = [] + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt = f"{item['instruction']}:\n{item['input']}" + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + )) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# AIMO Dataset Implementation +# ----------------------------------------------------------------------------- + + +class AIMODataset(HuggingFaceDataset): + """ + Dataset class for processing a AIMO dataset with reasoning questions. 
+ """ + SUPPORTED_DATASET_PATHS = { + "AI-MO/aimo-validation-aime", "AI-MO/NuminaMath-1.5", + "AI-MO/NuminaMath-CoT" + } + + def sample(self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + **kwargs) -> list: + sampled_requests = [] + dynamic_output = output_len is None + + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt, completion = item['problem'], item["solution"] + + prompt_ids = tokenizer(prompt).input_ids + completion_ids = tokenizer(completion).input_ids + prompt_len = len(prompt_ids) + completion_len = len(completion_ids) + output_len = completion_len if dynamic_output else output_len + assert isinstance(output_len, int) and output_len > 0 + if dynamic_output and not is_valid_sequence(prompt_len, + completion_len, + max_prompt_len=2048, + max_total_len=32000): + continue + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=None, + )) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# ASR Dataset Implementation +# ----------------------------------------------------------------------------- + + +class ASRDataset(HuggingFaceDataset): + """ + Dataset class for processing a ASR dataset for transcription. + Tested on the following set: + + +----------------+----------------------------------------+--------------------------+-----------------------------+ + | Dataset | Domain | Speaking Style | hf-subset | + +----------------+----------------------------------------+--------------------------+-----------------------------+ + | TED-LIUM | TED talks | Oratory | release1, release2, release3| + | | | | release3-speaker-adaptation | + | VoxPopuli | European Parliament | Oratory | en, de, it, fr, ... 
| + | LibriSpeech | Audiobook | Narrated | "LIUM/tedlium" | + | GigaSpeech | Audiobook, podcast, YouTube | Narrated, spontaneous | xs, s, m, l, xl, dev, test | + | SPGISpeech | Financial meetings | Oratory, spontaneous | S, M, L, dev, test | + | AMI | Meetings | Spontaneous | ihm, sdm | + +----------------+----------------------------------------+--------------------------+-----------------------------+ + + """ # noqa: E501 + SUPPORTED_DATASET_PATHS = { + "openslr/librispeech_asr", "facebook/voxpopuli", "LIUM/tedlium", + "edinburghcstr/ami", "speechcolab/gigaspeech", "kensho/spgispeech" + } + + DEFAULT_OUTPUT_LEN = 128 + IS_MULTIMODAL = True + + # TODO Whisper-specific. Abstract interface when more models are supported. + TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|>"\ + "<|notimestamps|>" + skip_long_audios: bool = True + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + **kwargs, + ) -> list: + import librosa + output_len = (output_len + if output_len is not None else self.DEFAULT_OUTPUT_LEN) + prompt = ASRDataset.TRANSCRIPTION_PREAMBLE + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests = [] + skipped = 0 + for item in self.data: + if len(sampled_requests) >= num_requests: + break + audio = item["audio"] + y, sr = audio["array"], audio["sampling_rate"] + duration_s = librosa.get_duration(y=y, sr=sr) + # Whisper max supported duration + if self.skip_long_audios and duration_s > 30: + skipped += 1 + continue + + mm_content = {"audio": (y, sr)} + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + )) + if skipped: + logger.warning("%d samples discarded from dataset due to" \ + " their length being greater than" \ + " what Whisper supports.", skipped) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests diff --git 
a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 69cb7d7038cf7..5543ccffbf078 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -7,9 +7,6 @@ On the server side, run one of the following commands: --swap-space 16 \ --disable-log-requests - (TGI backend) - ./launch_tgi_server.sh - On the client side, run: python benchmarks/benchmark_serving.py \ --backend \ @@ -37,7 +34,8 @@ from datetime import datetime from typing import Any, Optional import numpy as np -from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, +from backend_request_func import (ASYNC_REQUEST_FUNCS, + OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput, RequestFuncOutput) from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase @@ -52,9 +50,11 @@ try: except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser -from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset, - RandomDataset, SampleRequest, ShareGPTDataset, - SonnetDataset, VisionArenaDataset) +from benchmark_dataset import (AIMODataset, ASRDataset, BurstGPTDataset, + ConversationDataset, HuggingFaceDataset, + InstructCoderDataset, RandomDataset, + SampleRequest, ShareGPTDataset, SonnetDataset, + VisionArenaDataset) from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -156,7 +156,7 @@ def calculate_metrics( if outputs[i].success: output_len = outputs[i].output_tokens - if output_len is None: + if not output_len: # We use the tokenizer to count the number of output tokens # for some serving backends instead of looking at # len(outputs[i].itl) since multiple output tokens may be @@ -261,6 +261,7 @@ async def benchmark( goodput_config_dict: dict[str, float], max_concurrency: Optional[int], lora_modules: Optional[Iterable[str]], + extra_body: Optional[dict], ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -273,10 
+274,6 @@ async def benchmark( input_requests[0].expected_output_len, \ input_requests[0].multi_modal_data - if backend != "openai-chat" and test_mm_content is not None: - # multi-modal benchmark is only available on OpenAI Chat backend. - raise ValueError( - "Multi-modal content is only supported on 'openai-chat' backend.") assert test_mm_content is None or isinstance(test_mm_content, dict) test_input = RequestFuncInput( model=model_id, @@ -288,6 +285,7 @@ async def benchmark( logprobs=logprobs, multi_modal_content=test_mm_content, ignore_eos=ignore_eos, + extra_body=extra_body, ) test_output = await request_func(request_func_input=test_input) @@ -314,7 +312,8 @@ async def benchmark( output_len=test_output_len, logprobs=logprobs, multi_modal_content=test_mm_content, - ignore_eos=ignore_eos) + ignore_eos=ignore_eos, + extra_body=extra_body) profile_output = await request_func(request_func_input=profile_input) if profile_output.success: print("Profiler started") @@ -364,7 +363,8 @@ async def benchmark( output_len=output_len, logprobs=logprobs, multi_modal_content=mm_content, - ignore_eos=ignore_eos) + ignore_eos=ignore_eos, + extra_body=extra_body) tasks.append( asyncio.create_task( limited_request_func(request_func_input=request_func_input, @@ -586,19 +586,49 @@ def main(args: argparse.Namespace): return_prompt_formatted=True) elif args.dataset_name == "hf": - # Choose between VisionArenaDataset - # and HuggingFaceDataset based on provided parameters. 
- dataset_class = (VisionArenaDataset if args.dataset_path - == VisionArenaDataset.VISION_ARENA_DATASET_PATH - and args.hf_subset is None else HuggingFaceDataset) + # all following datasets are implemented from the + # HuggingFaceDataset base class + if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: + dataset_class = VisionArenaDataset + args.hf_split = "train" + args.hf_subset = None + elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: + dataset_class = InstructCoderDataset + args.hf_split = "train" + elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: + dataset_class = ConversationDataset + elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: + dataset_class = AIMODataset + args.hf_split = "train" + elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS: + dataset_class = ASRDataset + args.hf_split = "train" + else: + supported_datasets = set([ + dataset_name for cls in HuggingFaceDataset.__subclasses__() + for dataset_name in cls.SUPPORTED_DATASET_PATHS + ]) + raise ValueError( + f"Unsupported dataset path: {args.dataset_path}. " + "Huggingface dataset only supports dataset_path" + f" from one of following: {supported_datasets}. " + "Please consider contributing if you would " + "like to add support for additional dataset formats.") + + if (dataset_class.IS_MULTIMODAL and backend not in \ + ["openai-chat", "openai-audio"]): + # multi-modal benchmark is only available on OpenAI Chat backend. 
+ raise ValueError( + "Multi-modal content is only supported on 'openai-chat' and " \ + "'openai-audio' backend.") input_requests = dataset_class( dataset_path=args.dataset_path, dataset_subset=args.hf_subset, dataset_split=args.hf_split, + random_seed=args.seed, ).sample( num_requests=args.num_prompts, tokenizer=tokenizer, - random_seed=args.seed, output_len=args.hf_output_len, ) @@ -633,6 +663,26 @@ def main(args: argparse.Namespace): raise ValueError(f"Unknown dataset: {args.dataset_name}") from err goodput_config_dict = check_goodput_args(args) + # Collect the sampling parameters. + sampling_params = { + k: v + for k, v in { + "top_p": args.top_p, + "top_k": args.top_k, + "min_p": args.min_p, + "temperature": args.temperature + }.items() if v is not None + } + + # Sampling parameters are only supported by openai-compatible backend. + if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS: + raise ValueError( + "Sampling parameters are only supported by openai-compatible " + "backends.") + + if "temperature" not in sampling_params: + sampling_params["temperature"] = 0.0 # Default to greedy decoding. + # Avoid GC processing "static" data - reduce pause times. gc.collect() gc.freeze() @@ -659,6 +709,7 @@ def main(args: argparse.Namespace): goodput_config_dict=goodput_config_dict, max_concurrency=args.max_concurrency, lora_modules=args.lora_modules, + extra_body=sampling_params, )) # Save config and results to json @@ -881,7 +932,7 @@ if __name__ == "__main__": "--percentile-metrics", type=str, default="ttft,tpot,itl", - help="Comma-seperated list of selected metrics to report percentils. " + help="Comma-separated list of selected metrics to report percentils. " "This argument specifies the metrics to report percentiles. " "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". 
" "Default value is \"ttft,tpot,itl\".") @@ -889,7 +940,7 @@ if __name__ == "__main__": "--metric-percentiles", type=str, default="99", - help="Comma-seperated list of percentiles for selected metrics. " + help="Comma-separated list of percentiles for selected metrics. " "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " "Default value is \"99\". " "Use \"--percentile-metrics\" to select metrics.", @@ -956,18 +1007,23 @@ if __name__ == "__main__": random_group.add_argument( "--random-range-ratio", type=float, - default=1.0, - help="Range of sampled ratio of input/output length, " - "used only for random sampling.", + default=0.0, + help="Range ratio for sampling input/output length, " + "used only for random sampling. Must be in the range [0, 1) to define " + "a symmetric sampling range" + "[length * (1 - range_ratio), length * (1 + range_ratio)].", ) random_group.add_argument( "--random-prefix-len", type=int, default=0, - help="Number of fixed prefix tokens before random " - " context. The length range of context in a random " - " request is [random-prefix-len, " - " random-prefix-len + random-prefix-len * random-range-ratio).") + help=("Number of fixed prefix tokens before the random context " + "in a request. " + "The total input length is the sum of `random-prefix-len` and " + "a random " + "context length sampled from [input_len * (1 - range_ratio), " + "input_len * (1 + range_ratio)]."), + ) hf_group = parser.add_argument_group("hf dataset options") hf_group.add_argument("--hf-subset", @@ -986,6 +1042,33 @@ if __name__ == "__main__": "from the sampled HF dataset.", ) + sampling_group = parser.add_argument_group("sampling parameters") + sampling_group.add_argument( + "--top-p", + type=float, + default=None, + help="Top-p sampling parameter. Only has effect on openai-compatible " + "backends.") + sampling_group.add_argument( + "--top-k", + type=int, + default=None, + help="Top-k sampling parameter. 
Only has effect on openai-compatible " + "backends.") + sampling_group.add_argument( + "--min-p", + type=float, + default=None, + help="Min-p sampling parameter. Only has effect on openai-compatible " + "backends.") + sampling_group.add_argument( + "--temperature", + type=float, + default=None, + help="Temperature sampling parameter. Only has effect on " + "openai-compatible backends. If not specified, default to greedy " + "decoding (i.e. temperature==0.0).") + parser.add_argument( '--tokenizer-mode', type=str, diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index c79a93faff197..5dd9b1dbd4611 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -5,16 +5,13 @@ On the server side, run one of the following commands: (vLLM OpenAI API server) vllm serve --disable-log-requests - (TGI backend) - ./launch_tgi_server.sh - On the client side, run: python benchmarks/benchmark_serving_structured_output.py \ --backend \ --model \ --dataset json \ --structured-output-ratio 1.0 \ - --structured-output-backend xgrammar \ + --structured-output-backend auto \ --request-rate 10 \ --num-prompts 1000 @@ -54,7 +51,7 @@ try: except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser -from vllm.v1.structured_output.utils import ( +from vllm.v1.structured_output.backend_xgrammar import ( has_xgrammar_unsupported_json_features) MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -133,10 +130,11 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, "description": "An unique optional field to avoid cached schemas" } + else: + json_schemas = [schema] * args.num_prompts def gen_prompt(index: int): - schema = json_schemas[index % len(json_schemas)] - return f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501 + return f"Generate an example of a user profile given the following schema: 
{json.dumps(get_schema(index))}" # noqa: E501 def get_schema(index: int): return json_schemas[index % len(json_schemas)] @@ -966,7 +964,7 @@ if __name__ == "__main__": "--percentile-metrics", type=str, default="ttft,tpot,itl", - help="Comma-seperated list of selected metrics to report percentils. " + help="Comma-separated list of selected metrics to report percentils. " "This argument specifies the metrics to report percentiles. " "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " "Default value is \"ttft,tpot,itl\".") @@ -974,7 +972,7 @@ if __name__ == "__main__": "--metric-percentiles", type=str, default="99", - help="Comma-seperated list of percentiles for selected metrics. " + help="Comma-separated list of percentiles for selected metrics. " "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " "Default value is \"99\". " "Use \"--percentile-metrics\" to select metrics.", @@ -999,12 +997,14 @@ if __name__ == "__main__": type=float, default=1.0, help="Ratio of Structured Outputs requests") - parser.add_argument( - "--structured-output-backend", - type=str, - choices=["outlines", "lm-format-enforcer", "xgrammar", "guidance"], - default="xgrammar", - help="Backend to use for structured outputs") + parser.add_argument("--structured-output-backend", + type=str, + choices=[ + "outlines", "lm-format-enforcer", "xgrammar", + "guidance", "auto" + ], + default="auto", + help="Backend to use for structured outputs") args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 53869db478c51..1f65277e1bfeb 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -11,7 +11,8 @@ from typing import Any, Optional, Union import torch import uvloop -from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset, +from benchmark_dataset import (AIMODataset, BurstGPTDataset, + ConversationDataset, InstructCoderDataset, RandomDataset, 
SampleRequest, ShareGPTDataset, SonnetDataset, VisionArenaDataset) from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json @@ -212,14 +213,17 @@ def run_hf( max_prompt_len = 0 max_output_len = 0 for i in range(len(requests)): - prompt, prompt_len, output_len = requests[i] + prompt = requests[i].prompt + prompt_len = requests[i].prompt_len + output_len = requests[i].expected_output_len # Add the prompt to the batch. batch.append(prompt) max_prompt_len = max(max_prompt_len, prompt_len) max_output_len = max(max_output_len, output_len) if len(batch) < max_batch_size and i != len(requests) - 1: # Check if we can add more requests to the batch. - _, next_prompt_len, next_output_len = requests[i + 1] + next_prompt_len = requests[i + 1].prompt_len + next_output_len = requests[i + 1].expected_output_len if (max(max_prompt_len, next_prompt_len) + max(max_output_len, next_output_len)) <= 2048: # We can add more requests to the batch. @@ -300,6 +304,7 @@ def get_requests(args, tokenizer): "input_len": args.input_len, "output_len": args.output_len, } + if args.dataset_path is None or args.dataset_name == "random": sample_kwargs["range_ratio"] = args.random_range_ratio sample_kwargs["prefix_len"] = args.prefix_len @@ -317,18 +322,23 @@ def get_requests(args, tokenizer): elif args.dataset_name == "burstgpt": dataset_cls = BurstGPTDataset elif args.dataset_name == "hf": - if args.backend != "vllm-chat": - raise ValueError( - "hf datasets only are supported by vllm-chat backend") - # Choose between VisionArenaDataset and HuggingFaceDataset based on - # provided parameters. 
- dataset_cls = (VisionArenaDataset if args.dataset_path - == VisionArenaDataset.VISION_ARENA_DATASET_PATH - and args.hf_subset is None else HuggingFaceDataset) - common_kwargs['dataset_subset'] = args.hf_subset - common_kwargs['dataset_split'] = args.hf_split - sample_kwargs["enable_multimodal_chat"] = True - + if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: + dataset_cls = VisionArenaDataset + common_kwargs['dataset_subset'] = None + common_kwargs['dataset_split'] = "train" + sample_kwargs["enable_multimodal_chat"] = True + elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: + dataset_cls = InstructCoderDataset + common_kwargs['dataset_split'] = "train" + elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: + dataset_cls = ConversationDataset + common_kwargs['dataset_subset'] = args.hf_subset + common_kwargs['dataset_split'] = args.hf_split + sample_kwargs["enable_multimodal_chat"] = True + elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: + dataset_cls = AIMODataset + common_kwargs['dataset_subset'] = None + common_kwargs['dataset_split'] = "train" else: raise ValueError(f"Unknown dataset name: {args.dataset_name}") # Remove None values @@ -462,9 +472,17 @@ def validate_args(args): warnings.warn("--hf-subset and --hf-split will be ignored \ since --dataset-name is not 'hf'.", stacklevel=2) - elif args.dataset_name == "hf" and args.backend != "vllm-chat": - raise ValueError( - "When --dataset-name is 'hf', backend must be 'vllm-chat'") + elif args.dataset_name == "hf": + if args.dataset_path in ( + VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys() + | ConversationDataset.SUPPORTED_DATASET_PATHS): + assert args.backend == "vllm-chat", f"{args.dataset_path} needs to use vllm-chat as the backend." 
#noqa: E501 + elif args.dataset_path in (InstructCoderDataset.SUPPORTED_DATASET_PATHS + | AIMODataset.SUPPORTED_DATASET_PATHS): + assert args.backend == "vllm", f"{args.dataset_path} needs to use vllm as the backend." #noqa: E501 + else: + raise ValueError( + f"{args.dataset_path} is not supported by hf dataset.") # --random-range-ratio: only used when dataset_name is 'random' if args.dataset_name != 'random' and args.random_range_ratio is not None: @@ -505,6 +523,13 @@ def validate_args(args): raise ValueError( "Tokenizer must be the same as the model for MII backend.") + # --data-parallel is not supported currently. + # https://github.com/vllm-project/vllm/issues/16222 + if args.data_parallel_size > 1: + raise ValueError( + "Data parallel is not supported in offline benchmark, \ + please use benchmark serving instead") + if __name__ == "__main__": parser = FlexibleArgumentParser(description="Benchmark the throughput.") @@ -576,18 +601,30 @@ if __name__ == "__main__": default=None, help="Path to the lora adapters to use. This can be an absolute path, " "a relative path, or a Hugging Face model identifier.") - parser.add_argument("--prefix-len", - type=int, - default=None, - help="Number of prefix tokens per request." - "This is for the RandomDataset and SonnetDataset") + parser.add_argument( + "--prefix-len", + type=int, + default=None, + help=f"Number of prefix tokens to be used in RandomDataset " + "and SonnetDataset. For RandomDataset, the total input " + "length is the sum of prefix-len (default: " + f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length " + "sampled from [input_len * (1 - range_ratio), " + "input_len * (1 + range_ratio)]. 
For SonnetDataset, " + f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) " + "controls how much of the input is fixed lines versus " + "random lines, but the total input length remains approximately " + "input_len tokens.") # random dataset parser.add_argument( "--random-range-ratio", type=float, default=None, - help="Range of sampled ratio of input/output length, " - "used only for RandomDataSet.", + help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) " + "for sampling input/output length, " + "used only for RandomDataset. Must be in the range [0, 1) to " + "define a symmetric sampling range " + "[length * (1 - range_ratio), length * (1 + range_ratio)].", ) # hf dtaset diff --git a/benchmarks/kernels/benchmark_bitblas.py b/benchmarks/kernels/benchmark_bitblas.py new file mode 100644 index 0000000000000..b23b4f3ea685a --- /dev/null +++ b/benchmarks/kernels/benchmark_bitblas.py @@ -0,0 +1,236 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from vllm.model_executor.layers.quantization.utils.bitblas_utils import ( + MINIMUM_BITBLAS_VERSION) + +try: + import bitblas + if bitblas.__version__ < MINIMUM_BITBLAS_VERSION: + raise ImportError("bitblas version is wrong. Please " + f"install bitblas>={MINIMUM_BITBLAS_VERSION}") +except ImportError as e: + bitblas_import_exception = e + raise ValueError("Trying to use the bitblas backend, but could not import" + f"with the following error: {bitblas_import_exception}. 
" + "Please install bitblas through the following command: " + f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`" + ) from bitblas_import_exception + +from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target + +from vllm.utils import FlexibleArgumentParser + +parser = FlexibleArgumentParser( + description="Benchmark BitBLAS int4 on a specific target.") + +# Add arguments to the parser +parser.add_argument( + "--target", + type=str, + default=auto_detect_nvidia_target(), + help="Specify the target device for benchmarking.", +) +parser.add_argument("--group_size", + type=int, + default=None, + help="Group size for grouped quantization.") +parser.add_argument( + "--A_dtype", + type=str, + default="float16", + choices=["float16", "float32", "float64", "int32", "int8"], + help="Data type of activation A.", +) +parser.add_argument( + "--W_dtype", + type=str, + default="int4", + choices=[ + "float16", + "float32", + "float64", + "int32", + "int8", + "int4", + "int2", + "int1", + "nf4", + "fp4_e2m1", + ], + help="Data type of weight W.", +) +parser.add_argument( + "--accum_dtype", + type=str, + default="float16", + choices=["float16", "int32"], + help="Data type for accumulation.", +) +parser.add_argument( + "--out_dtype", + type=str, + default="float16", + choices=["float16", "float32", "int32", "int8"], + help="Data type for output.", +) +parser.add_argument( + "--layout", + type=str, + default="nt", + choices=["nt", "nn"], + help="Matrix layout, 'nt' for non-transpose A and transpose W.", +) +parser.add_argument("--with_bias", + action="store_true", + help="Include bias in the benchmark.") +parser.add_argument( + "--with_scaling", + action="store_true", + help="Include scaling factor in the quantization.", +) +parser.add_argument("--with_zeros", + action="store_true", + help="Include zeros in the quantization.") +parser.add_argument( + "--zeros_mode", + type=str, + default=None, + choices=["original", "rescale", "quantized"], + help="Specify the mode for 
calculating zeros.", +) + +# Parse the arguments +args = parser.parse_args() + +# Assign arguments to variables +target = args.target +A_dtype = args.A_dtype +W_dtype = args.W_dtype +accum_dtype = args.accum_dtype +out_dtype = args.out_dtype +layout = args.layout +with_bias = args.with_bias +group_size = args.group_size +with_scaling = args.with_scaling +with_zeros = args.with_zeros +zeros_mode = args.zeros_mode + +# Define a list of shared arguments that repeat in every config +shared_args = [ + A_dtype, + W_dtype, + out_dtype, + accum_dtype, + layout, + with_bias, + group_size, + with_scaling, + with_zeros, + zeros_mode, +] + +# Define just the (M, K, N) shapes in a more compact list +shapes = [ + # square test + (1, 16384, 16384), + # BLOOM-176B + (1, 43008, 14336), + (1, 14336, 14336), + (1, 57344, 14336), + (1, 14336, 57344), + # OPT-65B + (1, 9216, 9216), + (1, 36864, 9216), + (1, 9216, 36864), + (1, 22016, 8192), + # LLAMA-70B/65B + (1, 8192, 22016), + (1, 8192, 8192), + (1, 28672, 8192), + (1, 8192, 28672), + # square test + (16384, 16384, 16384), + # BLOOM-176B + (8192, 43008, 14336), + (8192, 14336, 14336), + (8192, 57344, 14336), + (8192, 14336, 57344), + # OPT-65B + (8192, 9216, 9216), + (8192, 36864, 9216), + (8192, 9216, 36864), + (8192, 22016, 8192), + # LLAMA-70B/65B + (8192, 8192, 22016), + (8192, 8192, 8192), + (8192, 28672, 8192), + (8192, 8192, 28672), +] + +# Build test shapes with all the shared arguments +test_shapes = [(MatmulConfig, Matmul, (*shape, *shared_args)) + for shape in shapes] + +benchmark_sets = [] +benchmark_sets.extend(test_shapes) + +benchmark_results = {} +for config_class, operator, input_args in benchmark_sets: + config = config_class(*input_args) + matmul = operator(config, target=target, enable_tuning=True) + kernel_latency = matmul.profile_latency() + + print("Time cost is: {:.3f} ms".format(kernel_latency)) + + profile_config = { + f"{operator.__name__}-{'-'.join([str(i) for i in input_args])}": { + 
"BitBLAS_top20_latency": kernel_latency, + } + } + + benchmark_results.update(profile_config) + +# Define headers for the table +headers = [ + "PrimFunc", + "Input Arguments", + "BitBLAS Top20 Latency", +] + +# Calculate column widths for pretty printing +col_widths = [0, 0, 0] +for config_key, values in benchmark_results.items(): + args_split = config_key.split("-") + func_name = args_split[0] + input_args_str = "-".join(args_split[1:]) + col_widths[0] = max(col_widths[0], len(func_name) + 2, len(headers[0]) + 2) + col_widths[1] = max(col_widths[1], + len(input_args_str) + 2, + len(headers[1]) + 2) + col_widths[2] = max(col_widths[2], + len(f"{values['BitBLAS_top20_latency']:.3f} ms") + 2, + len(headers[2]) + 2) + # break only if you want to measure widths from a single example; + # otherwise, let it loop over all items. + +# Print header +for i, header in enumerate(headers): + headers[i] = header.ljust(col_widths[i]) +print("".join(headers)) +print("-" * sum(col_widths)) + +# Print rows +for config_key, values in benchmark_results.items(): + args_split = config_key.split("-") + func_name = args_split[0] + input_args_str = "-".join(args_split[1:]) + row = [ + func_name, + input_args_str, + f"{values['BitBLAS_top20_latency']:.3f} ms", + ] + row_str = "".join( + [str(cell).ljust(col_widths[idx]) for idx, cell in enumerate(row)]) + print(row_str) diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py new file mode 100644 index 0000000000000..bcdbf6c7551a3 --- /dev/null +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -0,0 +1,340 @@ +# SPDX-License-Identifier: Apache-2.0 + +import torch +import torch.utils.benchmark as benchmark +from benchmark_shapes import WEIGHT_SHAPES_MOE + +from vllm import _custom_ops as ops +from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.fused_moe import (cutlass_moe_fp8, + fused_experts, + 
fused_topk) +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = [ + "nm-testing/Mixtral-8x7B-Instruct-v0.1", "nm-testing/deepseekv2-lite", + "ibm-granite/granite-3.0-1b-a400m", "ibm-granite/granite-3.0-3b-a800m" +] +DEFAULT_BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128, 256, 512] +DEFAULT_TP_SIZES = [1] + +PER_ACT_TOKEN_OPTS = [False] +PER_OUT_CH_OPTS = [False] + + +def to_fp8(tensor: torch.Tensor): + finfo = torch.finfo(torch.float8_e4m3fn) + return torch.round(tensor.clamp( + min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) + + +def bench_run(results: list[benchmark.Measurement], model: str, + num_experts: int, topk: int, per_act_token: bool, + per_out_ch: bool, mkn: tuple[int, int, int]): + label = "Quant Matmul" + + sub_label = ( + "{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, " + "MKN=({})".format(model, num_experts, topk, per_act_token, per_out_ch, + mkn)) + + print(f"Testing: {sub_label}") + + (m, k, n) = mkn + + dtype = torch.half + + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((num_experts, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((num_experts, k, n), device="cuda", dtype=dtype) / 10 + + _, a_scale = ops.scaled_fp8_quant(a) + + w1_q = torch.empty((num_experts, 2 * n, k), + device="cuda", + dtype=torch.float8_e4m3fn) + w2_q = torch.empty((num_experts, k, n), + device="cuda", + dtype=torch.float8_e4m3fn) + w1_scale = torch.empty((num_experts, 1, 1), + device="cuda", + dtype=torch.float32) + w2_scale = torch.empty((num_experts, 1, 1), + device="cuda", + dtype=torch.float32) + + ab_strides1 = torch.full((num_experts, ), + k, + device="cuda", + dtype=torch.int64) + c_strides1 = torch.full((num_experts, ), + 2 * n, + device="cuda", + dtype=torch.int64) + ab_strides2 = torch.full((num_experts, ), + n, + device="cuda", + dtype=torch.int64) + c_strides2 = torch.full((num_experts, ), + k, + device="cuda", + dtype=torch.int64) + + for expert in range(num_experts): + 
w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert]) + w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert]) + w1_q_notransp = w1_q.clone() + w2_q_notransp = w2_q.clone() + w1_q = w1_q.transpose(1, 2) + w2_q = w2_q.transpose(1, 2) + + score = torch.randn((m, num_experts), device="cuda", dtype=dtype) + + topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False) + + def run_triton_moe(a: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, + topk_weights: torch.Tensor, topk_ids: torch.Tensor, + w1_scale: torch.Tensor, w2_scale: torch.Tensor, + a_scale: torch.Tensor, num_repeats: int): + for _ in range(num_repeats): + fused_experts(a, + w1, + w2, + topk_weights, + topk_ids, + use_fp8_w8a8=True, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_scale) + + def run_cutlass_moe(a: torch.Tensor, a_scale: torch.Tensor, + w1: torch.Tensor, w2: torch.Tensor, + w1_scale: torch.Tensor, w2_scale: torch.Tensor, + topk_weights: torch.Tensor, topk_ids: torch.Tensor, + ab_strides1: torch.Tensor, c_strides1: torch.Tensor, + ab_strides2: torch.Tensor, c_strides2: torch.Tensor, + num_repeats: int): + for _ in range(num_repeats): + cutlass_moe_fp8(a, + w1, + w2, + w1_scale, + w2_scale, + topk_weights, + topk_ids, + ab_strides1, + c_strides1, + ab_strides2, + c_strides2, + a1_scale=a_scale) + + def run_cutlass_from_graph( + a: torch.Tensor, a_scale: torch.Tensor, w1_q: torch.Tensor, + w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, + topk_weights: torch.Tensor, topk_ids: torch.Tensor, + ab_strides1: torch.Tensor, c_strides1: torch.Tensor, + ab_strides2: torch.Tensor, c_strides2: torch.Tensor): + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig( + pipeline_parallel_size=1))): + return cutlass_moe_fp8(a, + w1_q, + w2_q, + w1_scale, + w2_scale, + topk_weights, + topk_ids, + ab_strides1, + c_strides1, + ab_strides2, + c_strides2, + a1_scale=a_scale) + + def run_triton_from_graph(a: torch.Tensor, w1: 
torch.Tensor, + w2: torch.Tensor, topk_weights: torch.Tensor, + topk_ids: torch.Tensor, w1_scale: torch.Tensor, + w2_scale: torch.Tensor, a_scale: torch.Tensor): + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig( + pipeline_parallel_size=1))): + return fused_experts(a, + w1, + w2, + topk_weights, + topk_ids, + use_fp8_w8a8=True, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_scale) + + def replay_graph(graph, num_repeats): + for _ in range(num_repeats): + graph.replay() + torch.cuda.synchronize() + + cutlass_stream = torch.cuda.Stream() + cutlass_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(cutlass_graph, stream=cutlass_stream): + run_cutlass_from_graph(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, + topk_weights, topk_ids, ab_strides1, c_strides1, + ab_strides2, c_strides2) + torch.cuda.synchronize() + + triton_stream = torch.cuda.Stream() + triton_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(triton_graph, stream=triton_stream): + run_triton_from_graph(a, w1_q_notransp, w2_q_notransp, topk_weights, + topk_ids, w1_scale, w2_scale, a_scale) + torch.cuda.synchronize() + + min_run_time = 5 + num_warmup = 5 + num_runs = 25 + + globals = { + # Baseline params + "w1": w1, + "w2": w2, + "score": score, + "topk": topk, + "w1_q_notransp": w1_q_notransp, + "w2_q_notransp": w2_q_notransp, + # Cutlass params + "a_scale": a_scale, + "w1_q": w1_q, + "w2_q": w2_q, + "w1_scale": w1_scale, + "w2_scale": w2_scale, + "ab_strides1": ab_strides1, + "c_strides1": c_strides1, + "ab_strides2": ab_strides2, + "c_strides2": c_strides2, + # cuda graph params + "cutlass_graph": cutlass_graph, + "triton_graph": triton_graph, + # Gen params + "a": a, + "topk_weights": topk_weights, + "topk_ids": topk_ids, + "num_runs": num_runs, + # Kernels + "run_triton_moe": run_triton_moe, + "run_cutlass_moe": run_cutlass_moe, + "replay_graph": replay_graph, + } + + # Warmup + run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, + 
w1_scale, w2_scale, a_scale, num_warmup) + + results.append( + benchmark.Timer( + stmt= + "run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="triton_moe", + ).blocked_autorange(min_run_time=min_run_time)) + + # Warmup + replay_graph(triton_graph, num_warmup) + + results.append( + benchmark.Timer( + stmt="replay_graph(triton_graph, num_runs)", + globals=globals, + label=label, + sub_label=sub_label, + description="triton_moe_cuda_graphs", + ).blocked_autorange(min_run_time=min_run_time)) + + # Warmup + run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, + topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, + num_warmup) + + results.append( + benchmark.Timer( + stmt= + "run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="grouped_gemm_moe", + ).blocked_autorange(min_run_time=min_run_time)) + + # Warmup + replay_graph(cutlass_graph, num_warmup) + + results.append( + benchmark.Timer( + stmt="replay_graph(cutlass_graph, num_runs)", + globals=globals, + label=label, + sub_label=sub_label, + description="grouped_gemm_moe_cuda_graphs", + ).blocked_autorange(min_run_time=min_run_time)) + + +def main(args): + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + + results: list[benchmark.Measurement] = [] + + for model in args.models: + for tp in args.tp_sizes: + for layer in WEIGHT_SHAPES_MOE[model]: + num_experts = layer[0] + topk = layer[1] + size_k = layer[2] + size_n = layer[3] // tp + + if len(args.limit_k) > 0 and size_k not in args.limit_k: + continue + + if len(args.limit_n) > 0 and size_n not in args.limit_n: + continue + + for per_act_token in PER_ACT_TOKEN_OPTS: + for 
per_out_ch in PER_OUT_CH_OPTS: + for size_m in DEFAULT_BATCH_SIZES: + mkn = (size_m, size_k, size_n) + bench_run(results, model, num_experts, topk, + per_act_token, per_out_ch, mkn) + + compare = benchmark.Compare(results) + compare.print() + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark Marlin across specified models/shapes/batches") + parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES_MOE.keys(), + ) + parser.add_argument("--tp-sizes", + nargs="+", + type=int, + default=DEFAULT_TP_SIZES) + parser.add_argument("--batch-sizes", + nargs="+", + type=int, + default=DEFAULT_BATCH_SIZES) + parser.add_argument("--limit-k", nargs="+", type=int, default=[]) + parser.add_argument("--limit-n", nargs="+", type=int, default=[]) + parser.add_argument("--limit-num-groups", nargs="+", type=int, default=[]) + parser.add_argument("--limit-per-act-token", + nargs="+", + type=int, + default=[]) + parser.add_argument("--limit-per-out-ch", nargs="+", type=int, default=[]) + + args = parser.parse_args() + main(args) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 491f8c3962f73..afe0b53077a70 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -30,19 +30,18 @@ class BenchmarkConfig(TypedDict): num_stages: int -def benchmark_config( - config: BenchmarkConfig, - num_tokens: int, - num_experts: int, - shard_intermediate_size: int, - hidden_size: int, - topk: int, - dtype: torch.dtype, - use_fp8_w8a8: bool, - use_int8_w8a16: bool, - num_iters: int = 100, - block_quant_shape: List[int] = None, -) -> float: +def benchmark_config(config: BenchmarkConfig, + num_tokens: int, + num_experts: int, + shard_intermediate_size: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + num_iters: int = 100, + block_quant_shape: List[int] = None, + use_deep_gemm: bool = 
False) -> float: init_dtype = torch.float16 if use_fp8_w8a8 else dtype x = torch.randn(num_tokens, hidden_size, dtype=dtype) if use_int8_w8a16: @@ -115,22 +114,41 @@ def benchmark_config( def run(): from vllm.model_executor.layers.fused_moe import override_config with override_config(config): - fused_moe( - x, - w1, - w2, - input_gating, - topk, - renormalize=True, - inplace=True, - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a16=use_int8_w8a16, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, - block_shape=block_quant_shape, - ) + if use_deep_gemm: + topk_weights, topk_ids = fused_topk(x, input_gating, topk, + False) + return fused_experts( + x, + w1, + w2, + topk_weights, + topk_ids, + inplace=True, + use_fp8_w8a8=use_fp8_w8a8, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_quant_shape, + allow_deep_gemm=True, + ) + else: + fused_moe( + x, + w1, + w2, + input_gating, + topk, + renormalize=True, + inplace=True, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_quant_shape, + ) # JIT compilation & warmup run() @@ -366,6 +384,7 @@ class BenchmarkWorker: use_fp8_w8a8: bool, use_int8_w8a16: bool, block_quant_shape: List[int] = None, + use_deep_gemm: bool = False, ) -> tuple[dict[str, int], float]: current_platform.seed_everything(self.seed) dtype_str = get_config_dtype_str(dtype, @@ -396,7 +415,8 @@ class BenchmarkWorker: use_fp8_w8a8, use_int8_w8a16, num_iters=100, - block_quant_shape=block_quant_shape) + block_quant_shape=block_quant_shape, + use_deep_gemm=use_deep_gemm) return config, kernel_time def tune( @@ -411,6 +431,7 @@ class BenchmarkWorker: use_int8_w8a16: bool, search_space: list[dict[str, int]], block_quant_shape: list[int], + use_deep_gemm: bool, ) -> dict[str, int]: best_config = None best_time = float("inf") @@ -436,7 +457,8 @@ class BenchmarkWorker: 
use_fp8_w8a8, use_int8_w8a16, num_iters=20, - block_quant_shape=block_quant_shape) + block_quant_shape=block_quant_shape, + use_deep_gemm=use_deep_gemm) except triton.runtime.autotuner.OutOfResources: # Some configurations may be invalid and fail to compile. continue @@ -531,6 +553,9 @@ def main(args: argparse.Namespace): intermediate_size = config.moe_intermediate_size shard_intermediate_size = 2 * intermediate_size // args.tp_size else: + if not hasattr(config, "hidden_size"): + # Support for llama4 + config = config.text_config # Default: Mixtral. E = config.num_local_experts topk = config.num_experts_per_tok @@ -550,6 +575,8 @@ def main(args: argparse.Namespace): else: batch_sizes = [args.batch_size] + use_deep_gemm = bool(args.use_deep_gemm) + ray.init() num_gpus = int(ray.available_resources()["GPU"]) workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)] @@ -572,10 +599,10 @@ def main(args: argparse.Namespace): start = time.time() configs = _distribute( - "tune", - [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype, - use_fp8_w8a8, use_int8_w8a16, search_space, block_quant_shape) - for batch_size in batch_sizes]) + "tune", [(batch_size, E, shard_intermediate_size, hidden_size, + topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space, + block_quant_shape, use_deep_gemm) + for batch_size in batch_sizes]) best_configs = { M: sort_config(config) for M, config in zip(batch_sizes, configs) @@ -589,7 +616,7 @@ def main(args: argparse.Namespace): outputs = _distribute( "benchmark", [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype, - use_fp8_w8a8, use_int8_w8a16, block_quant_shape) + use_fp8_w8a8, use_int8_w8a16, block_quant_shape, use_deep_gemm) for batch_size in batch_sizes]) for batch_size, (config, kernel_time) in zip(batch_sizes, outputs): @@ -611,6 +638,7 @@ if __name__ == "__main__": type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto") + parser.add_argument("--use-deep-gemm", 
action="store_true") parser.add_argument("--seed", type=int, default=0) parser.add_argument("--batch-size", type=int, required=False) parser.add_argument("--tune", action="store_true") diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 48b351bc48141..2625239b08ef2 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -7,10 +7,13 @@ from typing import Optional import torch from vllm import _custom_ops as ops +from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, create_kv_caches_with_random) +logger = init_logger(__name__) + NUM_BLOCKS = 128 * 1024 PARTITION_SIZE = 512 PARTITION_SIZE_ROCM = 256 @@ -193,6 +196,9 @@ def main( if __name__ == '__main__': + logger.warning("This script benchmarks the paged attention kernel. " + "By default this is no longer used in vLLM inference.") + parser = FlexibleArgumentParser( description="Benchmark the paged attention kernel.") parser.add_argument("--version", diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py index c375e61e41873..70190ba24d9df 100644 --- a/benchmarks/kernels/benchmark_shapes.py +++ b/benchmarks/kernels/benchmark_shapes.py @@ -75,3 +75,19 @@ WEIGHT_SHAPES = { [7168, 8192], ], } + +WEIGHT_SHAPES_MOE = { + "nm-testing/Mixtral-8x7B-Instruct-v0.1": [ + [8, 2, 4096, 28672], + [8, 2, 14336, 4096], + ], + "nm-testing/deepseekv2-lite": [ + [64, 6, 2048, 1408], + ], + "ibm-granite/granite-3.0-1b-a400m": [ + [32, 8, 1024, 1024], + ], + "ibm-granite/granite-3.0-3b-a800m": [ + [40, 8, 1024, 1536], + ], +} diff --git a/benchmarks/launch_tgi_server.sh b/benchmarks/launch_tgi_server.sh deleted file mode 100755 index ba7383d88dc49..0000000000000 --- a/benchmarks/launch_tgi_server.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -PORT=8000 -MODEL=$1 -TOKENS=$2 - 
-docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \ - -v "$PWD/data:/data" \ - ghcr.io/huggingface/text-generation-inference:2.2.0 \ - --model-id "$MODEL" \ - --sharded false \ - --max-input-length 1024 \ - --max-total-tokens 2048 \ - --max-best-of 5 \ - --max-concurrent-requests 5000 \ - --max-batch-total-tokens "$TOKENS" diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index b57d9e2263109..00670bd398b5d 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -33,8 +33,6 @@ endif() if(MACOSX_FOUND) list(APPEND CXX_COMPILE_FLAGS - "-Xpreprocessor" - "-fopenmp" "-DVLLM_CPU_EXTENSION") else() list(APPEND CXX_COMPILE_FLAGS @@ -197,6 +195,7 @@ set(VLLM_EXT_SRC if (AVX512_FOUND AND NOT AVX512_DISABLED) set(VLLM_EXT_SRC "csrc/cpu/quant.cpp" + "csrc/cpu/shm.cpp" ${VLLM_EXT_SRC}) endif() diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index afd7c47e8ac00..110ef266c6653 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG dc9d410b3e2d6534a4c70724c2515f4def670a22 + GIT_TAG 0a721daebe4fa7149f06ecf3d3eabeb6dcd0f1fa GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/csrc/attention/merge_attn_states.cu b/csrc/attention/merge_attn_states.cu new file mode 100644 index 0000000000000..14e5edd7e283d --- /dev/null +++ b/csrc/attention/merge_attn_states.cu @@ -0,0 +1,178 @@ +#include +#include +#include +#include +#include + +#include "attention_dtypes.h" +#include "attention_utils.cuh" + +namespace vllm { + +// Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005 +// can be used to combine partial attention results (in the split-KV case) +template +__global__ void 
merge_attn_states_kernel( + scalar_t* output, float* output_lse, const scalar_t* prefix_output, + const float* prefix_lse, const scalar_t* suffix_output, + const float* suffix_lse, const uint num_tokens, const uint num_heads, + const uint head_size) { + using pack_128b_t = uint4; + const uint pack_size = 16 / sizeof(scalar_t); + const uint threads_per_head = head_size / pack_size; + + const uint global_idx = blockIdx.x * NUM_THREADS + threadIdx.x; + const uint token_head_threads = num_tokens * num_heads * threads_per_head; + + if (global_idx >= token_head_threads) return; + + // global_idx -> token_idx + head_idx + pack_idx + const uint token_head_idx = global_idx / threads_per_head; + const uint pack_idx = global_idx % threads_per_head; + + const uint token_idx = token_head_idx / num_heads; + const uint head_idx = token_head_idx % num_heads; + + const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc. + const uint head_offset = + token_idx * num_heads * head_size + head_idx * head_size; + const scalar_t* prefix_head_ptr = prefix_output + head_offset; + const scalar_t* suffix_head_ptr = suffix_output + head_offset; + scalar_t* output_head_ptr = output + head_offset; + + float p_lse = prefix_lse[head_idx * num_tokens + token_idx]; + float s_lse = suffix_lse[head_idx * num_tokens + token_idx]; + p_lse = std::isinf(p_lse) ? -std::numeric_limits::infinity() : p_lse; + s_lse = std::isinf(s_lse) ? 
-std::numeric_limits::infinity() : s_lse; + + const float max_lse = fmaxf(p_lse, s_lse); + p_lse = p_lse - max_lse; + s_lse = s_lse - max_lse; + const float p_se = expf(p_lse); + const float s_se = expf(s_lse); + const float out_se = p_se + s_se; + const float p_scale = p_se / out_se; + const float s_scale = s_se / out_se; + + if (pack_offset < head_size) { + // Pack 128b load + pack_128b_t p_out_pack = reinterpret_cast( + prefix_head_ptr)[pack_offset / pack_size]; + pack_128b_t s_out_pack = reinterpret_cast( + suffix_head_ptr)[pack_offset / pack_size]; + pack_128b_t o_out_pack; + +#pragma unroll + for (uint i = 0; i < pack_size; ++i) { + // Always use float for FMA to keep high precision. + // half(uint16_t), bfloat16, float -> float. + const float p_out_f = + vllm::to_float(reinterpret_cast(&p_out_pack)[i]); + const float s_out_f = + vllm::to_float(reinterpret_cast(&s_out_pack)[i]); + // fma: a * b + c = p_out_f * p_scale + (s_out_f * s_scale) + const float o_out_f = p_out_f * p_scale + (s_out_f * s_scale); + // float -> half(uint16_t), bfloat16, float. + vllm::from_float(reinterpret_cast(&o_out_pack)[i], o_out_f); + } + + // Pack 128b storage + reinterpret_cast(output_head_ptr)[pack_offset / pack_size] = + o_out_pack; + } + // We only need to write to output_lse once per head. + if (output_lse != nullptr && pack_idx == 0) { + float out_lse = logf(out_se) + max_lse; + output_lse[head_idx * num_tokens + token_idx] = out_lse; + } +} + +} // namespace vllm + +// The following macro is used to dispatch the conversion function based on +// the output data type. The FN is a macro that calls a function with +// template. 
+#define DISPATCH_BY_SCALAR_DTYPE(scalar_dtype, fn) \ + { \ + if (scalar_dtype == at::ScalarType::Float) { \ + fn(float); \ + } else if (scalar_dtype == at::ScalarType::Half) { \ + fn(uint16_t); \ + } else if (scalar_dtype == at::ScalarType::BFloat16) { \ + fn(__nv_bfloat16); \ + } else { \ + TORCH_CHECK(false, "Unsupported data type of O: ", scalar_dtype); \ + } \ + } + +#define LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS) \ + { \ + vllm::merge_attn_states_kernel \ + <<>>( \ + reinterpret_cast(output.data_ptr()), output_lse_ptr, \ + reinterpret_cast(prefix_output.data_ptr()), \ + reinterpret_cast(prefix_lse.data_ptr()), \ + reinterpret_cast(suffix_output.data_ptr()), \ + reinterpret_cast(suffix_lse.data_ptr()), num_tokens, \ + num_heads, head_size); \ + } + +/*@brief Merges the attention states from prefix and suffix + * into the output tensor. NUM_TOKENS: n, NUM_HEADS: h, HEAD_SIZE: d + * + * @param output [n,h,d] The output tensor to store the merged attention states. + * @param output_lse [h,d] Optional tensor to store the log-sum-exp values. + * @param prefix_output [n,h,d] The prefix attention states. + * @param prefix_lse [h,n] The log-sum-exp values for the prefix attention + * states. + * @param suffix_output [n,h,d] The suffix attention states. + * @param suffix_lse [h,n] The log-sum-exp values for the suffix attention + * states. 
+ */ +template +void merge_attn_states_launcher(torch::Tensor& output, + std::optional output_lse, + const torch::Tensor& prefix_output, + const torch::Tensor& prefix_lse, + const torch::Tensor& suffix_output, + const torch::Tensor& suffix_lse) { + constexpr uint NUM_THREADS = 128; + const uint num_tokens = output.size(0); + const uint num_heads = output.size(1); + const uint head_size = output.size(2); + const uint pack_size = 16 / sizeof(scalar_t); + TORCH_CHECK(head_size % pack_size == 0, + "headsize must be multiple of pack_size:", pack_size); + float* output_lse_ptr = nullptr; + if (output_lse.has_value()) { + output_lse_ptr = output_lse.value().data_ptr(); + } + // Process one pack elements per thread. for float, the + // pack_size is 4 for half/bf16, the pack_size is 8. + const uint threads_per_head = head_size / pack_size; + const uint total_threads = num_tokens * num_heads * threads_per_head; + + dim3 block(NUM_THREADS); + dim3 grid((total_threads + NUM_THREADS - 1) / NUM_THREADS); + + const c10::cuda::OptionalCUDAGuard device_guard(prefix_output.device()); + auto stream = at::cuda::getCurrentCUDAStream(); + + LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS); +} + +#define CALL_MERGE_ATTN_STATES_LAUNCHER(scalar_t) \ + { \ + merge_attn_states_launcher(output, output_lse, prefix_output, \ + prefix_lse, suffix_output, \ + suffix_lse); \ + } + +void merge_attn_states(torch::Tensor& output, + std::optional output_lse, + const torch::Tensor& prefix_output, + const torch::Tensor& prefix_lse, + const torch::Tensor& suffix_output, + const torch::Tensor& suffix_lse) { + DISPATCH_BY_SCALAR_DTYPE(output.dtype(), CALL_MERGE_ATTN_STATES_LAUNCHER); +} diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index 4568699b30773..cf67847b45ba0 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -78,9 +78,14 @@ struct FP16Vec16 : public Vec { __m256i reg; + // normal load explicit FP16Vec16(const void* ptr) : 
reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} + // non-temporal load + explicit FP16Vec16(bool, void* ptr) + : reg(_mm256_stream_load_si256((__m256i*)ptr)) {} + explicit FP16Vec16(const FP32Vec16&); void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } @@ -110,9 +115,14 @@ struct BF16Vec16 : public Vec { __m256i reg; + // normal load explicit BF16Vec16(const void* ptr) : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} + // non-temporal load + explicit BF16Vec16(bool, void* ptr) + : reg(_mm256_stream_load_si256((__m256i*)ptr)) {} + explicit BF16Vec16(const FP32Vec16&); void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } @@ -313,8 +323,13 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {} + // normal load explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {} + // non-temporal load + explicit FP32Vec16(bool, void* ptr) + : reg((__m512)_mm512_stream_load_si512(ptr)) {} + explicit FP32Vec16(__m512 data) : reg(data) {} explicit FP32Vec16(const FP32Vec4& data) @@ -547,6 +562,33 @@ struct INT8Vec16 : public Vec { _mm_mask_storeu_epi8(ptr, mask, reg); } }; + +struct INT8Vec64 : public Vec { + constexpr static int VEC_ELEM_NUM = 64; + union AliasReg { + __m512i reg; + int8_t values[VEC_ELEM_NUM]; + }; + + __m512i reg; + + // normal load + explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {} + + // non-temporal load + explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {} + + void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); } + + void save(int8_t* ptr, const int elem_num) const { + constexpr uint64_t M = 0xFFFFFFFFFFFFFFFF; + __mmask64 mask = _cvtu64_mask64(M >> (64 - elem_num)); + _mm512_mask_storeu_epi8(ptr, mask, reg); + } + + // non-temporal save + void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); } +}; #endif template @@ -657,6 +699,22 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { inline void prefetch(const void*
addr) { _mm_prefetch(addr, _MM_HINT_T1); } +#ifdef __AVX512F__ +inline void non_temporal_save(FP16Vec16& vec, void* ptr) { + _mm256_stream_si256((__m256i*)ptr, vec.reg); +} +inline void non_temporal_save(BF16Vec32& vec, void* ptr) { + _mm512_stream_si512((__m512i*)ptr, vec.reg); +} +inline void non_temporal_save(BF16Vec16& vec, void* ptr) { + _mm256_stream_si256((__m256i*)ptr, vec.reg); +} +inline void non_temporal_save(FP32Vec16& vec, void* ptr) { + _mm512_stream_ps((float*)ptr, vec.reg); +} +#endif + +inline void mem_barrier() { _mm_mfence(); } }; // namespace vec_op #endif diff --git a/csrc/cpu/shm.cpp b/csrc/cpu/shm.cpp new file mode 100644 index 0000000000000..f55e96de251d0 --- /dev/null +++ b/csrc/cpu/shm.cpp @@ -0,0 +1,781 @@ +#include "cpu/cpu_types.hpp" + +#include +#include +#include +#include + +namespace { +#define MAX_SHM_RANK_NUM 8 +#define MAX_THREAD_NUM 12 +#define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024) +#define MIN_THREAD_PROCESS_SIZE (8 * 1024) +#define MAX_P2P_SEND_TENSOR_NUM 8 + +template +struct KernelVecType { + using scalar_vec_t = void; +}; + +template <> +struct KernelVecType { + using scalar_vec_t = vec_op::FP32Vec16; +}; + +template <> +struct KernelVecType { + using scalar_vec_t = vec_op::BF16Vec16; +}; + +template <> +struct KernelVecType { + using scalar_vec_t = vec_op::FP16Vec16; +}; + +enum class ThreadSHMStat : char { THREAD_READY = 0, SHM_DATA_READY, DONE }; + +struct ThreadSHMContext { + volatile ThreadSHMStat thread_stats[MAX_SHM_RANK_NUM]; + int thread_id; + int thread_num; + int rank; + int group_size; + size_t _spinning_count; + int swizzled_ranks[MAX_SHM_RANK_NUM]; + void* thread_shm_ptrs[MAX_SHM_RANK_NUM]; + ThreadSHMContext* shm_contexts[MAX_SHM_RANK_NUM]; + + ThreadSHMContext(const int thread_id, const int thread_num, const int rank, + const int group_size, void* thread_shm_ptr) + : thread_id(thread_id), + thread_num(thread_num), + rank(rank), + group_size(group_size), + _spinning_count(0) { + 
static_assert(sizeof(ThreadSHMContext) % 64 == 0); + TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM); + TORCH_CHECK((size_t)this % 64 == 0); + TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0); + for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) { + shm_contexts[i] = nullptr; + thread_shm_ptrs[i] = nullptr; + swizzled_ranks[i] = (i + rank) % group_size; + thread_stats[i] = ThreadSHMStat::DONE; + } + set_context(rank, this, thread_shm_ptr); + } + + void set_context(int rank, ThreadSHMContext* ptr, void* thread_shm_ptr) { + TORCH_CHECK(rank < MAX_SHM_RANK_NUM); + TORCH_CHECK(ptr); + TORCH_CHECK(thread_shm_ptr); + TORCH_CHECK_EQ(ptr->thread_num, thread_num); + TORCH_CHECK_EQ(ptr->thread_id, thread_id); + shm_contexts[rank] = ptr; + thread_shm_ptrs[rank] = thread_shm_ptr; + } + + template + T* get_thread_shm_ptr(int rank) { + return reinterpret_cast(thread_shm_ptrs[rank]); + } + + int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; } + + void wait_for_all(ThreadSHMStat prev_stat) { + for (int idx = 0; idx < group_size; ++idx) { + int rank = get_swizzled_rank(idx); + while (thread_stats[rank] == prev_stat) { + ++_spinning_count; + _mm_pause(); + } + } + vec_op::mem_barrier(); + } + + void wait_for_one(int rank, ThreadSHMStat prev_stat) { + while (thread_stats[rank] == prev_stat) { + ++_spinning_count; + _mm_pause(); + } + vec_op::mem_barrier(); + } + + void set_thread_stat(ThreadSHMStat stat) { + for (int idx = 0; idx < group_size; ++idx) { + int rank = get_swizzled_rank(idx); + shm_contexts[rank]->thread_stats[this->rank] = stat; + } + } + + void set_thread_stat(int target_rank, ThreadSHMStat stat) { + for (int idx = 0; idx < group_size; ++idx) { + int rank = get_swizzled_rank(idx); + shm_contexts[rank]->thread_stats[target_rank] = stat; + } + } + + // barrier for all ranks in the group, used for all2all ops + // DONE -> THREAD_READY -> SHM_DATA_READY -> DONE -> ... 
+ void barrier(ThreadSHMStat next_stat) { + if (next_stat == ThreadSHMStat::THREAD_READY) { + set_thread_stat(ThreadSHMStat::THREAD_READY); + wait_for_all(ThreadSHMStat::DONE); + } else if (next_stat == ThreadSHMStat::SHM_DATA_READY) { + set_thread_stat(ThreadSHMStat::SHM_DATA_READY); + wait_for_all(ThreadSHMStat::THREAD_READY); + } else if (next_stat == ThreadSHMStat::DONE) { + set_thread_stat(ThreadSHMStat::DONE); + wait_for_all(ThreadSHMStat::SHM_DATA_READY); + } else { + TORCH_CHECK(false, "Invalid next_stat to barrier."); + } + } + + std::string to_string() const { + std::stringstream ss; + ss << "SHMContext:"; + ss << "\nrank: " << rank; + ss << "\ngroup_size: " << group_size; + ss << "\nthread_num: " << thread_num; + ss << "\nthread_id: " << thread_id; + + ss << "\nshm_ctx_stat_loop_seq: ["; + for (int i = 0; i < group_size; ++i) { + ss << swizzled_ranks[i] << ", "; + } + ss << "]"; + + ss << "\nshm_contexts: ["; + for (int i = 0; i < group_size; ++i) { + if (shm_contexts[i]) { + ss << shm_contexts[i]->rank << ", "; + } + } + ss << "]"; + + return ss.str(); + } +}; + +class SHMManager { + public: + explicit SHMManager(const std::string& name, const int rank, + const int group_size) + : _rank(rank), + _group_size(group_size), + _thread_num(std::min(torch::get_num_threads(), MAX_THREAD_NUM)), + _shm_names({""}), + _shared_mem_ptrs({nullptr}), + _shm_ctx(nullptr) { + _shm_names[rank] = get_shm_name(name, rank); + _shared_mem_ptrs[rank] = init_shm(rank); + _shm_ctx = reinterpret_cast(_shared_mem_ptrs[rank]); + + for (int i = 0; i < _thread_num; ++i) { + ThreadSHMContext* ctx = new (_shm_ctx + i) + ThreadSHMContext(i, _thread_num, _rank, _group_size, + compute_thread_shm_ptr(_shm_ctx, i)); + } + } + + void join(const std::string& name) { + for (int rank_idx = 0; rank_idx < _group_size; ++rank_idx) { + if (rank_idx != _rank) { + TORCH_CHECK(_shm_names[rank_idx].empty()); + TORCH_CHECK(_shared_mem_ptrs[rank_idx] == nullptr); + _shm_names[rank_idx] = 
get_shm_name(name, rank_idx); + _shared_mem_ptrs[rank_idx] = init_shm(rank_idx); + ThreadSHMContext* target_ctx = + reinterpret_cast(_shared_mem_ptrs[rank_idx]); + for (int thread_idx = 0; thread_idx < _thread_num; ++thread_idx) { + _shm_ctx[thread_idx].set_context( + rank_idx, target_ctx + thread_idx, + compute_thread_shm_ptr(target_ctx, thread_idx)); + } + } + } + } + + ~SHMManager() { destroy_shm(); } + + ThreadSHMContext* get_shm_ctx() const { return _shm_ctx; } + + static std::string get_shm_name(const std::string& name, int rank) { + return name + "_" + std::to_string(rank); + } + + static int64_t create_singleton_instance(const std::string& name, + const int group_size, + const int rank) { + std::lock_guard guard(SingletonInstancesLock); + SingletonInstances.emplace_back( + std::make_unique(name, rank, group_size)); + return static_cast(SingletonInstances.size() - 1); + } + + static SHMManager* get_singleton_instance(int64_t handle) { + return SingletonInstances[handle].get(); + } + + protected: + static std::vector> SingletonInstances; + static std::mutex SingletonInstancesLock; + + private: + static size_t round_to_alignment(size_t num) { + return ((num + 63) / 64) * 64; + } + + int8_t* compute_thread_shm_ptr(ThreadSHMContext* ctx, int thread_id) { + int8_t* thread_shm_ptr = + reinterpret_cast(ctx) + + round_to_alignment(_thread_num * sizeof(ThreadSHMContext)); + return thread_shm_ptr + + thread_id * round_to_alignment(PER_THREAD_SHM_BUFFER_BYTES); + } + + size_t compute_shm_size() { + const size_t rounded_rank_buffer_size = + round_to_alignment(PER_THREAD_SHM_BUFFER_BYTES) * _thread_num; + const size_t rounded_thread_shm_ctx_size = + round_to_alignment(_thread_num * sizeof(ThreadSHMContext)); + const size_t shm_size = + rounded_thread_shm_ctx_size + rounded_rank_buffer_size; + return shm_size; + } + + void* init_shm(int target_rank) { + const std::string& shm_name = _shm_names[target_rank]; + const int local_rank = _rank; + const size_t shm_size = 
compute_shm_size(); + + int fd = -1; + if (local_rank == target_rank) { + fd = shm_open(shm_name.c_str(), O_CREAT | O_EXCL | O_RDWR, + S_IRUSR | S_IWUSR); + + if (fd == -1) + TORCH_CHECK(false, "create shm in SHMManager failed. errno: " + + std::to_string(errno)); + + if (ftruncate(fd, shm_size) == -1) + TORCH_CHECK(false, "ftruncate in SHMManager failed. errno: " + + std::to_string(errno)); + } else { + fd = shm_open(shm_name.c_str(), O_RDWR, S_IRUSR | S_IWUSR); + + if (fd == -1) + TORCH_CHECK(false, "open shm in SHMManager failed. errno: " + + std::to_string(errno)); + } + + void* shm_ptr = mmap(nullptr, shm_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + + if (shm_ptr == MAP_FAILED) { + TORCH_CHECK(false, + "mmap in SHMManager failed. errno: " + std::to_string(errno)); + } + + if (close(fd) != 0) { + TORCH_CHECK( + false, "close in SHMManager failed. errno: " + std::to_string(errno)); + } + + TORCH_CHECK((size_t)shm_ptr % 64 == 0); + + return shm_ptr; + } + + void destroy_shm() { + std::stringstream ss; + ss << "local rank " << _rank << ": ["; + for (int thread_id = 0; thread_id < _thread_num; ++thread_id) { + ss << _shm_ctx[thread_id]._spinning_count << ", "; + } + ss << "]\n"; + + for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) { + if (_shared_mem_ptrs[i] != nullptr) { + munmap(_shared_mem_ptrs[i], compute_shm_size()); + } + + if (!_shm_names[i].empty()) { + shm_unlink(_shm_names[i].c_str()); + } + } + } + + int _rank; + int _group_size; + int _thread_num; + std::array _shm_names; + std::array _shared_mem_ptrs; + ThreadSHMContext* _shm_ctx; +}; + +namespace shm_cc_ops { +template +void shm_cc_loop(ThreadSHMContext* ctx, int64_t elem_num, F&& inner_func) { + int thread_num = ctx->thread_num; + int64_t total_bytes = elem_num * sizeof(scalar_t); + int64_t total_units_num = + (total_bytes + MIN_THREAD_PROCESS_SIZE - 1) / MIN_THREAD_PROCESS_SIZE; + int64_t per_thread_units_num = + (total_units_num + thread_num - 1) / thread_num; + int64_t 
per_unit_elem_num = MIN_THREAD_PROCESS_SIZE / sizeof(scalar_t); + int64_t max_per_thread_iteration_elem_num = + PER_THREAD_SHM_BUFFER_BYTES / sizeof(scalar_t); + int64_t per_thread_elem_num = per_unit_elem_num * per_thread_units_num; + +#pragma omp parallel for schedule(static, 1) + for (int i = 0; i < thread_num; ++i) { + int64_t offset = i * per_thread_elem_num; + int64_t end = std::min(elem_num, offset + per_thread_elem_num); + int64_t curr_elem_num = + std::min(max_per_thread_iteration_elem_num, end - offset); + ThreadSHMContext* thread_ctx = ctx + i; + + while (curr_elem_num > 0) { + inner_func(thread_ctx, offset, curr_elem_num); + + offset += max_per_thread_iteration_elem_num; + curr_elem_num = std::min(max_per_thread_iteration_elem_num, end - offset); + } + } +} +}; // namespace shm_cc_ops + +namespace shm_cc_ops { + +void memcpy_from_shm(void* dst, void* src, const int64_t bytes) { + const int64_t aligned_bytes = ((bytes >> 6) << 6); // 64 bytes aligned + int64_t i = 0; +#pragma GCC unroll 4 + for (; i < aligned_bytes; i += 64) { + vec_op::INT8Vec64 data( + true, (int8_t*)src + i); // stream loading shm to avoid caching + data.save((int8_t*)dst + i); + } + if (aligned_bytes < bytes) { + vec_op::INT8Vec64 data(true, (int8_t*)src + aligned_bytes); + data.save((int8_t*)dst + aligned_bytes, bytes - aligned_bytes); + } +} + +void memcpy_to_shm(void* dst, void* src, const int64_t bytes) { +#pragma GCC unroll 4 + for (int64_t i = 0; i < bytes; i += 64) { + vec_op::INT8Vec64 data((int8_t*)src + i); + data.nt_save((int8_t*)dst + i); + } +} + +void memcpy(void* dst, void* src, const int64_t bytes) { + const int64_t aligned_bytes = ((bytes >> 6) << 6); // 64 bytes aligned + int64_t i = 0; +#pragma GCC unroll 4 + for (; i < aligned_bytes; i += 64) { + vec_op::INT8Vec64 data((int8_t*)src + i); + data.save((int8_t*)dst + i); + } + if (aligned_bytes < bytes) { + vec_op::INT8Vec64 data((int8_t*)src + aligned_bytes); + data.save((int8_t*)dst + aligned_bytes, bytes - 
aligned_bytes); + } +} + +template +void all_reduce_sum_impl(ThreadSHMContext* ctx, scalar_t* data, + size_t elem_num) { + CPU_KERNEL_GUARD_IN(all_reduce_sum_impl) + using vec_t = typename KernelVecType::scalar_vec_t; + constexpr int64_t vec_elem_num = vec_t::get_elem_num(); + const int worldsize = ctx->group_size; + + shm_cc_ops::shm_cc_loop( + ctx, elem_num, + [&](ThreadSHMContext* thread_ctx, int64_t data_offset, + int64_t data_elem_num) { + int rank = thread_ctx->rank; + scalar_t* thread_shm_ptr = + thread_ctx->get_thread_shm_ptr(rank); + scalar_t* thread_data_ptr = data + data_offset; + int64_t thread_data_elem_num = data_elem_num * sizeof(scalar_t); + + scalar_t* remote_data_ptrs[RANKS - 1]; + vec_op::unroll_loop([&](int idx) { + remote_data_ptrs[idx] = thread_ctx->get_thread_shm_ptr( + thread_ctx->get_swizzled_rank(idx + 1)); + }); + + thread_ctx->barrier(ThreadSHMStat::THREAD_READY); + + shm_cc_ops::memcpy_to_shm(thread_shm_ptr, thread_data_ptr, + thread_data_elem_num); + + thread_ctx->barrier(ThreadSHMStat::SHM_DATA_READY); + + int64_t aligned_data_elem_num = + (data_elem_num / vec_elem_num) * vec_elem_num; + int64_t i = 0; +#pragma GCC unroll 4 + for (; i < aligned_data_elem_num; i += vec_elem_num) { + vec_t local_data(thread_data_ptr + i); // load from cache + vec_op::FP32Vec16 local_data_fp32(local_data); + vec_op::unroll_loop([&](int idx) { + vec_t remote_data( + true, remote_data_ptrs[idx] + i); // stream load from shm + vec_op::FP32Vec16 remote_data_fp32(remote_data); + local_data_fp32 = local_data_fp32 + remote_data_fp32; // sum reduce + }); + vec_t reduced_data(local_data_fp32); + reduced_data.save(thread_data_ptr + i); + } + + if (i < data_elem_num) { + vec_t local_data(thread_data_ptr + i); // load from cache + vec_op::FP32Vec16 local_data_fp32(local_data); + vec_op::unroll_loop([&](int idx) { + vec_t remote_data( + true, remote_data_ptrs[idx] + i); // stream load from shm + vec_op::FP32Vec16 remote_data_fp32(remote_data); + local_data_fp32 = 
local_data_fp32 + remote_data_fp32; // sum reduce + }); + vec_t reduced_data(local_data_fp32); + reduced_data.save(thread_data_ptr + i, + data_elem_num - aligned_data_elem_num); + } + + thread_ctx->barrier(ThreadSHMStat::DONE); + }); + + return; +} +}; // namespace shm_cc_ops + +std::vector> SHMManager::SingletonInstances = {}; +std::mutex SHMManager::SingletonInstancesLock = {}; + +template +void shm_allreduce_sum(ThreadSHMContext* ctx, scalar_t* data, size_t elem_num) { + switch (ctx->group_size) { + case 2: + shm_cc_ops::all_reduce_sum_impl(ctx, data, elem_num); + break; + case 3: + shm_cc_ops::all_reduce_sum_impl(ctx, data, elem_num); + break; + case 4: + shm_cc_ops::all_reduce_sum_impl(ctx, data, elem_num); + break; + case 8: + shm_cc_ops::all_reduce_sum_impl(ctx, data, elem_num); + break; + default: + TORCH_CHECK(false, + "Invalid world size: " + std::to_string(ctx->group_size)); + } +} + +template +void shm_gather_impl(ThreadSHMContext* ctx, scalar_t* data, size_t elem_num, + scalar_t** outputs, const int dst) { + CPU_KERNEL_GUARD_IN(shm_gather_impl) + const int worldsize = ctx->group_size; + TORCH_CHECK_LT(dst, worldsize); + shm_cc_ops::shm_cc_loop( + ctx, elem_num, + [&](ThreadSHMContext* thread_ctx, int64_t data_offset, + int64_t data_elem_num) { + int rank = thread_ctx->rank; + scalar_t* thread_shm_ptr = + thread_ctx->get_thread_shm_ptr(rank); + + thread_ctx->barrier(ThreadSHMStat::THREAD_READY); + + shm_cc_ops::memcpy_to_shm(thread_shm_ptr, data + data_offset, + data_elem_num * sizeof(scalar_t)); + + thread_ctx->barrier(ThreadSHMStat::SHM_DATA_READY); + + if (rank == dst) { + shm_cc_ops::memcpy(outputs[rank] + data_offset, data + data_offset, + data_elem_num * sizeof(scalar_t)); + for (int i = 1; i < worldsize; ++i) { + int src_rank = thread_ctx->get_swizzled_rank(i); + scalar_t* src_ptr = + thread_ctx->get_thread_shm_ptr(src_rank); // shm + scalar_t* dst_ptr = outputs[src_rank] + data_offset; + shm_cc_ops::memcpy_from_shm(dst_ptr, src_ptr, + 
data_elem_num * sizeof(scalar_t)); + } + } + + thread_ctx->barrier(ThreadSHMStat::DONE); + }); + + return; +} + +struct MemPiece { + void* ptr; + int64_t size; + + template + T* data_ptr() { + return reinterpret_cast(ptr); + } +}; + +struct TensorListMeta { + int64_t tensor_bytes[MAX_P2P_SEND_TENSOR_NUM]; + torch::ScalarType tensor_types[MAX_P2P_SEND_TENSOR_NUM]; + int64_t tensor_num; + int64_t total_bytes; + + TensorListMeta() : tensor_num(0), total_bytes(0) { + static_assert(sizeof(TensorListMeta) % 64 == 0); + static_assert(sizeof(TensorListMeta) < + MIN_THREAD_PROCESS_SIZE); // To ensure the metadata always + // hold by the thread 0 + for (int i = 0; i < MAX_P2P_SEND_TENSOR_NUM; ++i) { + tensor_bytes[i] = 0; + tensor_ptrs[i] = nullptr; + tensor_types[i] = torch::ScalarType::Undefined; + } + } + + // For send and recv + void bind_tensor_list(std::vector& tensor_list) { + TORCH_CHECK(tensor_types[0] == torch::ScalarType::Undefined, + "Re-bind TensorListMeta is not allowed.") + TORCH_CHECK_LE(tensor_list.size(), MAX_P2P_SEND_TENSOR_NUM); + tensor_num = tensor_list.size(); + int64_t bytes_sum = 0; + for (int i = 0; i < tensor_list.size(); ++i) { + torch::Tensor& t = tensor_list[i]; + TORCH_CHECK(t.is_contiguous()); + tensor_bytes[i] = t.nbytes(); + tensor_types[i] = t.scalar_type(); + tensor_ptrs[i] = t.data_ptr(); + bytes_sum += t.nbytes(); + } + total_bytes = bytes_sum; + } + + // For recv + std::vector generate_tensor_list() { + std::vector tensor_list; + tensor_list.reserve(tensor_num); + + for (int i = 0; i < tensor_num; ++i) { + int64_t bytes = tensor_bytes[i]; + auto type = tensor_types[i]; + int64_t elem_bytes = torch::elementSize(type); + + TORCH_CHECK_EQ(bytes % elem_bytes, 0); + int64_t elem_num = bytes / elem_bytes; + auto options = torch::TensorOptions().dtype(type).device(torch::kCPU); + tensor_list.emplace_back(torch::empty({elem_num}, options)); + } + return tensor_list; + } + + MemPiece get_data(int64_t offset) { + for (int i = 0; i < tensor_num; 
++i) { + if (offset < tensor_bytes[i]) { + return {reinterpret_cast(tensor_ptrs[i]) + offset, + tensor_bytes[i] - offset}; + } + offset -= tensor_bytes[i]; + } + return {nullptr, 0}; + } + + private: + void* tensor_ptrs[MAX_P2P_SEND_TENSOR_NUM]; + int8_t _padding[40]; +}; + +void shm_send_tensor_list_impl(ThreadSHMContext* ctx, + const std::vector& tensor_list) { + CPU_KERNEL_GUARD_IN(shm_send_tensor_list_impl) + std::vector tensor_list_with_metadata; + tensor_list_with_metadata.reserve(1 + tensor_list.size()); + + auto options = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU); + tensor_list_with_metadata.emplace_back( + torch::empty({sizeof(TensorListMeta)}, options)); + tensor_list_with_metadata.insert(tensor_list_with_metadata.end(), + tensor_list.begin(), tensor_list.end()); + + torch::Tensor& metadata_tensor = tensor_list_with_metadata[0]; + TORCH_CHECK_EQ(metadata_tensor.nbytes(), sizeof(TensorListMeta)); + + TensorListMeta* metadata = new (metadata_tensor.data_ptr()) TensorListMeta(); + metadata->bind_tensor_list(tensor_list_with_metadata); + + shm_cc_ops::shm_cc_loop( + ctx, metadata->total_bytes, + [&](ThreadSHMContext* thread_ctx, int64_t data_offset, + int64_t data_elem_num) { + int rank = thread_ctx->rank; + // Wait until the receiver set the stat to DONE + thread_ctx->wait_for_one(rank, ThreadSHMStat::SHM_DATA_READY); + + int64_t curr_shm_offset = 0; + while (curr_shm_offset < data_elem_num) { + MemPiece frag = metadata->get_data(data_offset + curr_shm_offset); + frag.size = std::min(frag.size, data_elem_num - curr_shm_offset); + shm_cc_ops::memcpy( + thread_ctx->get_thread_shm_ptr(rank) + curr_shm_offset, + frag.ptr, frag.size); + curr_shm_offset += frag.size; + } + + thread_ctx->set_thread_stat(rank, ThreadSHMStat::SHM_DATA_READY); + }); +} + +std::vector shm_recv_tensor_list_impl(ThreadSHMContext* ctx, + int64_t src) { + CPU_KERNEL_GUARD_IN(shm_recv_tensor_list_impl) + auto options = 
torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU); + torch::Tensor metadata_tensor = + torch::empty({sizeof(TensorListMeta)}, options); + + // Wait until the sender set the stat of the thread 0 to SHM_DATA_READY + ctx->wait_for_one(src, ThreadSHMStat::DONE); + shm_cc_ops::memcpy(metadata_tensor.data_ptr(), + ctx->get_thread_shm_ptr(src), + sizeof(TensorListMeta)); + TensorListMeta* src_metadata = + reinterpret_cast(metadata_tensor.data_ptr()); + std::vector tensor_list_with_metadata = + src_metadata->generate_tensor_list(); + + TensorListMeta metadata; + metadata.bind_tensor_list(tensor_list_with_metadata); + TORCH_CHECK_EQ(metadata.tensor_num, src_metadata->tensor_num); + TORCH_CHECK_EQ(metadata.total_bytes, src_metadata->total_bytes); + + shm_cc_ops::shm_cc_loop( + ctx, metadata.total_bytes, + [&](ThreadSHMContext* thread_ctx, int64_t data_offset, + int64_t data_elem_num) { + // Wait until the sender set the stat to SHM_DATA_READY + thread_ctx->wait_for_one(src, ThreadSHMStat::DONE); + int64_t curr_shm_offset = 0; + while (curr_shm_offset < data_elem_num) { + MemPiece frag = metadata.get_data(data_offset + curr_shm_offset); + frag.size = std::min(frag.size, data_elem_num - curr_shm_offset); + shm_cc_ops::memcpy( + frag.ptr, + thread_ctx->get_thread_shm_ptr(src) + curr_shm_offset, + frag.size); + curr_shm_offset += frag.size; + } + + thread_ctx->set_thread_stat(src, ThreadSHMStat::DONE); + }); + + std::vector tensor_list; + tensor_list.reserve(metadata.tensor_num - 1); + tensor_list.insert(tensor_list.begin(), tensor_list_with_metadata.begin() + 1, + tensor_list_with_metadata.end()); + + return tensor_list; +} +} // namespace + +void shm_gather(int64_t handle, torch::Tensor& data, + const std::optional>& outputs, + int64_t dst) { + TORCH_CHECK(data.is_contiguous()) + VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_gather_impl", [&] { + CPU_KERNEL_GUARD_IN(shm_gather_impl) + + if (outputs.has_value()) { + TORCH_CHECK_LE(outputs->size(), 
MAX_SHM_RANK_NUM); + scalar_t* output_ptrs[MAX_SHM_RANK_NUM] = {nullptr}; + for (int i = 0; i < outputs->size(); ++i) { + output_ptrs[i] = outputs->at(i).data_ptr(); + } + shm_gather_impl(SHMManager::get_singleton_instance(handle)->get_shm_ctx(), + data.data_ptr(), data.numel(), output_ptrs, + dst); + } else { + shm_gather_impl(SHMManager::get_singleton_instance(handle)->get_shm_ctx(), + data.data_ptr(), data.numel(), (scalar_t**)(0), + dst); + } + + CPU_KERNEL_GUARD_OUT(shm_gather_impl) + }); +} + +void shm_all_gather(int64_t handle, const torch::Tensor& data, + torch::Tensor& output) { + TORCH_CHECK(data.is_contiguous()) + TORCH_CHECK(output.is_contiguous()) + + const int64_t input_elem_num = data.numel(); + const int64_t output_elem_num = output.numel(); + TORCH_CHECK_EQ(output_elem_num % input_elem_num, 0); + const int world_size = output_elem_num / input_elem_num; + + VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_all_gather_impl", [&] { + CPU_KERNEL_GUARD_IN(shm_all_gather_impl) + auto ctx = SHMManager::get_singleton_instance(handle)->get_shm_ctx(); + TORCH_CHECK_EQ(ctx->group_size, world_size); + + scalar_t* output_ptrs[MAX_SHM_RANK_NUM] = {nullptr}; + for (int i = 0; i < world_size; ++i) { + output_ptrs[i] = output.data_ptr() + i * input_elem_num; + } + shm_gather_impl(ctx, data.data_ptr(), data.numel(), output_ptrs, + ctx->rank); + CPU_KERNEL_GUARD_OUT(shm_all_gather_impl) + }); +} + +void shm_allreduce(int64_t handle, torch::Tensor& data) { + TORCH_CHECK(data.is_contiguous()) + VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_allreduce_sum", [&] { + CPU_KERNEL_GUARD_IN(shm_allreduce_sum) + shm_allreduce_sum(SHMManager::get_singleton_instance(handle)->get_shm_ctx(), + data.data_ptr(), data.numel()); + CPU_KERNEL_GUARD_OUT(shm_allreduce_sum) + }); +} + +void shm_send_tensor_list(int64_t handle, + const std::vector& tensor_list, + int64_t dst) { + CPU_KERNEL_GUARD_IN(shm_send_tensor_list) + shm_send_tensor_list_impl( + 
SHMManager::get_singleton_instance(handle)->get_shm_ctx(), tensor_list); + CPU_KERNEL_GUARD_OUT(shm_send_tensor_list) +} + +std::vector shm_recv_tensor_list(int64_t handle, int64_t src) { + CPU_KERNEL_GUARD_IN(shm_recv_tensor_list) + auto tensor_list = shm_recv_tensor_list_impl( + SHMManager::get_singleton_instance(handle)->get_shm_ctx(), src); + CPU_KERNEL_GUARD_OUT(shm_recv_tensor_list) + return tensor_list; +} + +int64_t init_shm_manager(const std::string& name, const int64_t group_size, + const int64_t rank) { + return SHMManager::create_singleton_instance(name, group_size, rank); +} + +std::string join_shm_manager(int64_t handle, const std::string& name) { + auto shm_manager = SHMManager::get_singleton_instance(handle); + TORCH_CHECK(shm_manager); + shm_manager->join(name); + return shm_manager->get_shm_ctx()->to_string(); +} \ No newline at end of file diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index ef5a2fb5c4d22..7ae7e3386b4ed 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -22,6 +22,26 @@ void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query, torch::Tensor& kv_cache, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens); +int64_t init_shm_manager(const std::string& name, const int64_t group_size, + const int64_t rank); + +std::string join_shm_manager(int64_t handle, const std::string& name); + +void shm_allreduce(int64_t handle, torch::Tensor& data); + +void shm_gather(int64_t handle, torch::Tensor& data, + const std::optional>& outputs, + int64_t dst); + +void shm_all_gather(int64_t handle, const torch::Tensor& data, + torch::Tensor& output); + +void shm_send_tensor_list(int64_t handle, + const std::vector& tensor_list, + int64_t dst); + +std::vector shm_recv_tensor_list(int64_t handle, int64_t src); + TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops @@ -131,6 +151,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor? azp, Tensor? 
bias) -> ()"); ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp); #endif + +// SHM CCL +#ifdef __AVX512F__ + ops.def("init_shm_manager(str name, int group_size, int rank) -> int", + &init_shm_manager); + ops.def("join_shm_manager(int handle, str name) -> str", &join_shm_manager); + ops.def("shm_allreduce(int handle, Tensor! data) -> ()"); + ops.impl("shm_allreduce", torch::kCPU, &shm_allreduce); + ops.def( + "shm_gather(int handle, Tensor data, Tensor[](a!)? outputs, int dst) -> " + "()"); + ops.impl("shm_gather", torch::kCPU, &shm_gather); + ops.def( + "shm_all_gather(int handle, Tensor data, Tensor! output) -> " + "()"); + ops.impl("shm_all_gather", torch::kCPU, &shm_all_gather); + ops.def( + "shm_send_tensor_list(int handle, Tensor[](a) tensor_list, int dst) -> " + "()"); + ops.impl("shm_send_tensor_list", torch::kCPU, &shm_send_tensor_list); + ops.def("shm_recv_tensor_list(int handle, int src) -> Tensor[](a)", + &shm_recv_tensor_list); +#endif } TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp index 42a1c1d924bac..c17a8961629a6 100644 --- a/csrc/cpu/utils.cpp +++ b/csrc/cpu/utils.cpp @@ -4,6 +4,11 @@ #include #include #endif +#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 30 + #include + #include + #define gettid() syscall(SYS_gettid) +#endif #include "cpu_types.hpp" @@ -18,7 +23,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { #ifndef VLLM_NUMA_DISABLED std::string init_cpu_threads_env(const std::string& cpu_ids) { - bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str()); + bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str()); TORCH_CHECK(omp_cpu_mask->size > 0); std::vector omp_cpu_ids; omp_cpu_ids.reserve(omp_cpu_mask->size); diff --git a/csrc/cuda_view.cu b/csrc/cuda_view.cu new file mode 100644 index 0000000000000..938bd4ab7fc62 --- /dev/null +++ b/csrc/cuda_view.cu @@ -0,0 +1,39 @@ +#include +#include +#include + +// This 
function assumes that `cpu_tensor` is a CPU tensor allocated with pinned +// memory, and that UVA (Unified Virtual Addressing) is enabled. +torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor) { + TORCH_CHECK(cpu_tensor.device().is_cpu(), "Input tensor must be on CPU"); + + // Get raw host pointer from CPU tensor + void* host_ptr = cpu_tensor.data_ptr(); + + // Get a device pointer corresponding to the pinned host memory + void* device_ptr = nullptr; + cudaError_t err = cudaHostGetDevicePointer(&device_ptr, host_ptr, 0); + TORCH_CHECK(err == cudaSuccess, + "cudaHostGetDevicePointer failed: ", cudaGetErrorString(err)); + + // We'll use the same sizes, strides, and dtype as the CPU tensor. + // TODO: check if layout is respected. + auto sizes = cpu_tensor.sizes(); + auto strides = cpu_tensor.strides(); + auto options = cpu_tensor.options().device(torch::kCUDA); + + // from_blob signature: from_blob(void *data, IntArrayRef sizes, ..., Deleter, + // const TensorOptions &) Provide a no-op deleter. The CPU tensor holds the + // memory, so we don't free it here. 
+ auto deleter = [](void*) { + // no-op, since the memory is owned by the original CPU tensor + }; + + torch::Tensor cuda_tensor = + torch::from_blob(device_ptr, sizes, strides, deleter, options); + + TORCH_CHECK(cuda_tensor.device().is_cuda(), + "Resulting tensor is not on CUDA device"); + + return cuda_tensor; +} diff --git a/csrc/custom_all_reduce.cu b/csrc/custom_all_reduce.cu index 123278bfed71d..a38d6fa24a28e 100644 --- a/csrc/custom_all_reduce.cu +++ b/csrc/custom_all_reduce.cu @@ -12,7 +12,7 @@ static_assert(sizeof(void*) == sizeof(fptr_t)); fptr_t init_custom_ar(const std::vector& fake_ipc_ptrs, torch::Tensor& rank_data, int64_t rank, - bool full_nvlink) { + bool fully_connected) { int world_size = fake_ipc_ptrs.size(); if (world_size > 8) throw std::invalid_argument("world size > 8 is not supported"); @@ -27,7 +27,7 @@ fptr_t init_custom_ar(const std::vector& fake_ipc_ptrs, } return (fptr_t) new vllm::CustomAllreduce(ipc_ptrs, rank_data.data_ptr(), rank_data.numel(), rank, world_size, - full_nvlink); + fully_connected); } /** @@ -142,3 +142,48 @@ void register_graph_buffers(fptr_t _fa, bytes.reserve(handles.size()); fa->register_graph_buffers(bytes, offsets); } + +std::tuple allocate_shared_buffer_and_handle( + int64_t size) { + auto device_index = c10::cuda::current_device(); + at::DeviceGuard device_guard(at::Device(at::DeviceType::CUDA, device_index)); + void* buffer; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + auto stream = c10::cuda::getCurrentCUDAStream().stream(); + AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + + // Allocate buffer +#if defined(USE_ROCM) + // data buffers need to be "uncached" for signal on MI200 + AT_CUDA_CHECK( + hipExtMallocWithFlags((void**)&buffer, size, hipDeviceMallocUncached)); +#else + AT_CUDA_CHECK(cudaMalloc((void**)&buffer, size)); +#endif + AT_CUDA_CHECK(cudaMemsetAsync(buffer, 0, size, stream)); + AT_CUDA_CHECK(cudaStreamSynchronize(stream)); + 
AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + + // Create IPC memhandle for the allocated buffer. + // Will use it in open_mem_handle. + auto options = + torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU); + auto handle = + torch::empty({static_cast(sizeof(cudaIpcMemHandle_t))}, options); + AT_CUDA_CHECK( + cudaIpcGetMemHandle((cudaIpcMemHandle_t*)handle.data_ptr(), buffer)); + + return std::make_tuple(reinterpret_cast(buffer), handle); +} + +fptr_t open_mem_handle(torch::Tensor& mem_handle) { + void* ipc_ptr; + AT_CUDA_CHECK(cudaIpcOpenMemHandle( + (void**)&ipc_ptr, *((const cudaIpcMemHandle_t*)mem_handle.data_ptr()), + cudaIpcMemLazyEnablePeerAccess)); + return reinterpret_cast(ipc_ptr); +} + +void free_shared_buffer(fptr_t buffer) { + AT_CUDA_CHECK(cudaFree(reinterpret_cast(buffer))); +} diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh index b9df4ed160b03..44709b4597765 100644 --- a/csrc/custom_all_reduce.cuh +++ b/csrc/custom_all_reduce.cuh @@ -5,6 +5,10 @@ #include #include +#if defined(USE_ROCM) +typedef __hip_bfloat16 nv_bfloat16; +#endif + #include #include #include @@ -12,6 +16,7 @@ #include #include +namespace vllm { #define CUDACHECK(cmd) \ do { \ cudaError_t e = cmd; \ @@ -22,24 +27,37 @@ } \ } while (0) -namespace vllm { - +// Maximal number of blocks in allreduce kernel. constexpr int kMaxBlocks = 36; + +// Default number of blocks in allreduce kernel. +#ifndef USE_ROCM +const int defaultBlockLimit = 36; +CUpointer_attribute rangeStartAddrAttr = CU_POINTER_ATTRIBUTE_RANGE_START_ADDR; +#else +const int defaultBlockLimit = 16; +hipPointer_attribute rangeStartAddrAttr = + HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR; +#endif + // Counter may overflow, but it's fine since unsigned int overflow is // well-defined behavior. using FlagType = uint32_t; + +// Two sets of peer counters are needed for two syncs: starting and ending an +// operation. 
The reason is that it's possible for peer GPU block to arrive at +// the second sync point while the current GPU block haven't passed the first +// sync point. Thus, peer GPU may write counter+1 while current GPU is busy +// waiting for counter. We use alternating counter array to avoid this +// possibility. struct Signal { - alignas(128) FlagType self_counter[kMaxBlocks][8]; - // Two sets of peer counters are needed for two syncs. The reason is that - // it's possible for peer GPU block to arrive at the second sync point while - // the current GPU block haven't passed the first sync point. Thus, peer GPU - // may write counter+1 while current GPU is busy waiting for counter. We use - // alternating counter array to avoid this possibility. - alignas(128) FlagType peer_counter[2][kMaxBlocks][8]; + alignas(128) FlagType start[kMaxBlocks][8]; + alignas(128) FlagType end[kMaxBlocks][8]; + alignas(128) FlagType _flag[kMaxBlocks]; // incremental flags for each rank }; struct __align__(16) RankData { - const void* __restrict__ ptrs[8]; + const void* ptrs[8]; }; struct __align__(16) RankSignals { @@ -134,27 +152,29 @@ DINLINE O downcast(array_t val) { } } +#if !defined(USE_ROCM) + static DINLINE void st_flag_release(FlagType* flag_addr, FlagType flag) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 asm volatile("st.release.sys.global.u32 [%1], %0;" ::"r"(flag), "l"(flag_addr)); -#else + #else asm volatile("membar.sys; st.volatile.global.u32 [%1], %0;" ::"r"(flag), "l"(flag_addr)); -#endif + #endif } static DINLINE FlagType ld_flag_acquire(FlagType* flag_addr) { FlagType flag; -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 asm volatile("ld.acquire.sys.global.u32 %0, [%1];" : "=r"(flag) : "l"(flag_addr)); -#else + #else asm volatile("ld.volatile.global.u32 %0, [%1]; membar.gl;" : "=r"(flag) : "l"(flag_addr)); -#endif + #endif return flag; } @@ -170,37 +190,99 
@@ static DINLINE FlagType ld_flag_volatile(FlagType* flag_addr) { return flag; } -// is_start: whether this is the very first synchronization barrier. -// need_fence: whether a memory fence is needed. If true, a release-acquire -// semantic is used to enforce memory access order before and after this -// barrier. -template -DINLINE void multi_gpu_barrier(const RankSignals& sg, Signal* self_sg, - int rank) { - if constexpr (!is_start) __syncthreads(); - static_assert( - !(is_start && need_fence)); // Start barrier shouldn't need fence. +// This function is meant to be used as the first synchronization in the all +// reduce kernel. Thus, it doesn't need to make any visibility guarantees for +// prior memory accesses. Note: volatile writes will not be reordered against +// other volatile writes. +template +DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg, + int rank) { + uint32_t flag = self_sg->_flag[blockIdx.x] + 1; if (threadIdx.x < ngpus) { - // Increment the counter. Technically we only need one counter, but we use - // multiple per block to eliminate the need to share the counter via smem. - auto val = self_sg->self_counter[blockIdx.x][threadIdx.x] += 1; + auto peer_counter_ptr = &sg.signals[threadIdx.x]->start[blockIdx.x][rank]; + auto self_counter_ptr = &self_sg->start[blockIdx.x][threadIdx.x]; + // Write the expected counter value to peer and wait for correct value + // from peer. + st_flag_volatile(peer_counter_ptr, flag); + while (ld_flag_volatile(self_counter_ptr) != flag); + } + __syncthreads(); + // use one thread to update flag + if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag; +} + +// This function is meant to be used as the second or the final +// synchronization barrier in the all reduce kernel. If it's the final +// synchronization barrier, we don't need to make any visibility guarantees +// for prior memory accesses. 
+template +DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) { + __syncthreads(); + uint32_t flag = self_sg->_flag[blockIdx.x] + 1; + if (threadIdx.x < ngpus) { + auto peer_counter_ptr = &sg.signals[threadIdx.x]->end[blockIdx.x][rank]; + auto self_counter_ptr = &self_sg->end[blockIdx.x][threadIdx.x]; // Write the expected counter value to peer and wait for correct value from // peer. - auto peer_counter_ptr = - &sg.signals[threadIdx.x]->peer_counter[val % 2][blockIdx.x][rank]; - auto self_counter_ptr = - &self_sg->peer_counter[val % 2][blockIdx.x][threadIdx.x]; - if constexpr (need_fence) { - st_flag_release(peer_counter_ptr, val); - while (ld_flag_acquire(self_counter_ptr) != val); + if constexpr (!final_sync) { + st_flag_release(peer_counter_ptr, flag); + while (ld_flag_acquire(self_counter_ptr) != flag); } else { - st_flag_volatile(peer_counter_ptr, val); - while (ld_flag_volatile(self_counter_ptr) != val); + st_flag_volatile(peer_counter_ptr, flag); + while (ld_flag_volatile(self_counter_ptr) != flag); } } - if constexpr (is_start || need_fence) __syncthreads(); + if constexpr (!final_sync) __syncthreads(); + + // use one thread to update flag + if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag; } +#else + +template +DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg, + int rank) { + uint32_t flag = self_sg->_flag[blockIdx.x] + 1; + if (threadIdx.x < ngpus) { + // simultaneously write to the corresponding flag of all ranks. 
+ // Latency = 1 p2p write + __scoped_atomic_store_n(&sg.signals[threadIdx.x]->start[blockIdx.x][rank], + flag, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); + // wait until we got true from all ranks + while (__scoped_atomic_load_n(&self_sg->start[blockIdx.x][threadIdx.x], + __ATOMIC_RELAXED, + __MEMORY_SCOPE_DEVICE) < flag); + } + __syncthreads(); + // use one thread to update flag + if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag; +} + +template +DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) { + __syncthreads(); + uint32_t flag = self_sg->_flag[blockIdx.x] + 1; + if (threadIdx.x < ngpus) { + // simultaneously write to the corresponding flag of all ranks. + // Latency = 1 p2p write + __scoped_atomic_store_n(&sg.signals[threadIdx.x]->end[blockIdx.x][rank], + flag, + final_sync ? __ATOMIC_RELAXED : __ATOMIC_RELEASE, + __MEMORY_SCOPE_SYSTEM); + // wait until we got true from all ranks + while ( + __scoped_atomic_load_n(&self_sg->end[blockIdx.x][threadIdx.x], + final_sync ? __ATOMIC_RELAXED : __ATOMIC_ACQUIRE, + __MEMORY_SCOPE_DEVICE) < flag); + } + if constexpr (!final_sync) __syncthreads(); + // use one thread to update flag + if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag; +} + +#endif + template DINLINE P packed_reduce(const P* ptrs[], int idx) { A tmp = upcast(ptrs[0][idx]); @@ -220,13 +302,13 @@ __global__ void __launch_bounds__(512, 1) // note: we don't reorder the address so the accumulation order is the same // for all ranks, ensuring bitwise identical results auto dp = *_dp; - multi_gpu_barrier(sg, self_sg, rank); + barrier_at_start(sg, self_sg, rank); // do the actual reduction for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += gridDim.x * blockDim.x) { ((P*)result)[idx] = packed_reduce((const P**)&dp.ptrs[0], idx); } - multi_gpu_barrier(sg, self_sg, rank); + barrier_at_end(sg, self_sg, rank); } template @@ -255,18 +337,20 @@ __global__ void __launch_bounds__(512, 1) tmps[i] = get_tmp_buf

(sg.signals[target]); } auto tmp_out = tmps[0]; - multi_gpu_barrier(sg, self_sg, rank); + barrier_at_start(sg, self_sg, rank); + // stage 1: reduce scatter for (int idx = start + tid; idx < end; idx += stride) { tmp_out[idx - start] = packed_reduce(ptrs, idx); } - multi_gpu_barrier(sg, self_sg, rank); + barrier_at_end(sg, self_sg, rank); // stage 2: allgather. Note: it's important to match the tid between // the two stages, because visibility across devices is only guaranteed // between threads that have the same tid. If thread i computes the sum of - // start + i in the first stage, then thread i also gathers start + i from all - // ranks. + // start + i in the first stage, then thread i also gathers start + i from + // all ranks. + for (int idx = tid; idx < largest_part; idx += stride) { #pragma unroll for (int i = 0; i < ngpus; i++) { @@ -287,21 +371,22 @@ class CustomAllreduce { public: int rank_; int world_size_; - bool full_nvlink_; + // Full NVLink or xGMI connection between GPUs. + bool fully_connected_; RankSignals sg_; - // Stores an map from a pointer to its peer pointters from all ranks. + // Stores a map from a pointer to its peer pointers from all ranks. std::unordered_map buffers_; Signal* self_sg_; // Stores rank data from all ranks. This is mainly for cuda graph purposes. // For cuda graph to work, all kernel arguments must be fixed during graph - // capture time. However, the peer pointers are not known during graph capture - // time. Therefore, during capture, we increment the rank data pointer and use - // that as the argument to the kernel. The kernel arguments are stored in - // graph_unreg_buffers_. The actual peer pointers will be filled in at the - // memory pointed to by the pointers in graph_unreg_buffers_ when - // the IPC handles are exchanged between ranks. + // capture time. However, the peer pointers are not known during graph + // capture time. 
Therefore, during capture, we increment the rank data + // pointer and use that as the argument to the kernel. The kernel arguments + // are stored in graph_unreg_buffers_. The actual peer pointers will be + // filled in at the memory pointed to by the pointers in + // graph_unreg_buffers_ when the IPC handles are exchanged between ranks. // // The overall process looks like this: // 1. Graph capture. @@ -319,17 +404,18 @@ class CustomAllreduce { * Signals are an array of ipc-enabled buffers from all ranks. * For each of the buffer, the layout is as follows: * | -- sizeof(Signal) -- | ------ a few MB ----- | - * The first section is for allreduce synchronization, and the second section - * is for storing the intermediate results required by some allreduce algos. + * The first section is for allreduce synchronization, and the second + * section is for storing the intermediate results required by some + * allreduce algos. * * Note: this class does not own any device memory. Any required buffers * are passed in from the constructor. 
*/ CustomAllreduce(Signal** signals, void* rank_data, size_t rank_data_sz, - int rank, int world_size, bool full_nvlink = true) + int rank, int world_size, bool fully_connected = true) : rank_(rank), world_size_(world_size), - full_nvlink_(full_nvlink), + fully_connected_(fully_connected), self_sg_(signals[rank]), d_rank_data_base_(reinterpret_cast(rank_data)), d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) { @@ -361,8 +447,7 @@ class CustomAllreduce { void* base_ptr; // note: must share the base address of each allocation, or we get wrong // address - if (cuPointerGetAttribute(&base_ptr, - CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, + if (cuPointerGetAttribute(&base_ptr, rangeStartAddrAttr, (CUdeviceptr)ptr) != CUDA_SUCCESS) throw std::runtime_error("failed to get pointer attr"); CUDACHECK(cudaIpcGetMemHandle( @@ -396,11 +481,11 @@ class CustomAllreduce { // Note: when registering graph buffers, we intentionally choose to not // deduplicate the addresses. That means if the allocator reuses some - // addresses, they will be registered again. This is to account for the remote - // possibility of different allocation patterns between ranks. For example, - // rank 1 may get the same input address for the second allreduce, but rank 2 - // got a different address. IPC handles have internal reference counting - // mechanism so overhead should be small. + // addresses, they will be registered again. This is to account for the + // remote possibility of different allocation patterns between ranks. For + // example, rank 1 may get the same input address for the second allreduce, + // but rank 2 got a different address. IPC handles have internal reference + // counting mechanism so overhead should be small. void register_graph_buffers( const std::vector& handles, const std::vector>& offsets) { @@ -431,15 +516,15 @@ class CustomAllreduce { /** * Performs allreduce, assuming input has already been registered. 
* - * Block and grid default configs are results after careful grid search. Using - * 36 blocks give the best or close to the best runtime on the devices I - * tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also only - * take a small amount of SMs. Not quite sure the underlying reason, but my - * guess is that too many SMs will cause contention on NVLink bus. + * Block and grid default configs are results after careful grid search. + * Using 36 blocks give the best or close to the best runtime on the devices + * I tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also + * only take a small amount of SMs. Not quite sure the underlying reason, + * but my guess is that too many SMs will cause contention on NVLink bus. */ template void allreduce(cudaStream_t stream, T* input, T* output, int size, - int threads = 512, int block_limit = 36) { + int threads = 512, int block_limit = defaultBlockLimit) { auto d = packed_t::P::size; if (size % d != 0) throw std::runtime_error( @@ -473,13 +558,11 @@ class CustomAllreduce { #define KL(ngpus, name) \ name<<>>(ptrs, sg_, self_sg_, output, \ rank_, size); - // TODO(hanzhi713): Threshold is different for A100 and H100. - // Add per device threshold. #define REDUCE_CASE(ngpus) \ case ngpus: { \ if (world_size_ == 2) { \ KL(ngpus, cross_device_reduce_1stage); \ - } else if (full_nvlink_) { \ + } else if (fully_connected_) { \ if ((world_size_ <= 4 && bytes < 512 * 1024) || \ (world_size_ <= 8 && bytes < 256 * 1024)) { \ KL(ngpus, cross_device_reduce_1stage); \ @@ -497,7 +580,8 @@ class CustomAllreduce { REDUCE_CASE(8) default: throw std::runtime_error( - "custom allreduce only supports num gpus in (2,4,6,8). Actual num " + "custom allreduce only supports num gpus in (2,4,6,8). 
Actual " + "num " "gpus = " + std::to_string(world_size_)); } @@ -511,10 +595,11 @@ class CustomAllreduce { } } }; + /** - * To inspect PTX/SASS, copy paste this header file to compiler explorer and add - a template instantiation: + * To inspect PTX/SASS, copy paste this header file to compiler explorer and + add a template instantiation: * template void vllm::CustomAllreduce::allreduce(cudaStream_t, half *, half *, int, int, int); */ -} // namespace vllm +} // namespace vllm \ No newline at end of file diff --git a/csrc/custom_all_reduce_test.cu b/csrc/custom_all_reduce_test.cu index b59ea40d980f4..f7f0823465d30 100644 --- a/csrc/custom_all_reduce_test.cu +++ b/csrc/custom_all_reduce_test.cu @@ -1,9 +1,9 @@ /** * This is a standalone test for custom allreduce. * To compile, make sure you have MPI and NCCL installed in your system. - * export MPI_HOME=xxx + * export MPI_HOME=XXX * nvcc -O2 -arch=native -std=c++17 custom_all_reduce_test.cu -o - * custom_all_reduce_test -lnccl -I${MPI_HOME} -lmpi + * custom_all_reduce_test -lnccl -I${MPI_HOME}/include -lmpi * * Warning: this C++ test is not designed to be very readable and was used * during the rapid prototyping process. 
@@ -22,7 +22,15 @@ #include "cuda_profiler_api.h" #include "custom_all_reduce.cuh" #include "mpi.h" -#include "nccl.h" +#ifdef USE_ROCM + #include +typedef __hip_bfloat16 nv_bfloat16; + #include "rccl/rccl.h" + #include "custom_all_reduce_hip.cuh" +#else + #include "nccl.h" + #include "custom_all_reduce.cuh" +#endif #define MPICHECK(cmd) \ do { \ @@ -43,16 +51,29 @@ } \ } while (0) +#ifdef USE_ROCM __global__ void dummy_kernel() { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + for (int i = 0; i < 100; i++) { + uint64_t start = wall_clock64(); + uint64_t cycles_elapsed; + do { + cycles_elapsed = wall_clock64() - start; + } while (cycles_elapsed < 100); + } for (int i = 0; i < 100; i++) __nanosleep(1000000); // 100ms +} #else +__global__ void dummy_kernel() { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + for (int i = 0; i < 100; i++) __nanosleep(1000000); // 100ms + #else for (int i = 0; i < 100; i++) { long long int start = clock64(); while (clock64() - start < 150000000); // approximately 98.4ms on P40 } -#endif + #endif } +#endif template __global__ void set_data(T* data, int size, int myRank) { @@ -121,8 +142,14 @@ void run(int myRank, int nRanks, ncclComm_t& comm, int threads, int block_limit, * registration, they are allocated and registered together in the test for * convenience. */ +#ifdef USE_ROCM + CUDACHECK(hipExtMallocWithFlags( + (void**)&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal), + hipDeviceMallocUncached)); +#else CUDACHECK( cudaMalloc(&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal))); +#endif CUDACHECK( cudaMemset(buffer, 0, 2 * data_size * sizeof(T) + sizeof(vllm::Signal))); CUDACHECK(cudaMalloc(&self_data_copy, data_size * sizeof(T))); @@ -311,13 +338,18 @@ int main(int argc, char** argv) { bool performance_test = true; cudaProfilerStart(); - // Uncomment to scan through different block size configs. 
- // for (int threads : {256, 512, 1024}) { - // for (int block_limit = 16; block_limit < 112; block_limit += 4) { - // run(myRank, nRanks, comm, threads, block_limit, 1024 * 1024, - // performance_test); - // } - // } +// Uncomment to scan through different block size configs. +// for (int threads : {256, 512, 1024}) { +// for (int block_limit = 16; block_limit < 112; block_limit += 4) { +// run(myRank, nRanks, comm, threads, block_limit, 1024 * 1024, +// performance_test); +// } +// } +#ifdef USE_ROCM + const int block_limit = 16; +#else + const int block_limit = 36; +#endif // Scan through different sizes to test performance. for (int sz = 512; sz <= (8 << 20); sz *= 2) { run(myRank, nRanks, comm, 512, 36, sz + 8 * 47, performance_test); @@ -326,4 +358,4 @@ int main(int argc, char** argv) { cudaProfilerStop(); MPICHECK(MPI_Finalize()); return EXIT_SUCCESS; -} +} \ No newline at end of file diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp index febc4eccd9561..dbe0e30f5cbfe 100644 --- a/csrc/cutlass_extensions/common.hpp +++ b/csrc/cutlass_extensions/common.hpp @@ -48,4 +48,14 @@ struct enable_sm90_or_later : Kernel { Kernel::operator()(std::forward(args)...); #endif } -}; \ No newline at end of file +}; + +template +struct enable_sm90_only : Kernel { + template + CUTLASS_DEVICE void operator()(Args&&... args) { +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 900 + Kernel::operator()(std::forward(args)...); +#endif + } +}; diff --git a/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp b/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp new file mode 100644 index 0000000000000..5c1d6e3f46be0 --- /dev/null +++ b/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp @@ -0,0 +1,457 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. 
SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +// +// This file is a modified excerpt of +// include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp +// from https://github.com/NVIDIA/cutlass v3.5.0 +// It has been modified to support either row/column or scalar broadcasting +// where the tensor being loaded from is always passed in via a device pointer. 
+// This lets one compiled kernel handle all cases of per-tensor or +// per-channel/per-token quantization. +// +// This interface also allows the scales to be passed in as tensors that +// consistently reside on the device, which avoids an issue with a previous +// implementation where scalars needed to be on the CPU since they +// were passed in via float values. This created a potential performance hazard +// if scales were initially on the device, and caused torch.compile graphs +// breaks when moving scales to the CPU. +// +#pragma once + +// Turn off clang-format for the entire file to keep it close to upstream +// clang-format off + +#include "cutlass/cutlass.h" +#include "cutlass/arch/barrier.h" + +#include "cute/tensor.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp" + +namespace cutlass::epilogue::fusion { + +using namespace cute; +using namespace detail; + +// Row vector broadcast +template< + int Stages, + class CtaTileShapeMNK, + class Element, + class StrideMNL = Stride<_0,_1,_0>, + int Alignment = 128 / sizeof_bits_v +> +struct Sm90RowOrScalarBroadcastArray { + static_assert(Stages == 0, "Row broadcast doesn't support smem usage"); + static_assert(is_static_v(StrideMNL{}))>); // batch stride can be dynamic or static + static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_1>{}); + + struct SharedStorage { + array_aligned(CtaTileShapeMNK{})> smem; + }; + + // This struct has been modified to have a bool indicating that ptr_row is a + // scalar that must be broadcast, instead of containing a scalar that is + // valid if ptr_row is null. 
+ struct Arguments { + const Element* const* ptr_row_array = nullptr; + bool row_broadcast = true; + StrideMNL dRow = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm90RowOrScalarBroadcastArray() { } + + CUTLASS_HOST_DEVICE + Sm90RowOrScalarBroadcastArray(Params const& params, SharedStorage const& shared_storage) + : params(params) + , smem(const_cast(shared_storage.smem.data())) { } + + Params params; + Element *smem = nullptr; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_zero() const { + return (!params.row_broadcast && *(params.ptr_row_array[group]) == Element(0)); + } + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + GS_GTensor tGS_gRow_, GS_STensor tGS_sRow_, + GS_CTensor tGS_cRow_, Tiled_G2S tiled_g2s_, + SR_STensor tSR_sRow_, SR_RTensor tSR_rRow_, + CTensor tCcRow_, ThrResidue residue_tCcRow_, ThrNum thr_num_, + int group, Params const& params_) + : tGS_gRow(tGS_gRow_) + , tGS_sRow(tGS_sRow_) + , tGS_cRow(tGS_cRow_) + , tiled_G2S(tiled_g2s_) + , 
tSR_sRow(tSR_sRow_) + , tSR_rRow(tSR_rRow_) + , tCcRow(tCcRow_) + , residue_tCcRow(residue_tCcRow_) + , group(group) + , params(params_) {} + + GS_GTensor tGS_gRow; // (CPY,CPY_M,CPY_N) + GS_STensor tGS_sRow; // (CPY,CPY_M,CPY_N) + GS_CTensor tGS_cRow; // (CPY,CPY_M,CPY_N) + Tiled_G2S tiled_G2S; + + SR_STensor tSR_sRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + SR_RTensor tSR_rRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + CTensor tCcRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + ThrResidue residue_tCcRow; // (m, n) + ThrNum thr_num; + int group; + Params const& params; + + CUTLASS_DEVICE void + begin() { + if (!params.row_broadcast) { + fill(tSR_rRow, *(params.ptr_row_array[group])); + return; + } + + auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(thr_num, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; + Tensor tGS_gRow_flt = filter_zeros(tGS_gRow); + Tensor tGS_sRow_flt = filter_zeros(tGS_sRow); + Tensor tGS_cRow_flt = make_tensor(tGS_cRow.data(), make_layout(tGS_gRow_flt.shape(), tGS_cRow.stride())); + + for (int i = 0; i < size(tGS_gRow_flt); ++i) { + if (get<1>(tGS_cRow_flt(i)) >= size<1>(CtaTileShapeMNK{})) { + continue; // OOB of SMEM, + } + if (elem_less(tGS_cRow_flt(i), make_coord(get<0>(residue_tCcRow), get<1>(residue_tCcRow)))) { + tGS_sRow_flt(i) = tGS_gRow_flt(i); + } + else { + tGS_sRow_flt(i) = Element(0); // Set to Zero when OOB so LDS could be issue without any preds. 
+ } + } + synchronize(); + } + + CUTLASS_DEVICE void + begin_loop(int epi_m, int epi_n) { + if (epi_m == 0) { // Assumes M-major subtile loop + if (!params.row_broadcast) return; // Do not issue LDS when row is scalar + Tensor tSR_sRow_flt = filter_zeros(tSR_sRow(_,_,_,epi_m,epi_n)); + Tensor tSR_rRow_flt = filter_zeros(tSR_rRow); + copy(tSR_sRow_flt, tSR_rRow_flt); + } + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_row; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + frg_row[i] = tSR_rRow(epi_v * FragmentSize + i); + } + + return frg_row; + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + using ThreadCount = decltype(size(args.tiled_copy)); + + Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row_array[l]), make_shape(M,N,1), params.dRow); + Tensor gRow = local_tile(mRow(_,_,l), take<0,2>(args.tile_shape_mnk), make_coord(m, n)); // (CTA_M, CTA_N) + Tensor sRow = make_tensor(make_smem_ptr(smem), + make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{})), make_shape(_0{}, _1{})); // (CTA_M, CTA_N) + //// G2S: Gmem to Smem + auto tiled_g2s = make_tiled_copy(Copy_Atom{}, + Layout< Shape<_1, ThreadCount>, + Stride<_0, _1>>{}, + Layout<_1>{}); + auto thr_g2s = tiled_g2s.get_slice(args.thread_idx); + Tensor tGS_gRow = thr_g2s.partition_S(gRow); + Tensor tGS_sRow = thr_g2s.partition_D(sRow); + + //// G2S: Coord + auto cRow = make_identity_tensor(make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}))); + Tensor tGS_cRow = thr_g2s.partition_S(cRow); + + //// S2R: Smem to Reg + Tensor tSR_sRow = sm90_partition_for_epilogue(sRow, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tSR_rRow = 
make_tensor_like(take<0,3>(tSR_sRow)); // (CPY,CPY_M,CPY_N) + + return ConsumerStoreCallbacks( + tGS_gRow, + tGS_sRow, + tGS_cRow, tiled_g2s, + tSR_sRow, + tSR_rRow, + args.tCcD, + args.residue_cD, + ThreadCount{}, + l, + params); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Column vector broadcast +template< + int Stages, + class CtaTileShapeMNK, + class Element, + class StrideMNL = Stride<_1,_0,_0>, + int Alignment = 128 / sizeof_bits_v +> +struct Sm90ColOrScalarBroadcastArray { + static_assert(Stages == 0, "Column broadcast doesn't support smem usage yet"); + static_assert(Alignment * sizeof_bits_v % 128 == 0, "sub-16B alignment not supported yet"); + static_assert( + (cute::is_same_v>) || // col vector broadcast, e.g. per-row alpha/bias + (cute::is_same_v>)); // batched col vector broadcast, e.g. batched per-row bias + + // Accumulator distributes col elements evenly amongst threads so we can just directly load from gmem + struct SharedStorage { }; + + // This struct has been modified to have a bool indicating that ptr_col is a + // scalar that must be broadcast, instead of containing a scalar that is + // valid if ptr_col is null. 
+ struct Arguments { + const Element* const* ptr_col_array = nullptr; + bool col_broadcast = true; + StrideMNL dCol = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_zero() const { + return (!params.col_broadcast && *(params.ptr_col_array[group]) == Element(0)); + } + + CUTLASS_HOST_DEVICE + Sm90ColOrScalarBroadcastArray() { } + + CUTLASS_HOST_DEVICE + Sm90ColOrScalarBroadcastArray(Params const& params, SharedStorage const& shared_storage) + : params(params) { } + + Params params; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + GTensor&& tCgCol, + RTensor&& tCrCol, + CTensor&& tCcCol, + ProblemShape problem_shape, + int group, + Params const& params + ): + tCgCol(cute::forward(tCgCol)), + tCrCol(cute::forward(tCrCol)), + tCcCol(cute::forward(tCcCol)), + m(get<0>(problem_shape)), + group(group), + params(params) {} + + GTensor tCgCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + RTensor tCrCol; + CTensor tCcCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + Params 
const& params; + int m; + int group; + + CUTLASS_DEVICE void + begin() { + Tensor pred = make_tensor(shape(tCgCol)); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(pred); ++i) { + pred(i) = get<0>(tCcCol(i)) < m; + } + + if (!params.col_broadcast) { + fill(tCrCol, *(params.ptr_col_array[group])); + return; + } + + // Filter so we don't issue redundant copies over stride-0 modes + // (only works if 0-strides are in same location, which is by construction) + copy_if(pred, filter(tCgCol), filter(tCrCol)); + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_col; + Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + frg_col[i] = tCrCol_mn(epi_v * FragmentSize + i); + } + + return frg_col; + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + + Tensor mCol = make_tensor(make_gmem_ptr(params.ptr_col_array[l]), make_shape(M,N,1), params.dCol); + Tensor tCgCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tCrCol = make_tensor_like(tCgCol); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + // Generate an identity tensor matching the shape of the global tensor and + // partition the same way, this will be used to generate the predicate + // tensor for loading + Tensor cCol = make_identity_tensor(mCol.shape()); + Tensor tCcCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + cCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + + return ConsumerStoreCallbacks( + cute::move(tCgCol), + cute::move(tCrCol), + 
cute::move(tCcCol), + args.problem_shape_mnkl, + l, + params + ); + } +}; + +} diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp index 0a812dc56a994..62b848a0a9635 100644 --- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp +++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp @@ -1,6 +1,7 @@ #pragma once #include "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp" +#include "cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp" /* This file defines custom epilogues for fusing channel scales, token scales, @@ -69,6 +70,16 @@ struct ScaledEpilogueBase { 0 /*Stages*/, TileShape, T, T, Stride, Int<1>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; + template + using ColOrScalarLoadArray = + cutlass::epilogue::fusion::Sm90ColOrScalarBroadcastArray< + 0 /*Stages*/, TileShape, T, Stride, Int<0>, Int<0>>>; + + template + using RowOrScalarLoadArray = + cutlass::epilogue::fusion::Sm90RowOrScalarBroadcastArray< + 0 /*Stages*/, TileShape, T, Stride, Int<1>, Int<0>>>; + // This utility function constructs the arguments for the load descriptors // from a tensor. It can handle both row and column, as well as row/column or // scalar cases. @@ -96,6 +107,14 @@ struct ScaledEpilogueBase { std::is_same_v>); return Arguments{data_ptr}; } + + template + static auto args_from_tensor(const T* const* data_ptr, bool do_broadcast) { + using Arguments = typename Descriptor::Arguments; + static_assert(std::is_same_v> || + std::is_same_v>); + return Arguments{data_ptr, do_broadcast}; + } }; /* @@ -381,4 +400,51 @@ struct ScaledEpilogueBiasAzpToken } }; +/* + This epilogue works like ScaledEpilogue, but ScaleA and ScaleB are pointers + to arrays containing different scales used in group gemm. The number of + pointers in ScaleA and the number of pointers in ScaleB are equal to the + group size. 
+*/ +template +struct ScaledEpilogueArray + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoadArray; + using ScaleB = typename SUPER::template RowOrScalarLoadArray; + + using Compute0 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::fusion::Sm90EVT; + + using Compute1 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + using ScaleAArray = typename SUPER::template ColOrScalarLoadArray; + using ScaleBArray = typename SUPER::template RowOrScalarLoadArray; + + static ArgumentType prepare_args(float const* const* a_scales_ptr, + float const* const* b_scales_ptr, + bool a_col_broadcast, bool b_row_broadcast) { + auto a_args = SUPER::template args_from_tensor( + a_scales_ptr, a_col_broadcast); + auto b_args = SUPER::template args_from_tensor( + b_scales_ptr, b_row_broadcast); + + typename EVTCompute0::Arguments evt0_args{b_args, {}, {}}; + return ArgumentType{a_args, evt0_args, {}}; + } +}; + }; // namespace vllm::c3x diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu index f0e5533bcae60..98daf1a1b8e6c 100644 --- a/csrc/mamba/causal_conv1d/causal_conv1d.cu +++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu @@ -422,7 +422,7 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) { int final_state_position = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize); // in case the final state is separated between the last "smem_exchange" and // and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2), - // (which occurs when `final_state_position` is a non-positivie index) + 
// (which occurs when `final_state_position` is a non-positive index) // we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it if (conv_states != nullptr && final_state_position < 0 && seqlen > kWidth){ input_t vals_load[kNElts] = {0}; diff --git a/csrc/moe/marlin_moe_wna16/generate_kernels.py b/csrc/moe/marlin_moe_wna16/generate_kernels.py new file mode 100644 index 0000000000000..d1c0d92f6814a --- /dev/null +++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: Apache-2.0 +import glob +import itertools +import os +import subprocess + +import jinja2 + +FILE_HEAD = """ +// auto generated by generate.py +// clang-format off + +#include "kernel.h" +#include "marlin_template.h" + +namespace MARLIN_NAMESPACE_NAME { +""".strip() + +TEMPLATE = ("template __global__ void Marlin<" + "{{scalar_t}}, " + "{{w_type_id}}, " + "{{threads}}, " + "{{thread_m_blocks}}, " + "{{thread_n_blocks}}, " + "{{thread_k_blocks}}, " + "{{'true' if m_block_size_8 else 'false'}}, " + "{{stages}}, " + "{{'true' if has_act_order else 'false'}}, " + "{{'true' if has_zp else 'false'}}, " + "{{group_blocks}}, " + "{{'true' if is_zp_float else 'false'}}>" + "( MARLIN_KERNEL_PARAMS );") + +# int8 with zero point case (vllm::kU8) is also supported, +# we don't add it to reduce wheel size. 
+SCALAR_TYPES = ["vllm::kU4", "vllm::kU4B8", "vllm::kU8B128"] +THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)] + +THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4] +# group_blocks: +# = 0 : act order case +# = -1 : channelwise quantization +# > 0 : group_size=16*group_blocks +GROUP_BLOCKS = [0, -1, 2, 4, 8] +DTYPES = ["fp16", "bf16"] + + +def remove_old_kernels(): + for filename in glob.glob(os.path.dirname(__file__) + "/kernel_*.cu"): + subprocess.call(["rm", "-f", filename]) + + +def generate_new_kernels(): + for scalar_type, dtype in itertools.product(SCALAR_TYPES, DTYPES): + has_zp = "B" not in scalar_type + all_template_str_list = [] + + for group_blocks, m_blocks, thread_configs in itertools.product( + GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS): + + has_act_order = group_blocks == 0 + if has_zp and has_act_order: + continue + if thread_configs[2] == 256: + if m_blocks <= 1 and thread_configs[0] != 128: + continue + if m_blocks > 1 and thread_configs[0] != 64: + continue + + k_blocks = thread_configs[0] // 16 + n_blocks = thread_configs[1] // 16 + threads = thread_configs[2] + + c_dtype = "half" if dtype == "fp16" else "nv_bfloat16" + + template_str = jinja2.Template(TEMPLATE).render( + scalar_t=c_dtype, + w_type_id=scalar_type + ".id()", + threads=threads, + thread_m_blocks=max(m_blocks, 1), + thread_n_blocks=n_blocks, + thread_k_blocks=k_blocks, + m_block_size_8=m_blocks == 0.5, + stages="pipe_stages", + has_act_order=has_act_order, + has_zp=has_zp, + group_blocks=group_blocks, + is_zp_float=False, + ) + + all_template_str_list.append(template_str) + + file_content = FILE_HEAD + "\n\n" + file_content += "\n\n".join(all_template_str_list) + "\n\n}\n" + filename = f"kernel_{dtype}_{scalar_type[6:].lower()}.cu" + + with open(os.path.join(os.path.dirname(__file__), filename), "w") as f: + f.write(file_content) + + +if __name__ == "__main__": + remove_old_kernels() + generate_new_kernels() diff --git a/csrc/moe/marlin_moe_wna16/kernel.h 
b/csrc/moe/marlin_moe_wna16/kernel.h new file mode 100644 index 0000000000000..3d92660e8028e --- /dev/null +++ b/csrc/moe/marlin_moe_wna16/kernel.h @@ -0,0 +1,44 @@ + +#ifndef MARLIN_NAMESPACE_NAME + #define MARLIN_NAMESPACE_NAME marlin_moe_wna16 +#endif + +#include "quantization/gptq_marlin/marlin.cuh" +#include "quantization/gptq_marlin/marlin_dtypes.cuh" +#include "core/scalar_type.hpp" + +#define MARLIN_KERNEL_PARAMS \ + const int4 *__restrict__ A, const int4 *__restrict__ B, \ + int4 *__restrict__ C, int4 *__restrict__ C_tmp, \ + const int4 *__restrict__ scales_ptr, const int4 *__restrict__ zp_ptr, \ + const int *__restrict__ g_idx, \ + const int32_t *__restrict__ sorted_token_ids_ptr, \ + const int32_t *__restrict__ expert_ids_ptr, \ + const int32_t *__restrict__ num_tokens_past_padded_ptr, \ + const float *__restrict__ topk_weights_ptr, int top_k, \ + bool mul_topk_weights, bool is_ep, int num_groups, int prob_m, \ + int prob_n, int prob_k, int *locks, bool use_atomic_add, \ + bool use_fp32_reduce + +namespace MARLIN_NAMESPACE_NAME { +template shared + // fetch pipeline + const bool has_act_order, // whether act_order is enabled + const bool has_zp, // whether zero-points are enabled + const int group_blocks, // number of consecutive 16x16 blocks + // with a separate quantization scale + const bool is_zp_float // is zero point of float16 type? + > +__global__ void Marlin(MARLIN_KERNEL_PARAMS); + +} diff --git a/csrc/moe/marlin_moe_wna16/marlin_template.h b/csrc/moe/marlin_moe_wna16/marlin_template.h new file mode 100644 index 0000000000000..205b308fe511b --- /dev/null +++ b/csrc/moe/marlin_moe_wna16/marlin_template.h @@ -0,0 +1,1917 @@ +/* + * Modified by Neural Magic + * Copyright (C) Marlin.2024 Elias Frantar + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Adapted from https://github.com/IST-DASLab/marlin + */ + +#ifndef MARLIN_NAMESPACE_NAME + #define MARLIN_NAMESPACE_NAME marlin_moe_wna16 +#endif + +#include "quantization/gptq_marlin/marlin.cuh" +#include "quantization/gptq_marlin/marlin_dtypes.cuh" +#include "core/scalar_type.hpp" + +#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \ + static_assert(std::is_same::value || \ + std::is_same::value, \ + "only float16 and bfloat16 is supported"); + +namespace MARLIN_NAMESPACE_NAME { + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + +template shared + // fetch pipeline + const bool has_act_order, // whether act_order is enabled + const bool has_zp, // whether zero-points are enabled + const int group_blocks, // number of consecutive 16x16 blocks + // with a separate quantization scale + const bool is_zp_float // is zero point of float16 type? 
+ > +__global__ void Marlin( + const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + int4* __restrict__ C, // fp16 output buffer of shape mxn + int4* __restrict__ C_tmp, // fp32 tmp output buffer (for reduce) + const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape + // (k/groupsize)xn + const int4* __restrict__ zp_ptr, // 4bit packed zero-points of shape + // (k/groupsize)x(n/pack_factor) + const int* __restrict__ g_idx, // int32 group indices of shape k + const int32_t* __restrict__ sorted_token_ids_ptr, // moe sorted_ids + const int32_t* __restrict__ expert_ids_ptr, // moe expert ids + const int32_t* __restrict__ num_tokens_past_padded_ptr, // moe num tokens + const float* __restrict__ topk_weights_ptr, // moe top weights + int top_k, // num of experts per token + bool mul_topk_weights, // mul topk weights or not + bool is_ep, // expert parallelism + int num_groups, // number of scale groups per output channel + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks, // extra global storage for barrier synchronization + bool use_atomic_add, // whether to use atomic add to reduce + bool use_fp32_reduce // whether to use fp32 global reduce +) {} + +} // namespace MARLIN_NAMESPACE_NAME + +#else + +// m16n8k16 tensor core mma instruction with fp16 inputs and fp32 +// output/accumulation. 
+template +__device__ inline void mma(const typename ScalarType::FragA& a_frag, + const typename ScalarType::FragB& frag_b, + typename ScalarType::FragC& frag_c) { + const uint32_t* a = reinterpret_cast(&a_frag); + const uint32_t* b = reinterpret_cast(&frag_b); + float* c = reinterpret_cast(&frag_c); + if constexpr (std::is_same::value) { + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); + } else if constexpr (std::is_same::value) { + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); + } else { + STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t); + } +} + +template +__device__ inline void mma_trans( + const typename ScalarType::FragA& a_frag, + const typename ScalarType::FragB& frag_b, + const typename ScalarType::FragB& frag_b2, + typename ScalarType::FragC& frag_c) { + const uint32_t* a = reinterpret_cast(&a_frag); + const uint32_t* b = reinterpret_cast(&frag_b); + const uint32_t* b2 = reinterpret_cast(&frag_b2); + float* c = reinterpret_cast(&frag_c); + if constexpr (std::is_same::value) { + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); + } else if constexpr (std::is_same::value) { + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : 
"=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); + } else { + STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t); + } +} + +// Instruction for loading a full 16x16 matrix fragment of operand A from shared +// memory, directly in tensor core layout. +template +__device__ inline void ldsm(typename ScalarType::FragA& frag_a, + const void* smem_ptr) { + uint32_t* a = reinterpret_cast(&frag_a); + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + if constexpr (count == 4) { + asm volatile( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n" + : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3]) + : "r"(smem)); + } else if constexpr (count == 2) { + asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0,%1}, [%2];\n" + : "=r"(a[0]), "=r"(a[1]) + : "r"(smem)); + } else if constexpr (count == 1) { + asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n" + : "=r"(a[0]) + : "r"(smem)); + } else { + static_assert(count == 1 || count == 2 || count == 4, "invalid count"); + } +} + +// Lookup-table based 3-input logical operation; explicitly used for +// dequantization as the compiler does not seem to automatically recognize it in +// all cases. 
+template +__device__ inline int lop3(int a, int b, int c) { + int res; + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(res) + : "r"(a), "r"(b), "r"(c), "n"(lut)); + return res; +} + +// Constructs destination register by taking bytes from 2 sources (based on +// mask) +template +__device__ inline uint32_t prmt(uint32_t a) { + uint32_t res; + asm volatile("prmt.b32 %0, %1, %2, %3;\n" + : "=r"(res) + : "r"(a), "n"(start_byte), "n"(mask)); + return res; +} + +template +__device__ inline typename ScalarType::FragB dequant( + int q, typename ScalarType::FragB& frag_b); + +// +// Efficiently dequantize 4bit values packed in an int32 value into a full +// B-fragment of 4 fp16 values. We mostly follow the strategy in the link below, +// with some small changes: +// - FP16: +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287 +// - BF16: +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L327-L385 +// +template <> +__device__ inline typename ScalarType::FragB dequant( + int q, typename ScalarType::FragB& frag_b) { + const int LO = 0x000f000f; + const int HI = 0x00f000f0; + const int EX = 0x64006400; + // Guarantee that the `(a & b) | c` operations are LOP3s. + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point + // directly into `SUB` and `ADD`. 
+ const int SUB = 0x64086408; + const int MUL = 0x2c002c00; + const int ADD = 0xd480d480; + frag_b[0] = __hsub2(*reinterpret_cast(&lo), + *reinterpret_cast(&SUB)); + frag_b[1] = __hfma2(*reinterpret_cast(&hi), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + return frag_b; +} + +template <> +__device__ inline typename ScalarType::FragB +dequant(int q, + typename ScalarType::FragB& frag_b) { + static constexpr uint32_t MASK = 0x000f000f; + static constexpr uint32_t EX = 0x43004300; + + // Guarantee that the `(a & b) | c` operations are LOP3s. + + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + q >>= 4; + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + + static constexpr uint32_t MUL = 0x3F803F80; + static constexpr uint32_t ADD = 0xC308C308; + + frag_b[0] = __hfma2(*reinterpret_cast(&lo), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + frag_b[1] = __hfma2(*reinterpret_cast(&hi), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + return frag_b; +} + +// +// Fast Int8ToFp16/Int8ToBf16: Efficiently dequantize 8bit int values to fp16 or +// bf16 Reference: +// - FP16: +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85 +// - BF16: +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L125-L175 +// +template <> +__device__ inline typename ScalarType::FragB dequant( + int q, typename ScalarType::FragB& frag_b) { + static constexpr uint32_t mask_for_elt_01 = 0x5250; + static constexpr uint32_t mask_for_elt_23 = 0x5351; + static constexpr uint32_t start_byte_for_fp16 = 0x64646464; + + uint32_t lo = prmt(q); + uint32_t hi = prmt(q); + + static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480; + + frag_b[0] = __hsub2(*reinterpret_cast(&lo), + *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); + 
frag_b[1] = __hsub2(*reinterpret_cast(&hi), + *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); + return frag_b; +} + +template <> +__device__ inline typename ScalarType::FragB +dequant(int q, + typename ScalarType::FragB& frag_b) { + float fp32_intermediates[4]; + uint32_t* fp32_intermediates_casted = + reinterpret_cast(fp32_intermediates); + + static constexpr uint32_t fp32_base = 0x4B000000; + fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650); + fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652); + fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651); + fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653); + + fp32_intermediates[0] -= 8388736.f; + fp32_intermediates[1] -= 8388736.f; + fp32_intermediates[2] -= 8388736.f; + fp32_intermediates[3] -= 8388736.f; + + uint32_t* bf16_result_ptr = reinterpret_cast(&frag_b); + bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0], + fp32_intermediates_casted[1], 0x7632); + bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2], + fp32_intermediates_casted[3], 0x7632); + + return frag_b; +} + +// Multiply dequantized values by the corresponding quantization scale; used +// only for grouped quantization. 
+template +__device__ inline void scale(typename ScalarType::FragB& frag_b, + typename ScalarType::FragS& frag_s, + int i) { + using scalar_t2 = typename ScalarType::scalar_t2; + scalar_t2 s = + ScalarType::num2num2(reinterpret_cast(&frag_s)[i]); + frag_b[0] = __hmul2(frag_b[0], s); + frag_b[1] = __hmul2(frag_b[1], s); +} + +template +__device__ inline void scale_and_sub( + typename ScalarType::FragB& frag_b, scalar_t s, scalar_t zp) { + using scalar_t2 = typename ScalarType::scalar_t2; + scalar_t2 s2 = ScalarType::num2num2(s); + scalar_t2 zp2 = ScalarType::num2num2(zp); + frag_b[0] = __hfma2(frag_b[0], s2, __hneg2(zp2)); + frag_b[1] = __hfma2(frag_b[1], s2, __hneg2(zp2)); +} + +template +__device__ inline void sub_zp(typename ScalarType::FragB& frag_b, + typename ScalarType::scalar_t2& frag_zp, + int i) { + using scalar_t2 = typename ScalarType::scalar_t2; + scalar_t2 zp = + ScalarType::num2num2(reinterpret_cast(&frag_zp)[i]); + frag_b[0] = __hsub2(frag_b[0], zp); + frag_b[1] = __hsub2(frag_b[1], zp); +} + +// Same as above, but for act_order (each K is multiplied individually) +template +__device__ inline void scale4(typename ScalarType::FragB& frag_b, + typename ScalarType::FragS& frag_s_1, + typename ScalarType::FragS& frag_s_2, + typename ScalarType::FragS& frag_s_3, + typename ScalarType::FragS& frag_s_4, + int i) { + using scalar_t2 = typename ScalarType::scalar_t2; + scalar_t2 s_val_1_2; + s_val_1_2.x = reinterpret_cast(&frag_s_1)[i]; + s_val_1_2.y = reinterpret_cast(&frag_s_2)[i]; + + scalar_t2 s_val_3_4; + s_val_3_4.x = reinterpret_cast(&frag_s_3)[i]; + s_val_3_4.y = reinterpret_cast(&frag_s_4)[i]; + + frag_b[0] = __hmul2(frag_b[0], s_val_1_2); + frag_b[1] = __hmul2(frag_b[1], s_val_3_4); +} + +// Given 2 floats multiply by 2 scales (halves) +template +__device__ inline void scale_float(float* c, + typename ScalarType::FragS& s) { + scalar_t* s_ptr = reinterpret_cast(&s); + c[0] = __fmul_rn(c[0], ScalarType::num2float(s_ptr[0])); + c[1] = __fmul_rn(c[1], 
ScalarType::num2float(s_ptr[1])); +} + +// Wait until barrier reaches `count`, then lock for current threadblock. +__device__ inline void barrier_acquire(int* lock, int count) { + if (threadIdx.x == 0) { + int state = -1; + do + // Guarantee that subsequent writes by this threadblock will be visible + // globally. + asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" + : "=r"(state) + : "l"(lock)); + while (state != count); + } + __syncthreads(); +} + +// Release barrier and increment visitation count. +__device__ inline void barrier_release(int* lock, bool reset = false) { + __syncthreads(); + if (threadIdx.x == 0) { + if (reset) { + lock[0] = 0; + return; + } + int val = 1; + // Make sure that all writes since acquiring this barrier are visible + // globally, while releasing the barrier. + asm volatile("fence.acq_rel.gpu;\n"); + asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n" + : + : "l"(lock), "r"(val)); + } +} + +// Wait until value of lock to be negative, and then add 1 +__device__ inline void wait_negative_and_add(int* lock) { + if (threadIdx.x == 0) { + int state = 0; + do + // Guarantee that subsequent writes by this threadblock will be visible + // globally. + asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" + : "=r"(state) + : "l"(lock)); + while (state >= 0); + atomicAdd(lock, 1); + } + __syncthreads(); +} + +template shared + // fetch pipeline + const bool has_act_order, // whether act_order is enabled + const bool has_zp, // whether zero-points are enabled + const int group_blocks, // number of consecutive 16x16 blocks + // with a separate quantization scale + const bool is_zp_float // is zero point of float16 type? 
+ > +__global__ void Marlin( + const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + int4* __restrict__ C, // fp16 output buffer of shape mxn + int4* __restrict__ C_tmp, // fp32 tmp output buffer (for reduce) + const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape + // (k/groupsize)xn + const int4* __restrict__ zp_ptr, // 4bit packed zero-points of shape + // (k/groupsize)x(n/pack_factor) + const int* __restrict__ g_idx, // int32 group indices of shape k + const int32_t* __restrict__ sorted_token_ids_ptr, // moe sorted_ids + const int32_t* __restrict__ expert_ids_ptr, // moe expert ids + const int32_t* __restrict__ num_tokens_past_padded_ptr, // moe num tokens + const float* __restrict__ topk_weights_ptr, // moe top weights + int top_k, // num of experts per token + bool mul_topk_weights, // mul topk weights or not + bool is_ep, // expert parallelism + int num_groups, // number of scale groups per output channel + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks, // extra global storage for barrier synchronization + bool use_atomic_add, // whether to use atomic add to reduce + bool use_fp32_reduce // whether to use fp32 global reduce +) { + // Each threadblock processes one "stripe" of the B matrix with (roughly) the + // same size, which might involve multiple column "slices" (of width 16 * + // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM + // example: + // 0 1 3 + // 0 2 3 + // 1 2 4 + // While this kind of partitioning makes things somewhat more complicated, it + // ensures good utilization of all SMs for many kinds of shape and GPU + // configurations, while requiring as few slow global cross-threadblock + // reductions as possible. 
+ using Dtype = ScalarType; + using scalar_t2 = typename ScalarType::scalar_t2; + using FragA = typename ScalarType::FragA; + using FragB = typename ScalarType::FragB; + using FragC = typename ScalarType::FragC; + using FragS = typename ScalarType::FragS; + using FragZP = typename ScalarType::FragZP; + + extern __shared__ int4 sh[]; + static constexpr auto w_type = vllm::ScalarType::from_id(w_type_id); + + constexpr int pack_factor = 32 / w_type.size_bits(); + static_assert(thread_m_blocks == 1 || !m_block_size_8); + constexpr int moe_block_size = m_block_size_8 ? 8 : (16 * thread_m_blocks); + const int group_size = + (!has_act_order && group_blocks == -1) ? prob_k : prob_k / num_groups; + const int scales_expert_stride = prob_n * prob_k / group_size / 8; + const int zp_expert_stride = + is_zp_float ? prob_n * prob_k / group_size / 8 + : prob_n * prob_k / group_size / (pack_factor * 4); + + // parallel: num valid moe blocks + int num_tokens_past_padded = num_tokens_past_padded_ptr[0]; + int parallel = num_tokens_past_padded / moe_block_size; + int num_valid_blocks = parallel; + if (is_ep) { + for (int i = 0; i < parallel; i++) { + if (expert_ids_ptr[i] == -1) num_valid_blocks--; + } + } + int num_invalid_blocks = parallel - num_valid_blocks; + parallel = num_valid_blocks; + + int k_tiles = prob_k / 16 / thread_k_blocks; + int n_tiles = prob_n / 16 / thread_n_blocks; + int iters = div_ceil(k_tiles * n_tiles * parallel, gridDim.x); + + if constexpr (!has_act_order && group_blocks != -1) { + if (group_blocks >= thread_k_blocks) { + // Ensure that the number of tiles in each stripe is a multiple of the + // groupsize; this avoids an annoying special case where a stripe starts + // in the middle of group. 
+ iters = (group_blocks / thread_k_blocks) * + div_ceil(iters, (group_blocks / thread_k_blocks)); + } + } + + int slice_row = (iters * blockIdx.x) % k_tiles; + int slice_col_par = (iters * blockIdx.x) / k_tiles; + int slice_col = slice_col_par; + int slice_iters; // number of threadblock tiles in the current slice + int slice_count = + 0; // total number of active threadblocks in the current slice + int slice_idx; // index of threadblock in current slice; numbered bottom to + // top + + int par_id = 0; + int block_id = -1; + int64_t expert_id = 0; // use int64 to avoid computation result overflow + int old_expert_id = 0; + int64_t B_expert_off = 0; + + int4* sh_block_sorted_ids_int4 = sh; + int32_t* sh_block_sorted_ids = + reinterpret_cast(sh_block_sorted_ids_int4); + int4* sh_block_topk_weights_int4 = + sh_block_sorted_ids_int4 + moe_block_size / 4; + scalar_t2* sh_block_topk_weights = + reinterpret_cast(sh_block_topk_weights_int4); + int4* sh_new = sh_block_topk_weights_int4 + moe_block_size / 4; + + int32_t block_num_valid_tokens = 0; + int32_t locks_off = 0; + + // We can easily implement parallel problem execution by just remapping + // indices and advancing global pointers + if (slice_col_par >= n_tiles) { + slice_col = slice_col_par % n_tiles; + par_id = slice_col_par / n_tiles; + } + if (parallel * n_tiles >= gridDim.x) { + // when parallel * n_tiles >= sms + // then there are at most $sms$ conflict tile blocks + locks_off = blockIdx.x; + } else { + locks_off = (iters * blockIdx.x) / k_tiles - 1; + } + + // read moe block data given block_id + // block_sorted_ids / block_num_valid_tokens / block_topk_weights + auto read_moe_block_data = [&](int block_id) { + block_num_valid_tokens = moe_block_size; + #pragma unroll + for (int i = 0; i < moe_block_size / 4; i++) { + int4 sorted_token_ids_int4 = reinterpret_cast( + sorted_token_ids_ptr)[block_id * moe_block_size / 4 + i]; + int* sorted_token_ids = reinterpret_cast(&sorted_token_ids_int4); + #pragma unroll + 
for (int j = 0; j < 4; j++) { + if (sorted_token_ids[j] >= prob_m * top_k) { + block_num_valid_tokens = i * 4 + j; + break; + } + } + if (block_num_valid_tokens != moe_block_size) break; + } + + __syncthreads(); + int tid4 = threadIdx.x / 4; + if (threadIdx.x % 4 == 0 && threadIdx.x < block_num_valid_tokens) { + sh_block_sorted_ids_int4[tid4] = reinterpret_cast( + sorted_token_ids_ptr)[block_id * moe_block_size / 4 + tid4]; + + if (mul_topk_weights) { + #pragma unroll + for (int i = 0; i < 4; i++) { + sh_block_topk_weights[tid4 * 4 + i] = + Dtype::num2num2(Dtype::float2num( + topk_weights_ptr[sh_block_sorted_ids[tid4 * 4 + i]])); + } + } + } + __syncthreads(); + }; + + // when move to next moe block, find the next block_id and expert_id + // and then read moe block data + auto update_next_moe_block_data = [&]() { + if (par_id >= parallel) return; + + old_expert_id = expert_id; + if (num_invalid_blocks > 0) { + int skip_count = block_id == -1 ? par_id : 0; + block_id++; + for (int i = block_id; i < num_tokens_past_padded / moe_block_size; i++) { + expert_id = expert_ids_ptr[i]; + if (expert_id != -1) { + if (skip_count == 0) { + block_id = i; + break; + }; + skip_count--; + }; + } + } else { + block_id = par_id; + expert_id = expert_ids_ptr[block_id]; + } + + B_expert_off = expert_id * prob_n * prob_k / (pack_factor * 4); + scales_ptr += (expert_id - old_expert_id) * scales_expert_stride; + if constexpr (has_zp) { + zp_ptr += (expert_id - old_expert_id) * zp_expert_stride; + } + if constexpr (has_act_order) { + g_idx += (expert_id - old_expert_id) * prob_k; + } + + read_moe_block_data(block_id); + }; + + // Compute all information about the current slice which is required for + // synchronization. 
+ auto init_slice = [&](bool first_init = false) { + slice_iters = + iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); + if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0; + if (slice_iters == 0) return; + if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row; + slice_count = 1; + slice_idx = 0; + int col_first = iters * div_ceil(k_tiles * slice_col_par, iters); + if (col_first <= k_tiles * (slice_col_par + 1)) { + int col_off = col_first - k_tiles * slice_col_par; + slice_count = div_ceil(k_tiles - col_off, iters); + if (col_off > 0) slice_count++; + int delta_first = iters * blockIdx.x - col_first; + if (delta_first < 0 || (col_off == 0 && delta_first == 0)) + slice_idx = slice_count - 1; + else { + slice_idx = slice_count - 1 - delta_first / iters; + if (col_off > 0) slice_idx--; + } + } + if (parallel * n_tiles >= gridDim.x) { + if (slice_count > 1 && slice_idx == slice_count - 1) { + locks_off++; + } + } else { + locks_off++; + } + + if (first_init && use_atomic_add && slice_count > 1 && slice_idx == 0) { + constexpr int threads_per_m = 16 * thread_n_blocks / 8; + int m_per_thread = + div_ceil(block_num_valid_tokens, threads / threads_per_m); + for (int i = 0; i < m_per_thread; i++) { + int row = threads / threads_per_m * i + threadIdx.x / threads_per_m; + if (row < block_num_valid_tokens) { + int64_t sorted_row = sh_block_sorted_ids[row]; + int col = slice_col * 16 * thread_n_blocks / 8 + + threadIdx.x % threads_per_m; + C[sorted_row * prob_n / 8 + col] = {0, 0, 0, 0}; + } + } + // After write zero to output, write a negative value to lock. + // Every SM that processes the same slice would wait for + // the negative value, and then atomicAdd 1 to it. + // After all SMs are processed, the lock value would back to 0 again. 
+ __syncthreads(); + if (threadIdx.x == 0) locks[locks_off] = 1 - slice_count; + } + + if (slice_col == n_tiles) { + slice_col = 0; + par_id++; + update_next_moe_block_data(); + } + }; + + update_next_moe_block_data(); + init_slice(true); + + // A sizes/strides + + // stride of the A matrix in global memory + int a_gl_stride = prob_k / 8; + // stride of an A matrix tile in shared memory + constexpr int a_sh_stride = 16 * thread_k_blocks / 8; + // delta between subsequent A tiles in global memory + constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / 8; + // between subsequent accesses within a tile + int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o); + // between shared memory writes + constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o); + // between shared memory tile reads + constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / (thread_n_blocks / 4)); + // within a shared memory tile + constexpr int a_sh_rd_delta_i = a_sh_stride * 16; + // overall size of a tile + constexpr int a_sh_stage = a_sh_stride * (16 * thread_m_blocks); + // number of shared write iterations for a tile + constexpr int a_sh_wr_iters = div_ceil(a_sh_stage, a_sh_wr_delta); + + // B sizes/strides + int b_gl_stride = 16 * prob_n / (pack_factor * 4); + constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4; + constexpr int b_thread_vecs = w_type.size_bits() == 4 ? 
1 : 2; + constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs; + + int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks; + int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads); + constexpr int b_sh_wr_delta = threads * b_thread_vecs; + constexpr int b_sh_rd_delta = threads * b_thread_vecs; + constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; + constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; + + // Scale sizes/strides without act_order + int s_gl_stride = prob_n / 8; + constexpr int s_sh_stride = 16 * thread_n_blocks / 8; + constexpr int s_tb_groups = + !has_act_order && group_blocks != -1 && group_blocks < thread_k_blocks + ? thread_k_blocks / group_blocks + : 1; + constexpr int s_sh_stage = s_tb_groups * s_sh_stride; + int s_gl_rd_delta = s_gl_stride; + + // Scale size/strides with act_order + constexpr int tb_k = 16 * thread_k_blocks; + constexpr int g_idx_stage = has_act_order ? (tb_k * sizeof(int)) / 16 : 0; + // constexpr int act_s_row_stride = 1; + // int act_s_col_stride = act_s_row_stride * num_groups; + int act_s_col_stride = 1; + int act_s_col_warp_stride = act_s_col_stride * 8; + int tb_n_warps = thread_n_blocks / 4; + int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps; + + // Zero-points sizes/strides + int zp_gl_stride = is_zp_float ? prob_n / 8 : (prob_n / pack_factor) / 4; + constexpr int zp_sh_stride = is_zp_float + ? 16 * thread_n_blocks / 8 + : ((16 * thread_n_blocks) / pack_factor) / 4; + constexpr int zp_tb_groups = s_tb_groups; + constexpr int zp_sh_stage = has_zp ? zp_tb_groups * zp_sh_stride : 0; + int zp_gl_rd_delta = zp_gl_stride; + + // Global A read index of current thread. + int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + a_gl_rd += a_gl_rd_delta_o * slice_row; + // Shared write index of current thread. + int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + // Shared read index. 
+ int a_sh_rd = + a_sh_stride * ((threadIdx.x % 32) % (16 / (m_block_size_8 ? 2 : 1))) + + (threadIdx.x % 32) / (16 / (m_block_size_8 ? 2 : 1)); + a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4)); + + int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) + + (threadIdx.x % b_sh_stride_threads) * b_thread_vecs; + b_gl_rd += b_sh_stride * slice_col; + b_gl_rd += b_gl_rd_delta_o * slice_row; + int b_sh_wr = threadIdx.x * b_thread_vecs; + int b_sh_rd = threadIdx.x * b_thread_vecs; + + // For act_order + constexpr int k_iter_size = tb_k / b_sh_wr_iters; + int slice_k_start = tb_k * slice_row; + int slice_k_finish = slice_k_start + tb_k * slice_iters; + int slice_k_start_shared_fetch = slice_k_start; + int slice_n_offset = act_s_col_tb_stride * slice_col; + + // No act_order + int s_gl_rd; + if constexpr (!has_act_order) { + if constexpr (group_blocks == -1) { + s_gl_rd = s_sh_stride * slice_col + threadIdx.x; + } else { + s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + + s_sh_stride * slice_col + threadIdx.x; + } + } + int s_sh_wr = threadIdx.x; + bool s_sh_wr_pred = threadIdx.x < s_sh_stride; + + // Zero-points + int zp_gl_rd; + if constexpr (has_zp) { + if constexpr (group_blocks == -1) { + zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x; + } else { + zp_gl_rd = zp_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + + zp_sh_stride * slice_col + threadIdx.x; + } + } + int zp_sh_wr = threadIdx.x; + bool zp_sh_wr_pred = threadIdx.x < zp_sh_stride; + + // We use a different scale layout for grouped and column-wise quantization as + // we scale a `half2` tile in column-major layout in the former and in + // row-major in the latter case. 
+ int s_sh_rd; + if constexpr (group_blocks != -1) + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 4; + else if constexpr (group_blocks == -1 && (m_block_size_8 || has_zp)) + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 8; + else + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) % 4; + + // Zero-points have the same read layout as the scales + // (without column-wise case) + constexpr int num_col_threads = 8; + constexpr int num_row_threads = 4; + constexpr int num_ints_per_thread = 8 / pack_factor; + int zp_sh_rd; + if constexpr (has_zp) { + if constexpr (is_zp_float) { + if constexpr (group_blocks != -1) { + zp_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 4; + } + } else { + zp_sh_rd = num_ints_per_thread * num_col_threads * + ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + num_ints_per_thread * ((threadIdx.x % 32) / num_row_threads); + } + } + + // To ensure that writing and reading A tiles to/from shared memory, the + // latter in fragment format, is fully bank conflict free, we need to use a + // rather fancy XOR-based layout. The key here is that neither reads nor + // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the + // same shared memory banks. Further, it seems (based on NSight-Compute) that + // each warp must also write a consecutive memory segment? + auto transform_a = [&](int i) { + int row = i / a_gl_rd_delta_o; + return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row; + }; + // Since the computation of this remapping is non-trivial and, due to our main + // loop unrolls, all shared memory accesses are static, we simply precompute + // both transformed reads and writes. 
+ int a_sh_wr_trans[a_sh_wr_iters]; + #pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) + a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); + int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { + #pragma unroll + for (int j = 0; j < thread_m_blocks; j++) + a_sh_rd_trans[i][j] = + transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); + } + + // Since B-accesses have non-constant stride they have to be computed at + // runtime; we break dependencies between subsequent accesses with a tile by + // maintining multiple pointers (we have enough registers), a tiny + // optimization. + const int4* B_ptr[b_sh_wr_iters]; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; + + // Shared memory storage for global fetch pipelines. + int4* sh_a = sh_new; + int4* sh_b = sh_a + (stages * a_sh_stage); + int4* sh_g_idx = sh_b + (stages * b_sh_stage); + int4* sh_zp = sh_g_idx + (stages * g_idx_stage); + int4* sh_s = sh_zp + (stages * zp_sh_stage); + int4* sh_red = sh_b; + + // Register storage for double buffer of shared memory reads. + FragA frag_a[2][thread_m_blocks]; + I4 frag_b_quant[2][b_thread_vecs]; + FragC frag_c[thread_m_blocks][4][2]; + FragS frag_s[2][4]; // No act-order + FragS act_frag_s[2][4][4]; // For act-order + int frag_qzp[2][num_ints_per_thread]; // Zero-points + FragZP frag_zp; // Zero-points in fp16 + FragZP frag_zpf[2]; // Zero-points in fp16 in HQQ + + // Zero accumulators. 
+ auto zero_accums = [&]() { + #pragma unroll + for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) + reinterpret_cast(frag_c)[i] = 0; + }; + + int sh_first_group_id = -1; + int sh_num_groups = -1; + constexpr int sh_max_num_groups = 32; + + auto fetch_act_order_scales_to_shared = [&](bool is_async, int first_group_id, + int last_group_id) { + sh_first_group_id = first_group_id; + sh_num_groups = last_group_id - first_group_id + 1; + + if (sh_num_groups < sh_max_num_groups) { + sh_num_groups = sh_max_num_groups; + } + + if (sh_first_group_id + sh_num_groups > num_groups) { + sh_num_groups = num_groups - sh_first_group_id; + } + + int row_offset = first_group_id * s_gl_stride; + + if (is_async) { + for (int i = 0; i < sh_num_groups; i++) { + if (threadIdx.x < s_sh_stride) { + cp_async4_pred(&sh_s[(i * s_sh_stride) + threadIdx.x], + &scales_ptr[row_offset + (i * s_gl_stride) + + slice_n_offset + threadIdx.x]); + } + } + } else { + for (int i = 0; i < sh_num_groups; i++) { + if (threadIdx.x < s_sh_stride) { + sh_s[(i * s_sh_stride) + threadIdx.x] = + scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset + + threadIdx.x]; + } + } + } + }; + // Asynchronously fetch the next A, B and s tile from global to the next + // shared memory pipeline location. 
+ int a_remaining_load_count_in_slice = stages; + auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { + if (pred) { + int4* sh_a_stage = sh_a + a_sh_stage * pipe; + if (prob_k > thread_k_blocks * 16 * stages || slice_col == 0 || + a_remaining_load_count_in_slice > 0) { + a_remaining_load_count_in_slice--; + #pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) { + int a_idx = a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off; + int row = a_idx / a_gl_stride; + int64_t sorted_row = 0; + if (!m_block_size_8 || row < 8) + sorted_row = sh_block_sorted_ids[row] / top_k; + int64_t true_idx = sorted_row * a_gl_stride + a_idx % a_gl_stride; + cp_async4_pred(&sh_a_stage[a_sh_wr_trans[i]], &A[true_idx], + row < block_num_valid_tokens); + } + } + int4* sh_b_stage = sh_b + b_sh_stage * pipe; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { + #pragma unroll + for (int j = 0; j < b_thread_vecs; j++) { + cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], + B_ptr[i] + j + B_expert_off); + } + + B_ptr[i] += b_gl_rd_delta_o; + } + + if constexpr (has_act_order) { + // Fetch g_idx thread-block portion + int full_pipe = a_off; + int cur_k = slice_k_start_shared_fetch + tb_k * full_pipe; + if (cur_k < prob_k && cur_k < slice_k_finish) { + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + + int4 const* cur_g_idx_stage_ptr = + reinterpret_cast(&g_idx[cur_k]); + + if (threadIdx.x < g_idx_stage) { + cp_async4_pred(&sh_g_idx_stage[threadIdx.x], + &cur_g_idx_stage_ptr[threadIdx.x]); + } + } + } else { + if constexpr (group_blocks != -1) { + int4* sh_s_stage = sh_s + s_sh_stage * pipe; + + if constexpr (group_blocks >= thread_k_blocks) { + // Only fetch scales if this tile starts a new group + if (pipe % (group_blocks / thread_k_blocks) == 0) { + if (s_sh_wr_pred) { + cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]); + } + s_gl_rd += s_gl_rd_delta; + } + } else { + for (int i = 0; i < s_tb_groups; i++) { + if (s_sh_wr_pred) { + 
cp_async4(&sh_s_stage[i * s_sh_stride + s_sh_wr], + &scales_ptr[s_gl_rd]); + } + s_gl_rd += s_gl_rd_delta; + } + } + } + + if constexpr (has_zp && group_blocks != -1) { + int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe; + + if constexpr (group_blocks >= thread_k_blocks) { + // Only fetch zero-points if this tile starts a new group + if (pipe % (group_blocks / thread_k_blocks) == 0) { + if (zp_sh_wr_pred) { + cp_async4(&sh_zp_stage[zp_sh_wr], &zp_ptr[zp_gl_rd]); + } + zp_gl_rd += zp_gl_rd_delta; + } + } else { + for (int i = 0; i < zp_tb_groups; i++) { + if (zp_sh_wr_pred) { + cp_async4(&sh_zp_stage[i * zp_sh_stride + zp_sh_wr], + &zp_ptr[zp_gl_rd]); + } + zp_gl_rd += zp_gl_rd_delta; + } + } + } + } + } + // Insert a fence even when we are winding down the pipeline to ensure that + // waiting is also correct at this point. + cp_async_fence(); + }; + + auto fetch_col_zp_to_shared = [&]() { + if (zp_sh_wr_pred) { + cp_async4(&sh_zp[zp_sh_wr], &zp_ptr[zp_gl_rd]); + } + }; + + auto fetch_col_scale_to_shared = [&]() { + if (s_sh_wr_pred) { + cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]); + } + }; + + // Wait until the next thread tile has been loaded to shared memory. + auto wait_for_stage = [&]() { + // We only have `stages - 2` active fetches since we are double buffering + // and can only issue the next fetch when it is guaranteed that the previous + // shared memory load is fully complete (as it may otherwise be + // overwritten). + cp_async_wait(); + __syncthreads(); + }; + + // Load the next sub-tile from the current location in the shared memory pipe + // into the current register buffer. 
+ auto fetch_to_registers = [&](int k, int pipe) { + int4* sh_a_stage = sh_a + a_sh_stage * pipe; + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) + ldsm( + frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); + int4* sh_b_stage = sh_b + b_sh_stage * pipe; + + #pragma unroll + for (int i = 0; i < b_thread_vecs; i++) { + frag_b_quant[k % 2][i] = *reinterpret_cast( + &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]); + } + }; + + bool is_same_group[stages]; + int same_group_id[stages]; + + auto init_same_group = [&](int pipe) { + if constexpr (!has_act_order) { + return; + } + + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + int* sh_g_idx_int_ptr = reinterpret_cast(sh_g_idx_stage); + + int group_id_1 = sh_g_idx_int_ptr[0]; + int group_id_2 = sh_g_idx_int_ptr[tb_k - 1]; + + is_same_group[pipe] = group_id_1 == group_id_2; + same_group_id[pipe] = group_id_1; + }; + + auto fetch_scales_to_registers = [&](int k, int full_pipe) { + int pipe = full_pipe % stages; + + if constexpr (!has_act_order) { + // No act-order case + if constexpr (group_blocks == -1) { + // load only when starting a new slice + if (k == 0 && full_pipe == 0) { + reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd]; + reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; + } + } else if constexpr (group_blocks != -1) { + if constexpr (group_blocks >= thread_k_blocks) { + int4* sh_s_stage = + sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * + (pipe / (group_blocks / thread_k_blocks))); + reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; + } else { + int warp_id = threadIdx.x / 32; + int n_warps = thread_n_blocks / 4; + + int warp_row = warp_id / n_warps; + + int cur_k = warp_row * 16; + cur_k += k_iter_size * (k % b_sh_wr_iters); + + int k_blocks = cur_k / 16; + int cur_group_id = k_blocks / group_blocks; + + int4* sh_s_stage = sh_s + s_sh_stage * pipe; + + reinterpret_cast(&frag_s[k % 2])[0] = + sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride]; + } 
+ } + + return; + } + + // Act-order case + + // Determine K of the "current" thread-block + int cur_k = slice_k_start + tb_k * full_pipe; + if (cur_k >= prob_k || cur_k >= slice_k_finish) { + return; + } + + // Reset (to current thread-block) since we read g_idx portion from the + // shared memory + cur_k = 0; + + // Progress to current iteration + cur_k += k_iter_size * (k % b_sh_wr_iters); + + // Determine "position" inside the thread-block (based on warp and + // thread-id) + int warp_id = threadIdx.x / 32; + int n_warps = + thread_n_blocks / 4; // Each warp processes 4 16-size tiles over N + + int warp_row = warp_id / n_warps; + int warp_col = warp_id % n_warps; + + cur_k += warp_row * 16; + + int th_id = threadIdx.x % 32; + cur_k += (th_id % 4) * 2; // Due to tensor-core layout for fp16 B matrix + + int s_col_shift = + /*slice_n_offset +*/ (act_s_col_warp_stride * warp_col) + + (th_id / 4) * act_s_col_stride; + + if (is_same_group[pipe]) { + if (k % 2 == 0) { + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))) = + sh_s[(same_group_id[pipe] - sh_first_group_id) * s_sh_stride + + s_col_shift]; + } else { + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))) = + *(reinterpret_cast(&(act_frag_s[(k - 1) % 2][0][0]))); + } + + for (int i = 1; i < 4; i++) { + *(reinterpret_cast(&(act_frag_s[k % 2][i][0]))) = + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))); + } + return; + } + + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + int* sh_g_idx_int_ptr = reinterpret_cast(sh_g_idx_stage); + + constexpr int k_frag_offsets[4] = {0, 1, 8, + 9}; // Tensor core offsets per thread + + #pragma unroll + for (int i = 0; i < 4; i++) { + int actual_k = cur_k + k_frag_offsets[i]; + + int group_id = sh_g_idx_int_ptr[actual_k]; + int rel_group_id = group_id - sh_first_group_id; + + *(reinterpret_cast(&(act_frag_s[k % 2][i][0]))) = + sh_s[rel_group_id * s_sh_stride + s_col_shift]; + } + }; + + auto fetch_zp_to_registers = [&](int k, int full_pipe) { + // This code does not handle 
group_blocks == 0, + // which signifies act_order. + // has_zp implies AWQ, which doesn't have act_order, + static_assert(!has_zp || group_blocks != 0); + + if constexpr (has_zp && !is_zp_float) { + int pipe = full_pipe % stages; + + if constexpr (group_blocks == -1) { + // load only when starting a new slice + if (k == 0 && full_pipe == 0) { + #pragma unroll + for (int i = 0; i < num_ints_per_thread; i++) { + frag_qzp[k % 2][i] = (reinterpret_cast(sh_zp))[zp_sh_rd + i]; + } + } + + } else if constexpr (group_blocks >= thread_k_blocks) { + int4* sh_zp_stage = + sh_zp + zp_sh_stage * ((group_blocks / thread_k_blocks) * + (pipe / (group_blocks / thread_k_blocks))); + for (int i = 0; i < num_ints_per_thread; i++) { + frag_qzp[k % 2][i] = + (reinterpret_cast(sh_zp_stage))[zp_sh_rd + i]; + } + } else { + int warp_id = threadIdx.x / 32; + int n_warps = thread_n_blocks / 4; + + int warp_row = warp_id / n_warps; + + int cur_k = warp_row * 16; + cur_k += k_iter_size * (k % b_sh_wr_iters); + + int k_blocks = cur_k / 16; + int cur_group_id = 0; + + // Suppress bogus and persistent divide-by-zero warning + #pragma nv_diagnostic push + #pragma nv_diag_suppress divide_by_zero + cur_group_id = k_blocks / group_blocks; + #pragma nv_diagnostic pop + + int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe; + + sh_zp_stage += cur_group_id * zp_sh_stride; + + for (int i = 0; i < num_ints_per_thread; i++) { + frag_qzp[k % 2][i] = + (reinterpret_cast(sh_zp_stage))[zp_sh_rd + i]; + } + } + } + + else if constexpr (has_zp && is_zp_float) { + int pipe = full_pipe % stages; + + if constexpr (group_blocks != -1) { + if constexpr (group_blocks >= thread_k_blocks) { + int4* sh_zp_stage = + sh_zp + zp_sh_stage * ((group_blocks / thread_k_blocks) * + (pipe / (group_blocks / thread_k_blocks))); + reinterpret_cast(&frag_zpf[k % 2])[0] = sh_zp_stage[zp_sh_rd]; + } else { + int warp_id = threadIdx.x / 32; + int n_warps = thread_n_blocks / 4; + + int warp_row = warp_id / n_warps; + + int cur_k = warp_row * 
16; + cur_k += k_iter_size * (k % b_sh_wr_iters); + + int k_blocks = cur_k / 16; + // Suppress bogus and persistent divide-by-zero warning + #pragma nv_diagnostic push + #pragma nv_diag_suppress divide_by_zero + int cur_group_id = k_blocks / group_blocks; + #pragma nv_diagnostic pop + + int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe; + + reinterpret_cast(&frag_zpf[k % 2])[0] = + sh_zp_stage[zp_sh_rd + cur_group_id * zp_sh_stride]; + } + } + } + }; + + // Execute the actual tensor core matmul of a sub-tile. + bool is_first_matmul_in_slice = true; + auto matmul = [&](int k) { + int k2 = k % 2; + const bool is_new_zp = + ((group_blocks != -1) && (group_blocks < thread_k_blocks || k == 0)) || + (group_blocks == -1 && is_first_matmul_in_slice); + if constexpr (has_zp && !is_zp_float) { + if (is_new_zp) { + if constexpr (group_blocks == -1) is_first_matmul_in_slice = false; + FragB frag_zp_0; + FragB frag_zp_1; + int zp_quant_0, zp_quant_1; + + if constexpr (w_type.size_bits() == 4) { + zp_quant_0 = frag_qzp[k2][0]; + zp_quant_1 = zp_quant_0 >> 8; + } else { + static_assert(w_type.size_bits() == 8); + zp_quant_0 = frag_qzp[k2][0]; + zp_quant_1 = frag_qzp[k2][1]; + } + + dequant(zp_quant_0, frag_zp_0); + dequant(zp_quant_1, frag_zp_1); + + frag_zp[0] = frag_zp_0[0]; + frag_zp[1] = frag_zp_0[1]; + frag_zp[2] = frag_zp_1[0]; + frag_zp[3] = frag_zp_1[1]; + } + } + // We have the m dimension as the inner loop in order to encourage overlapping + // dequantization and matmul operations. 
+ #pragma unroll + for (int j = 0; j < 4; j++) { + FragB frag_b0; + FragB frag_b1; + int b_quant_0, b_quant_1; + + if constexpr (w_type.size_bits() == 4) { + b_quant_0 = frag_b_quant[k2][0][j]; + b_quant_1 = b_quant_0 >> 8; + } else { + static_assert(w_type.size_bits() == 8); + int* frag_b_quant_ptr = reinterpret_cast(frag_b_quant[k2]); + b_quant_0 = frag_b_quant_ptr[j * 2 + 0]; + b_quant_1 = frag_b_quant_ptr[j * 2 + 1]; + } + + dequant(b_quant_0, frag_b0); + dequant(b_quant_1, frag_b1); + + // Apply scale to frag_b0 + if constexpr (has_act_order) { + static_assert(group_blocks != -1); + scale4(frag_b0, act_frag_s[k2][0][j], act_frag_s[k2][1][j], + act_frag_s[k2][2][j], act_frag_s[k2][3][j], 0); + scale4(frag_b1, act_frag_s[k2][0][j], act_frag_s[k2][1][j], + act_frag_s[k][2][j], act_frag_s[k2][3][j], 1); + + } else if constexpr (has_zp && !is_zp_float && group_blocks == -1) { + int idx = (threadIdx.x / 4) % 2; + scalar_t2 s2 = Dtype::nums2num2( + reinterpret_cast(&frag_s[j / 2][j % 2 * 2 + 0])[idx], + reinterpret_cast(&frag_s[j / 2][j % 2 * 2 + 1])[idx]); + if (is_new_zp) frag_zp[j] = __hmul2(frag_zp[j], s2); + scale_and_sub(frag_b0, s2.x, frag_zp[j].x); + scale_and_sub(frag_b1, s2.y, frag_zp[j].y); + } else if constexpr (has_zp && !is_zp_float && group_blocks != -1) { + if (is_new_zp) + frag_zp[j] = __hmul2(frag_zp[j], + *reinterpret_cast(&frag_s[k2][j])); + scale_and_sub(frag_b0, frag_s[k % 2][j][0].x, frag_zp[j].x); + scale_and_sub(frag_b1, frag_s[k % 2][j][0].y, frag_zp[j].y); + } else if constexpr (has_zp && is_zp_float && group_blocks != -1) { + if (is_new_zp) + frag_zpf[k2][j] = __hmul2( + frag_zpf[k2][j], *reinterpret_cast(&frag_s[k2][j])); + scale_and_sub(frag_b0, frag_s[k2][j].x, frag_zpf[k2][j].x); + scale_and_sub(frag_b1, frag_s[k2][j].y, frag_zpf[k2][j].y); + } else if constexpr (group_blocks != -1) { + scale(frag_b0, frag_s[k2][j], 0); + scale(frag_b1, frag_s[k2][j], 1); + } + + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + if 
constexpr (m_block_size_8) { + mma_trans(frag_a[k2][i], frag_b0, frag_b1, frag_c[i][j][0]); + } else { + mma(frag_a[k2][i], frag_b0, frag_c[i][j][0]); + mma(frag_a[k2][i], frag_b1, frag_c[i][j][1]); + } + } + } + }; + + // Since we slice across the k dimension of a tile in order to increase the + // number of warps while keeping the n dimension of a tile reasonable, we have + // multiple warps that accumulate their partial sums of the same output + // location; which we have to reduce over in the end. We do in shared memory. + auto thread_block_reduce = [&]() { + constexpr int red_off = threads / b_sh_stride_threads / 2; + if (red_off >= 1) { + int red_idx = threadIdx.x / b_sh_stride_threads; + constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2; + constexpr int red_sh_delta = b_sh_stride_threads; + int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) + + (threadIdx.x % b_sh_stride_threads); + + // Parallel logarithmic shared memory reduction. We make sure to avoid any + // unnecessary read or write iterations, e.g., for two warps we write only + // once by warp 1 and read only once by warp 0. + + #pragma unroll + for (int m_block = 0; m_block < thread_m_blocks; m_block++) { + #pragma unroll + for (int i = red_off; i > 0; i /= 2) { + if (i <= red_idx && red_idx < 2 * i) { + #pragma unroll + for (int j = 0; j < 4 * 2; j += (m_block_size_8 ? 2 : 1)) { + int red_sh_wr = + red_sh_delta * j + (red_sh_rd - red_sh_stride * i); + if (i < red_off) { + float* c_rd = reinterpret_cast( + &sh_red[red_sh_delta * j + red_sh_rd]); + float* c_wr = reinterpret_cast(&sh_red[red_sh_wr]); + #pragma unroll + for (int k = 0; k < 4; k++) + reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += + c_rd[k] + c_wr[k]; + } + sh_red[red_sh_wr] = + reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; + } + } + __syncthreads(); + } + if (red_idx == 0) { + #pragma unroll + for (int i = 0; i < 4 * 2; i += (m_block_size_8 ? 
2 : 1)) { + float* c_rd = + reinterpret_cast(&sh_red[red_sh_delta * i + red_sh_rd]); + #pragma unroll + for (int j = 0; j < 4; j++) + reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += + c_rd[j]; + } + } + __syncthreads(); + } + } + }; + + // Since multiple threadblocks may process parts of the same column slice, we + // finally have to globally reduce over the results. As the striped + // partitioning minimizes the number of such reductions and our outputs are + // usually rather small, we perform this reduction serially in L2 cache. + auto global_reduce_fp16 = [&](bool first = false, bool last = false) { + // We are very careful here to reduce directly in the output buffer to + // maximize L2 cache utilization in this step. To do this, we write out + // results in FP16 (but still reduce with FP32 compute). + constexpr int active_threads = 32 * thread_n_blocks / 4; + bool is_th_active = threadIdx.x < active_threads; + if (!is_th_active) { + return; + } + + int c_gl_stride = prob_n / 8; + int c_gl_wr_delta_o = 8 * c_gl_stride; + int c_gl_wr_delta_i = 4 * (active_threads / 32); + int c_gl_wr; + if constexpr (m_block_size_8) { + c_gl_wr = c_gl_stride * ((threadIdx.x % 4) * 2) + 4 * (threadIdx.x / 32) + + (threadIdx.x % 32) / 8; + c_gl_wr += (2 * thread_n_blocks) * slice_col; + } else { + c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + + 4 * (threadIdx.x / 32) + threadIdx.x % 4; + c_gl_wr += (2 * thread_n_blocks) * slice_col; + } + constexpr int c_sh_wr_delta = active_threads; + int c_sh_wr = threadIdx.x; + + if (!first) { + + #pragma unroll + for (int i = 0; i < (m_block_size_8 ? 
2 : thread_m_blocks * 4); i++) { + int c_idx; + if constexpr (m_block_size_8) + c_idx = c_gl_wr + i * c_gl_stride + + (threadIdx.x % 8) / 4 * c_gl_wr_delta_i; + else + c_idx = + c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2); + if (c_idx / c_gl_stride < block_num_valid_tokens) { + int64_t sorted_row = sh_block_sorted_ids[c_idx / c_gl_stride]; + int64_t true_idx = sorted_row * c_gl_stride + c_idx % c_gl_stride; + sh_red[c_sh_wr + c_sh_wr_delta * i] = C[true_idx]; + } + } + } + + #pragma unroll + for (int i = 0; i < (m_block_size_8 ? 2 : thread_m_blocks * 4); i++) { + if (!first) { + int4 c_red = sh_red[c_sh_wr + i * c_sh_wr_delta]; + #pragma unroll + for (int j = 0; j < 2 * 4; j++) { + int delta = 0; + if constexpr (m_block_size_8) { + delta = j % 2 == 1 ? -2 : 0; + } + reinterpret_cast( + &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4) + delta] += + Dtype::num2float(reinterpret_cast(&c_red)[j]); + } + } + if (!last) { + int4 c; + #pragma unroll + for (int j = 0; j < 2 * 4; j++) { + int delta = 0; + if constexpr (m_block_size_8) { + delta = j % 2 == 1 ? -2 : 0; + } + reinterpret_cast(&c)[j] = + Dtype::float2num(reinterpret_cast( + &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4) + delta]); + } + + int c_idx; + if constexpr (m_block_size_8) + c_idx = c_gl_wr + i * c_gl_stride + + (threadIdx.x % 8) / 4 * c_gl_wr_delta_i; + else + c_idx = + c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2); + if (c_idx / c_gl_stride < block_num_valid_tokens) { + int64_t sorted_row = sh_block_sorted_ids[c_idx / c_gl_stride]; + int64_t true_idx = sorted_row * c_gl_stride + c_idx % c_gl_stride; + C[true_idx] = c; + } + } + } + }; + + // Globally reduce over threadblocks that compute the same column block. + // We use a tmp C buffer to reduce in full fp32 precision. 
+ auto global_reduce_fp32 = [&](bool first = false, bool last = false) { + constexpr int tb_m = thread_m_blocks * 16; + constexpr int tb_n = thread_n_blocks * 16; + + constexpr int c_size = tb_m * tb_n * sizeof(float) / 16; + + constexpr int active_threads = 32 * thread_n_blocks / 4; + bool is_th_active = threadIdx.x < active_threads; + + constexpr int num_floats = thread_m_blocks * 4 * 2 * 4; + constexpr int th_size = num_floats * sizeof(float) / 16; + + int c_cur_offset = locks_off * c_size; + + if (!is_th_active) { + return; + } + + if (!first) { + float* frag_c_ptr = reinterpret_cast(&frag_c); + #pragma unroll + for (int k = 0; k < th_size; k++) { + if constexpr (m_block_size_8) { + if (k % 2) continue; + } else { + if (k / 8 * 16 + (threadIdx.x % 32) / 4 >= block_num_valid_tokens) + continue; + } + + sh_red[threadIdx.x] = + C_tmp[c_cur_offset + active_threads * k + threadIdx.x]; + + float* sh_c_ptr = reinterpret_cast(&sh_red[threadIdx.x]); + #pragma unroll + for (int f = 0; f < 4; f++) { + frag_c_ptr[k * 4 + f] += sh_c_ptr[f]; + } + } + } + + if (!last) { + int4* frag_c_ptr = reinterpret_cast(&frag_c); + #pragma unroll + for (int k = 0; k < th_size; k++) { + if constexpr (m_block_size_8) { + if (k % 2) continue; + } else { + if (k / 8 * 16 + (threadIdx.x % 32) / 4 >= block_num_valid_tokens) + continue; + } + + C_tmp[c_cur_offset + active_threads * k + threadIdx.x] = frag_c_ptr[k]; + } + } + }; + + // Write out the reduce final result in the correct layout. We only actually + // reshuffle matrix fragments in this step, the reduction above is performed + // in fragment layout. 
+ auto write_result = [&]() { + int c_gl_stride = prob_n / 8; + constexpr int c_sh_stride = 2 * thread_n_blocks + 1; + int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks)); + constexpr int c_sh_rd_delta = + c_sh_stride * (threads / (2 * thread_n_blocks)); + + int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + + (threadIdx.x % (2 * thread_n_blocks)); + c_gl_wr += (2 * thread_n_blocks) * slice_col; + int c_sh_wr; + if constexpr (m_block_size_8) { + c_sh_wr = (8 * c_sh_stride) * ((threadIdx.x % 32) % 4 * 2) + + (threadIdx.x % 32) / 4; + c_sh_wr += 64 * (threadIdx.x / 32); + } else { + c_sh_wr = + (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4; + c_sh_wr += 32 * (threadIdx.x / 32); + } + + int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + + (threadIdx.x % (2 * thread_n_blocks)); + + // We first reorder in shared memory to guarantee the most efficient final + // global write patterns + auto write = [&](int idx, float c0, float c1, FragS& s) { + scalar_t2 res = + Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1)); + + // For per-column quantization we finally apply the scale here (only for + // 4-bit) + if constexpr (!has_act_order && group_blocks == -1 && + w_type.size_bits() == 4 && !has_zp) { + res = __hmul2(res, s[0]); + } + + if constexpr (m_block_size_8) { + ((scalar_t*)sh_red)[idx] = res.x; + ((scalar_t*)sh_red)[idx + 8 * c_sh_stride] = res.y; + } else { + ((scalar_t2*)sh_red)[idx] = res; + } + }; + + if (threadIdx.x / 32 < thread_n_blocks / 4) { + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + if constexpr (m_block_size_8) { + int wr = c_sh_wr + 16 * j; + write(wr, frag_c[i][j][0][0], frag_c[i][j][0][1], + frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + 8, frag_c[i][j][0][2], frag_c[i][j][0][3], + frag_s[j / 2][2 * (j % 2) + 1]); + } else { + int wr = c_sh_wr + 8 * j; + write(wr + (4 * c_sh_stride) * 0 + 0, 
frag_c[i][j][0][0], + frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2], + frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0], + frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]); + write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2], + frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]); + } + } + c_sh_wr += 16 * (4 * c_sh_stride); + } + } + __syncthreads(); + + #pragma unroll + for (int i = 0; + i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); + i++) { + int row = c_gl_wr / c_gl_stride; + if (row < block_num_valid_tokens) { + int64_t sorted_row = sh_block_sorted_ids[row]; + int64_t true_idx = sorted_row * c_gl_stride + c_gl_wr % c_gl_stride; + scalar_t2 topk_weight_score; + if (mul_topk_weights) topk_weight_score = sh_block_topk_weights[row]; + if (use_atomic_add && slice_count > 1 || mul_topk_weights) { + scalar_t2* C_half2 = reinterpret_cast(&C[true_idx]); + scalar_t2* sh_red_half2 = + reinterpret_cast(&sh_red[c_sh_rd]); + #pragma unroll + for (int a = 0; a < 4; a++) { + scalar_t2 res = sh_red_half2[a]; + if (mul_topk_weights) { + res = __hmul2(res, topk_weight_score); + } + + if (use_atomic_add && slice_count > 1) { + atomicAdd(&C_half2[a], res); + } else { + C_half2[a] = res; + }; + } + } else { + C[true_idx] = sh_red[c_sh_rd]; + } + c_gl_wr += c_gl_wr_delta; + c_sh_rd += c_sh_rd_delta; + } + } + __syncthreads(); + }; + + // Start global fetch and register load pipelines. 
+ auto start_pipes = [&]() { + + #pragma unroll + for (int i = 0; i < stages - 1; i++) { + if (has_act_order && i == 0) { + int last_g_idx = slice_k_start + stages * tb_k * 2; + if (last_g_idx >= prob_k) { + last_g_idx = prob_k - 1; + } + fetch_act_order_scales_to_shared(true, g_idx[slice_k_start], + g_idx[last_g_idx]); + } + + if constexpr (has_zp && !is_zp_float && group_blocks == -1) { + if (i == 0) { + fetch_col_zp_to_shared(); + fetch_col_scale_to_shared(); + } + } + fetch_to_shared(i, i, i < slice_iters); + } + + zero_accums(); + wait_for_stage(); + init_same_group(0); + fetch_to_registers(0, 0); + fetch_scales_to_registers(0, 0); + fetch_zp_to_registers(0, 0); + a_gl_rd += a_gl_rd_delta_o * (stages - 1); + slice_k_start_shared_fetch += tb_k * (stages - 1); + }; + if (slice_iters) { + start_pipes(); + } + + // Main loop. + while (slice_iters) { + // We unroll over both the global fetch and the register load pipeline to + // ensure all shared memory accesses are static. Note that both pipelines + // have even length meaning that the next iteration will always start at + // index 0. 
+ + #pragma unroll + for (int pipe = 0; pipe < stages;) { + #pragma unroll + for (int k = 0; k < b_sh_wr_iters; k++) { + fetch_to_registers(k + 1, pipe % stages); + fetch_scales_to_registers(k + 1, pipe); + fetch_zp_to_registers(k + 1, pipe); + if (k == b_sh_wr_iters - 2) { + fetch_to_shared((pipe + stages - 1) % stages, pipe, + slice_iters >= stages); + pipe++; + wait_for_stage(); + init_same_group(pipe % stages); + } + matmul(k); + } + slice_iters--; + if (slice_iters == 0) { + break; + } + } + a_remaining_load_count_in_slice = 0; + + a_gl_rd += a_gl_rd_delta_o * stages; + slice_k_start += tb_k * stages; + slice_k_start_shared_fetch += tb_k * stages; + + if constexpr (has_act_order) { + int first_group_id = g_idx[slice_k_start]; + int last_g_idx = slice_k_start + stages * tb_k * 2; + if (last_g_idx >= prob_k) { + last_g_idx = prob_k - 1; + } + int last_group_id = g_idx[last_g_idx]; + if (last_group_id >= sh_first_group_id + sh_num_groups) { + fetch_act_order_scales_to_shared(false, first_group_id, last_group_id); + __syncthreads(); + } + } + + // Process results and, if necessary, proceed to the next column slice. + // While this pattern may not be the most readable, other ways of writing + // the loop seemed to noticeably worse performance after compilation. 
+ if (slice_iters == 0) { + cp_async_wait<0>(); + bool last = slice_idx == slice_count - 1; + // For per-column scales, we only fetch them here in the final step before + // write-out + if constexpr (!has_act_order && group_blocks == -1 && !has_zp) { + if (w_type.size_bits() == 8 || (last || use_atomic_add)) { + if (s_sh_wr_pred) { + cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]); + } + cp_async_fence(); + } + } + + thread_block_reduce(); + if constexpr (!has_act_order && group_blocks == -1 && !has_zp) { + if (w_type.size_bits() == 8 || (last || use_atomic_add)) { + cp_async_wait<0>(); + __syncthreads(); + if (threadIdx.x / 32 < thread_n_blocks / 4) { + reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; + reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; + if constexpr (m_block_size_8) { + int idx = (threadIdx.x / 4) % 2; + scalar_t2* frag_s_half2 = reinterpret_cast(frag_s); + #pragma unroll + for (int i = 0; i < 8; i++) { + frag_s_half2[i] = Dtype::num2num2( + reinterpret_cast(&frag_s_half2[i])[idx]); + } + } + } + } + } + + // For 8-bit channelwise, we apply the scale before the global reduction + // that converts the fp32 results to fp16 (so that we avoid possible + // overflow in fp16) + if constexpr (!has_act_order && group_blocks == -1 && + w_type.size_bits() == 8 && !has_zp) { + if (threadIdx.x / 32 < thread_n_blocks / 4) { + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + scale_float( + reinterpret_cast(&frag_c[i][j][0][0]), + frag_s[j / 2][2 * (j % 2) + 0]); + scale_float( + reinterpret_cast(&frag_c[i][j][0][2]), + frag_s[j / 2][2 * (j % 2) + (m_block_size_8 ? 
1 : 0)]); + + if constexpr (!m_block_size_8) { + scale_float( + reinterpret_cast(&frag_c[i][j][1][0]), + frag_s[j / 2][2 * (j % 2) + 1]); + scale_float( + reinterpret_cast(&frag_c[i][j][1][2]), + frag_s[j / 2][2 * (j % 2) + 1]); + } + } + } + } + } + + if (slice_count > 1 && !use_atomic_add) { + // only globally reduce if there is more than one block in a slice + barrier_acquire(&locks[locks_off], slice_idx); + if (use_fp32_reduce) { + global_reduce_fp32(slice_idx == 0, last); + } else { + global_reduce_fp16(slice_idx == 0, last); + } + barrier_release(&locks[locks_off], last); + } + if (use_atomic_add && slice_count > 1 && slice_idx != 0) + wait_negative_and_add(&locks[locks_off]); + if (last || use_atomic_add) + // only the last block in a slice actually writes the result + write_result(); + if (slice_row) a_remaining_load_count_in_slice = stages; + slice_row = 0; + slice_col_par++; + slice_col++; + is_first_matmul_in_slice = true; + init_slice(); + if (slice_iters) { + a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; + if (slice_col == 0) { + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; + } + + // Update slice k/n for scales loading + if constexpr (has_act_order) { + slice_k_start = tb_k * slice_row; + slice_k_finish = slice_k_start + tb_k * slice_iters; + slice_k_start_shared_fetch = slice_k_start; + slice_n_offset = act_s_col_tb_stride * slice_col; + + } else { + s_gl_rd = s_sh_stride * slice_col + threadIdx.x; + zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x; + } + + start_pipes(); + } + } + } +} + +} // namespace MARLIN_NAMESPACE_NAME + +#endif diff --git a/csrc/moe/marlin_moe_wna16/ops.cu b/csrc/moe/marlin_moe_wna16/ops.cu new file mode 100644 index 0000000000000..a16e955a325e2 --- /dev/null +++ b/csrc/moe/marlin_moe_wna16/ops.cu @@ -0,0 +1,927 @@ +/* + * 
Modified by Neural Magic + * Copyright (C) Marlin.2024 Elias Frantar + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Adapted from https://github.com/IST-DASLab/marlin + */ + +#ifndef MARLIN_NAMESPACE_NAME + #define MARLIN_NAMESPACE_NAME marlin_moe_wna16 +#endif + +#include "kernel.h" +#include "core/registration.h" + +#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \ + static_assert(std::is_same::value || \ + std::is_same::value, \ + "only float16 and bfloat16 is supported"); + +namespace MARLIN_NAMESPACE_NAME { + +__global__ void MarlinDefault(MARLIN_KERNEL_PARAMS){}; + +using MarlinFuncPtr = void (*)(MARLIN_KERNEL_PARAMS); + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + +template +__global__ void permute_cols_kernel( + int4 const* __restrict__ a_int4_ptr, int const* __restrict__ perm_int_ptr, + int4* __restrict__ out_int4_ptr, + const int32_t* __restrict__ sorted_token_ids_ptr, + const int32_t* __restrict__ expert_ids_ptr, + const int32_t* __restrict__ num_tokens_past_padded_ptr, int size_m, + int size_k, int top_k) {}; + +} // namespace marlin + +torch::Tensor moe_wna16_marlin_gemm( + torch::Tensor& a, std::optional const& c_or_none, + torch::Tensor& b_q_weight, torch::Tensor& b_scales, + std::optional const& b_zeros_or_none, + std::optional const& g_idx_or_none, + std::optional const& perm_or_none, torch::Tensor& workspace, + torch::Tensor& sorted_token_ids, torch::Tensor& expert_ids, + torch::Tensor& num_tokens_past_padded, 
torch::Tensor& topk_weights, + int64_t moe_block_size, int64_t top_k, bool mul_topk_weights, bool is_ep, + vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n, + int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, + bool is_zp_float) { + TORCH_CHECK_NOT_IMPLEMENTED(false, + "marlin_gemm(..) requires CUDA_ARCH >= 8.0"); + return torch::empty({1, 1}); +} + +#else + +// For a given "a" of size [M,K] performs a permutation of the K columns based +// on the given "perm" indices. +template +__global__ void permute_cols_kernel( + int4 const* __restrict__ a_int4_ptr, int const* __restrict__ perm_int_ptr, + int4* __restrict__ out_int4_ptr, + const int32_t* __restrict__ sorted_token_ids_ptr, + const int32_t* __restrict__ expert_ids_ptr, + const int32_t* __restrict__ num_tokens_past_padded_ptr, int size_m, + int size_k, int top_k) { + int num_tokens_past_padded = num_tokens_past_padded_ptr[0]; + int num_moe_blocks = div_ceil(num_tokens_past_padded, moe_block_size); + int32_t block_sorted_ids[moe_block_size]; + int block_num_valid_tokens = 0; + int64_t old_expert_id = 0; + int64_t expert_id = 0; + int row_stride = size_k * sizeof(half) / 16; + + auto read_moe_block_data = [&](int block_id) { + block_num_valid_tokens = moe_block_size; + int4* tmp_block_sorted_ids = reinterpret_cast(block_sorted_ids); + for (int i = 0; i < moe_block_size / 4; i++) { + tmp_block_sorted_ids[i] = + ((int4*)sorted_token_ids_ptr)[block_id * moe_block_size / 4 + i]; + } + for (int i = 0; i < moe_block_size; i++) { + if (block_sorted_ids[i] >= size_m * top_k) { + block_num_valid_tokens = i; + break; + }; + } + }; + + auto permute_row = [&](int row) { + int iters = size_k / default_threads; + int rest = size_k % default_threads; + + int in_offset = (row / top_k) * row_stride; + int out_offset = row * row_stride; + + half const* a_row_half = + reinterpret_cast(a_int4_ptr + in_offset); + half* out_half = reinterpret_cast(out_int4_ptr + out_offset); + + int base_k = 
0; + + for (int i = 0; i < iters; i++) { + int cur_k = base_k + threadIdx.x; + int src_pos = perm_int_ptr[cur_k]; + + out_half[cur_k] = a_row_half[src_pos]; + + base_k += default_threads; + } + + if (rest) { + if (threadIdx.x < rest) { + int cur_k = base_k + threadIdx.x; + int src_pos = perm_int_ptr[cur_k]; + + out_half[cur_k] = a_row_half[src_pos]; + } + } + }; + + for (int index = blockIdx.x; index < num_moe_blocks; index += gridDim.x) { + old_expert_id = expert_id; + int tmp_expert_id = expert_ids_ptr[index]; + if (tmp_expert_id == -1) continue; + expert_id = tmp_expert_id; + perm_int_ptr += (expert_id - old_expert_id) * size_k; + read_moe_block_data(index); + + for (int i = 0; i < block_num_valid_tokens; i++) + permute_row(block_sorted_ids[i]); + } +} + +typedef struct { + int thread_k; + int thread_n; + int num_threads; +} thread_config_t; + +thread_config_t small_batch_thread_configs[] = { + // Ordered by priority + + // thread_k, thread_n, num_threads + {128, 128, 256}, + {64, 128, 128}}; + +thread_config_t large_batch_thread_configs[] = { + // Ordered by priority + + // thread_k, thread_n, num_threads + {64, 256, 256}, + {64, 128, 128}}; + +typedef struct { + int blocks_per_sm; + thread_config_t tb_cfg; +} exec_config_t; + +int get_scales_cache_size(thread_config_t const& th_config, int prob_m, + int prob_n, int prob_k, int num_bits, int group_size, + bool has_act_order, bool is_k_full) { + bool cache_scales_chunk = has_act_order && !is_k_full; + + int tb_n = th_config.thread_n; + int tb_k = th_config.thread_k; + + // Get max scale groups per thread-block + int tb_groups; + if (group_size == -1) { + tb_groups = 1; + } else if (group_size == 0) { + tb_groups = div_ceil(tb_k, 32); // Worst case is 32 group size + } else { + tb_groups = div_ceil(tb_k, group_size); + } + + if (cache_scales_chunk) { + int load_groups = + tb_groups * pipe_stages * 2; // Chunk size is 2x pipeline over dim K + load_groups = max(load_groups, 32); // We load at least 32 scale groups 
+ return load_groups * tb_n * 2; + + } else { + int tb_scales = tb_groups * tb_n * 2; + + return tb_scales * pipe_stages; + } +} + +int get_kernel_cache_size(thread_config_t const& th_config, int thread_m_blocks, + int prob_m, int prob_n, int prob_k, int num_bits, + int group_size, bool has_act_order, bool is_k_full, + int has_zp, int is_zp_float) { + int pack_factor = 32 / num_bits; + + // Get B size + int tb_k = th_config.thread_k; + int tb_n = th_config.thread_n; + int tb_m = thread_m_blocks * 16; + + // shm size for block_sorted_ids/block_topk_weights + // both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32) + int sh_block_meta_size = tb_m * 4 * 2; + int sh_a_size = pipe_stages * (tb_m * tb_k) * 2; + int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4; + int sh_s_size = + get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits, + group_size, has_act_order, is_k_full); + int sh_g_idx_size = has_act_order && !is_k_full ? pipe_stages * tb_k / 4 : 0; + int sh_zp_size = 0; + if (has_zp) { + if (is_zp_float) + sh_zp_size = sh_s_size; + else if (num_bits == 4) + sh_zp_size = sh_s_size / 4; + else if (num_bits == 8) + sh_zp_size = sh_s_size / 2; + } + + int total_size = sh_a_size + sh_b_size + sh_s_size + sh_zp_size + + sh_g_idx_size + sh_block_meta_size; + + return total_size; +} + +bool is_valid_config(thread_config_t const& th_config, int thread_m_blocks, + int prob_m, int prob_n, int prob_k, int num_bits, + int group_size, bool has_act_order, bool is_k_full, + int has_zp, int is_zp_float, int max_shared_mem) { + // Sanity + if (th_config.thread_k == -1 || th_config.thread_n == -1 || + th_config.num_threads == -1) { + return false; + } + + // Verify K/N are divisible by thread K/N + if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) { + return false; + } + + // Verify min for thread K/N + if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) { + return false; + } + + // num_threads 
must be at least 128 (= 4 warps) + if (th_config.num_threads < 128) { + return false; + } + + // Check that pipeline fits into cache + int cache_size = get_kernel_cache_size( + th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size, + has_act_order, is_k_full, has_zp, is_zp_float); + return cache_size <= max_shared_mem; +} + + #define __GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ + M_BLOCK_SIZE_8, HAS_ACT_ORDER, HAS_ZP, GROUP_BLOCKS, \ + NUM_THREADS, IS_ZP_FLOAT) \ + else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS && \ + thread_n_blocks == THREAD_N_BLOCKS && \ + thread_k_blocks == THREAD_K_BLOCKS && \ + m_block_size_8 == M_BLOCK_SIZE_8 && \ + has_act_order == HAS_ACT_ORDER && has_zp == HAS_ZP && \ + group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS && \ + is_zp_float == IS_ZP_FLOAT) { \ + kernel = Marlin; \ + } + + #define GPTQ_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, true, false, 0, NUM_THREADS, \ + false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, false, 0, \ + NUM_THREADS, false) \ + \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, false, -1, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, false, 2, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, false, 4, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, false, 8, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, false, -1, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, false, 2, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, false, 4, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, false, 8, \ + NUM_THREADS, false) + + #define GPTQ_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 
true, false, 0, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, false, 0, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, false, 0, \ + NUM_THREADS, false) \ + \ + __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, false, -1, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, false, 2, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, false, 4, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, false, 8, \ + NUM_THREADS, false) \ + \ + __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, false, -1, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, false, 2, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, false, 4, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, false, 8, \ + NUM_THREADS, false) \ + \ + __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, false, -1, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, false, 2, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, false, 4, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, false, 8, \ + NUM_THREADS, false) + + #define AWQ_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, -1, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, 2, NUM_THREADS, \ + false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, 4, NUM_THREADS, \ + false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, 8, NUM_THREADS, \ + false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, -1, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, 2, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, 
false, false, true, 4, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, 8, \ + NUM_THREADS, false) + + #define AWQ_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, -1, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, 2, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, 4, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, 8, \ + NUM_THREADS, false) \ + \ + __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, -1, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, 2, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, 4, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, 8, \ + NUM_THREADS, false) \ + \ + __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, -1, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, 2, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, 4, \ + NUM_THREADS, false) \ + __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, 8, \ + NUM_THREADS, false) + + // We currently have 4-bit models only with group_blocks == 4 + #define HQQ_GET_IF(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, 4, NUM_THREADS, \ + true) \ + __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, 4, \ + NUM_THREADS, true) \ + __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, 4, \ + NUM_THREADS, true) \ + __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, 4, \ + NUM_THREADS, true) \ + __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, 4, \ + NUM_THREADS, true) + +template +MarlinFuncPtr get_marlin_kernel(const vllm::ScalarType q_type, + int thread_m_blocks, int 
thread_n_blocks, + int thread_k_blocks, bool m_block_size_8, + bool has_act_order, bool has_zp, + int group_blocks, int num_threads, + bool is_zp_float) { + int num_bits = q_type.size_bits(); + auto kernel = MarlinDefault; + if (false) { + } + GPTQ_GET_IF_M1(vllm::kU4B8, 8, 8, 256) + GPTQ_GET_IF_M1(vllm::kU4B8, 8, 4, 128) + + GPTQ_GET_IF_M234(vllm::kU4B8, 16, 4, 256) + GPTQ_GET_IF_M234(vllm::kU4B8, 8, 4, 128) + + GPTQ_GET_IF_M1(vllm::kU8B128, 8, 8, 256) + GPTQ_GET_IF_M1(vllm::kU8B128, 8, 4, 128) + + GPTQ_GET_IF_M234(vllm::kU8B128, 16, 4, 256) + GPTQ_GET_IF_M234(vllm::kU8B128, 8, 4, 128) + + AWQ_GET_IF_M1(vllm::kU4, 8, 8, 256) + AWQ_GET_IF_M1(vllm::kU4, 8, 4, 128) + + AWQ_GET_IF_M234(vllm::kU4, 16, 4, 256) + AWQ_GET_IF_M234(vllm::kU4, 8, 4, 128) + + return kernel; +} + +template +exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m, + int prob_n, int prob_k, int thread_m_blocks, + bool m_block_size_8, int num_bits, + int group_size, bool has_act_order, + bool is_k_full, bool has_zp, + bool is_zp_float, int max_shared_mem) { + exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}}; + thread_config_t* thread_configs = thread_m_blocks > 1 + ? large_batch_thread_configs + : small_batch_thread_configs; + int thread_configs_size = + thread_m_blocks > 1 + ? 
sizeof(large_batch_thread_configs) / sizeof(thread_config_t) + : sizeof(small_batch_thread_configs) / sizeof(thread_config_t); + + int count = 0; + constexpr int device_max_reg_size = 255 * 1024; + for (int i = 0; i < thread_configs_size; i++) { + thread_config_t th_config = thread_configs[i]; + + if (!is_valid_config(th_config, thread_m_blocks, prob_m, prob_n, prob_k, + num_bits, group_size, has_act_order, is_k_full, has_zp, + is_zp_float, max_shared_mem)) { + continue; + } + + int cache_size = get_kernel_cache_size( + th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, + group_size, has_act_order, is_k_full, has_zp, is_zp_float); + + int group_blocks = 0; + if (!has_act_order) { + group_blocks = group_size == -1 ? -1 : group_size / 16; + } + + auto kernel = get_marlin_kernel( + q_type, thread_m_blocks, th_config.thread_n / 16, + th_config.thread_k / 16, m_block_size_8, has_act_order, has_zp, + group_blocks, th_config.num_threads, is_zp_float); + + if (kernel == MarlinDefault) continue; + + if (thread_m_blocks > 1) { + exec_cfg = {1, th_config}; + break; + } else { + cudaFuncAttributes attr; + cudaFuncGetAttributes(&attr, kernel); + int reg_size = max(attr.numRegs, 1) * th_config.num_threads * 4; + int allow_count = min(device_max_reg_size / reg_size, + max_shared_mem / (cache_size + 1024)); + allow_count = max(min(allow_count, 4), 1); + if (allow_count > count) { + count = allow_count; + exec_cfg = {count, th_config}; + }; + } + } + + return exec_cfg; +} + +template +void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, + void* zp, void* g_idx, void* perm, void* a_tmp, + void* sorted_token_ids, void* expert_ids, + void* num_tokens_past_padded, void* topk_weights, + int moe_block_size, int top_k, bool mul_topk_weights, bool is_ep, + int prob_m, int prob_n, int prob_k, void* workspace, + vllm::ScalarType const& q_type, bool has_act_order, + bool is_k_full, bool has_zp, int num_groups, int group_size, + int dev, cudaStream_t stream, 
int thread_k, int thread_n, + int sms, bool use_atomic_add, bool use_fp32_reduce, + bool is_zp_float) { + int thread_m_blocks = div_ceil(moe_block_size, 16); + bool m_block_size_8 = moe_block_size == 8; + + if (has_zp) { + TORCH_CHECK( + q_type == vllm::kU4 || q_type == vllm::kU8, + "q_type must be u4 or u8 when has_zp = True. Got = ", q_type.str()); + } else { + TORCH_CHECK( + q_type == vllm::kU4B8 || q_type == vllm::kU8B128, + "q_type must be uint4b8 or uint8b128 when has_zp = False. Got = ", + q_type.str()); + } + + TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, + ", ", prob_n, ", ", prob_k, "]"); + + int group_blocks = 0; + if (has_act_order) { + if (is_k_full) { + TORCH_CHECK(group_size != -1); + group_blocks = group_size / 16; + TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, + " is not divisible by group_blocks = ", group_blocks); + } else { + TORCH_CHECK(group_size == 0); + group_blocks = 0; + } + } else { + if (group_size == -1) { + group_blocks = -1; + } else { + group_blocks = group_size / 16; + TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, + " is not divisible by group_blocks = ", group_blocks); + } + } + + int num_bits = q_type.size_bits(); + const int4* A_ptr = (const int4*)A; + const int4* B_ptr = (const int4*)B; + int4* C_ptr = (int4*)C; + int4* C_tmp_ptr = (int4*)C_tmp; + const int4* s_ptr = (const int4*)s; + const int4* zp_ptr = (const int4*)zp; + const int* g_idx_ptr = (const int*)g_idx; + const int* perm_ptr = (const int*)perm; + int4* a_tmp_ptr = (int4*)a_tmp; + const int32_t* sorted_token_ids_ptr = (const int32_t*)sorted_token_ids; + const int32_t* expert_ids_ptr = (const int32_t*)expert_ids; + const int32_t* num_tokens_past_padded_ptr = + (const int32_t*)num_tokens_past_padded; + const float* topk_weights_ptr = (const float*)topk_weights; + int* locks = (int*)workspace; + + if (has_act_order) { + // Permute A columns + auto kernel = permute_cols_kernel<8>; + if (moe_block_size == 8) 
{ + } else if (moe_block_size == 16) + kernel = permute_cols_kernel<16>; + else if (moe_block_size == 32) + kernel = permute_cols_kernel<32>; + else if (moe_block_size == 48) + kernel = permute_cols_kernel<48>; + else if (moe_block_size == 64) + kernel = permute_cols_kernel<64>; + else + TORCH_CHECK(false, "unsupported moe_block_size ", moe_block_size); + + // avoid ">>>" being formatted to "> > >" + // clang-format off + kernel<<>>( + A_ptr, perm_ptr, a_tmp_ptr, sorted_token_ids_ptr, expert_ids_ptr, + num_tokens_past_padded_ptr, prob_m, prob_k, top_k); + // clang-format on + A_ptr = a_tmp_ptr; + prob_m = prob_m * top_k; + top_k = 1; + + // If we have a full K, then we can run the non-act-order version of Marlin + // (since the weight rows are reordered by increasing group ids, and by + // having a full K, we have full original groups) + if (is_k_full) has_act_order = false; + } + + int max_shared_mem = 0; + cudaDeviceGetAttribute(&max_shared_mem, + cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); + TORCH_CHECK(max_shared_mem > 0); + + // Set thread config + exec_config_t exec_cfg; + thread_config_t thread_tfg; + if (thread_k != -1 && thread_n != -1) { + thread_tfg = thread_config_t{thread_k, thread_n, default_threads}; + exec_cfg = exec_config_t{1, thread_tfg}; + TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, + " is not divisible by thread_n = ", thread_n); + TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, + " is not divisible by thread_k = ", thread_k); + } else { + // Auto config + exec_cfg = determine_exec_config( + q_type, prob_m, prob_n, prob_k, thread_m_blocks, m_block_size_8, + num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float, + max_shared_mem); + thread_tfg = exec_cfg.tb_cfg; + } + + int num_threads = thread_tfg.num_threads; + thread_k = thread_tfg.thread_k; + thread_n = thread_tfg.thread_n; + int blocks = sms * exec_cfg.blocks_per_sm; + if (exec_cfg.blocks_per_sm > 1) + max_shared_mem = max_shared_mem / 
exec_cfg.blocks_per_sm - 1024; + + int thread_k_blocks = thread_k / 16; + int thread_n_blocks = thread_n / 16; + + TORCH_CHECK(is_valid_config(thread_tfg, thread_m_blocks, prob_m, prob_n, + prob_k, num_bits, group_size, has_act_order, + is_k_full, has_zp, is_zp_float, max_shared_mem), + "Invalid thread config: thread_m_blocks = ", thread_m_blocks, + ", thread_k = ", thread_tfg.thread_k, + ", thread_n = ", thread_tfg.thread_n, + ", num_threads = ", thread_tfg.num_threads, " for MKN = [", + prob_m, ", ", prob_k, ", ", prob_n, "] and num_bits = ", num_bits, + ", group_size = ", group_size, + ", has_act_order = ", has_act_order, ", is_k_full = ", is_k_full, + ", has_zp = ", has_zp, ", is_zp_float = ", is_zp_float, + ", max_shared_mem = ", max_shared_mem); + + auto kernel = get_marlin_kernel( + q_type, thread_m_blocks, thread_n_blocks, thread_k_blocks, m_block_size_8, + has_act_order, has_zp, group_blocks, num_threads, is_zp_float); + + if (kernel == MarlinDefault) { + TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n, + ", ", prob_k, "]", ", has_act_order = ", has_act_order, + ", num_groups = ", num_groups, ", group_size = ", group_size, + ", thread_m_blocks = ", thread_m_blocks, + ", thread_n_blocks = ", thread_n_blocks, + ", thread_k_blocks = ", thread_k_blocks, + ", num_bits = ", num_bits); + } + + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem); + // avoid ">>>" being formatted to "> > >" + // clang-format off + kernel<<>>( + A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr, + sorted_token_ids_ptr, expert_ids_ptr, num_tokens_past_padded_ptr, + topk_weights_ptr, top_k, mul_topk_weights, is_ep, num_groups, prob_m, + prob_n, prob_k, locks, use_atomic_add, use_fp32_reduce); + // clang-format on +} + +} // namespace MARLIN_NAMESPACE_NAME + +torch::Tensor moe_wna16_marlin_gemm( + torch::Tensor& a, std::optional const& c_or_none, + torch::Tensor& b_q_weight, torch::Tensor& b_scales, + std::optional 
const& b_zeros_or_none, + std::optional const& g_idx_or_none, + std::optional const& perm_or_none, torch::Tensor& workspace, + torch::Tensor& sorted_token_ids, torch::Tensor& expert_ids, + torch::Tensor& num_tokens_past_padded, torch::Tensor& topk_weights, + int64_t moe_block_size, int64_t top_k, bool mul_topk_weights, bool is_ep, + vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n, + int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, + bool is_zp_float) { + vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id); + int pack_factor = 32 / b_q_type.size_bits(); + + if (moe_block_size != 8) { + TORCH_CHECK(moe_block_size % 16 == 0, + "unsupported moe_block_size=", moe_block_size); + TORCH_CHECK(moe_block_size >= 16 && moe_block_size <= 64, + "unsupported moe_block_size=", moe_block_size); + } + + // Verify A + TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0), + ", size_m = ", size_m); + TORCH_CHECK(a.size(1) == size_k, "Shape mismatch: a.size(1) = ", a.size(1), + ", size_k = ", size_k); + + // Verify B + TORCH_CHECK( + size_k % MARLIN_NAMESPACE_NAME::tile_size == 0, "size_k = ", size_k, + " is not divisible by tile_size = ", MARLIN_NAMESPACE_NAME::tile_size); + TORCH_CHECK((size_k / MARLIN_NAMESPACE_NAME::tile_size) == b_q_weight.size(1), + "Shape mismatch: b_q_weight.size(1) = ", b_q_weight.size(1), + ", size_k = ", size_k, + ", tile_size = ", MARLIN_NAMESPACE_NAME::tile_size); + TORCH_CHECK( + b_q_weight.size(2) % MARLIN_NAMESPACE_NAME::tile_size == 0, + "b_q_weight.size(2) = ", b_q_weight.size(2), + " is not divisible by tile_size = ", MARLIN_NAMESPACE_NAME::tile_size); + int actual_size_n = + (b_q_weight.size(2) / MARLIN_NAMESPACE_NAME::tile_size) * pack_factor; + TORCH_CHECK(size_n == actual_size_n, "size_n = ", size_n, + ", actual_size_n = ", actual_size_n); + + // Verify device and strides + TORCH_CHECK(a.device().is_cuda(), "A is not on GPU"); + 
TORCH_CHECK(a.is_contiguous(), "A is not contiguous"); + + TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU"); + TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous"); + + TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU"); + TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous"); + + // thread_k: `k` size of a thread_tile in `weights` (can usually be left as + // auto -1) + int thread_k = -1; + // thread_n: `n` size of a thread_tile in `weights` (can usually be left as + // auto -1) + int thread_n = -1; + // sms: number of SMs to use for the kernel + int sms = -1; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device()); + + // Alloc buffers + const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); + auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device()); + torch::Tensor c; + if (c_or_none.has_value()) { + c = c_or_none.value(); + TORCH_CHECK(c.device().is_cuda(), "c is not on GPU"); + TORCH_CHECK(c.is_contiguous(), "c is not contiguous"); + TORCH_CHECK(c.size(0) == size_m * top_k, + "Shape mismatch: c.size(0) = ", c.size(0), + ", size_m * topk = ", size_m * top_k); + TORCH_CHECK(c.size(1) == size_n, "Shape mismatch: c.size(1) = ", c.size(1), + ", size_n = ", size_n); + } else { + c = torch::empty({size_m * top_k, size_n}, options); + } + + // Alloc C tmp buffer that is going to be used for the global reduce + torch::Tensor c_tmp; + auto options_fp32 = + torch::TensorOptions().dtype(at::kFloat).device(a.device()); + if (use_fp32_reduce && !use_atomic_add) { + // max num of threadblocks is sms * 4 + long max_c_tmp_size = min( + (long)size_n * sorted_token_ids.size(0), + (long)sms * 4 * moe_block_size * MARLIN_NAMESPACE_NAME::max_thread_n); + if (moe_block_size == 8) max_c_tmp_size *= 2; + c_tmp = torch::empty({max_c_tmp_size}, options_fp32); + } else { + c_tmp = torch::empty({0}, options_fp32); + } + + // Detect groupsize and act_order + int num_groups 
= -1; + int group_size = -1; + + int rank = b_scales.sizes().size(); + TORCH_CHECK(rank == 3, "b_scales rank = ", rank, " is not 3"); + TORCH_CHECK(b_scales.size(2) == size_n, "b_scales dim 2 = ", b_scales.size(2), + " is not size_n = ", size_n); + num_groups = b_scales.size(1); + + torch::Tensor g_idx, perm, a_tmp; + ; + if (g_idx_or_none.has_value() && perm_or_none.has_value()) { + g_idx = g_idx_or_none.value(); + perm = perm_or_none.value(); + + TORCH_CHECK(g_idx.device().is_cuda(), "g_idx is not on GPU"); + TORCH_CHECK(g_idx.is_contiguous(), "g_idx is not contiguous"); + TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU"); + TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous"); + + // Verify g_idx and perm + TORCH_CHECK((g_idx.size(-1) == 0 && perm.size(-1) == 0) || + (g_idx.size(-1) == size_k && perm.size(-1) == size_k), + "Unexpected g_idx.size(-1) = ", g_idx.size(-1), + " and perm.size(-1) = ", perm.size(-1), + ", where size_k = ", size_k); + } else { + g_idx = torch::empty({0}, options); + perm = torch::empty({0}, options); + a_tmp = torch::empty({0}, options); + } + bool has_act_order = g_idx.size(-1) > 0 && perm.size(-1) > 0; + + if (has_act_order) { + a_tmp = torch::empty({size_m * top_k, size_k}, options); + if (is_k_full) { + TORCH_CHECK(num_groups > 1, "For act_order, num_groups must be > 1"); + TORCH_CHECK(size_k % num_groups == 0, "size_k = ", size_k, + ", is not divisible by num_groups = ", num_groups); + group_size = size_k / num_groups; + } else { + group_size = 0; + } + + } else { + a_tmp = torch::empty({0}, options); + if (num_groups > 1) { + TORCH_CHECK( + size_k % num_groups == 0, "size_k = ", size_k, + ", is not divisible by b_scales.size(1) = ", b_scales.size(1)); + group_size = size_k / num_groups; + } else { + group_size = -1; + } + } + + torch::Tensor b_zeros; + if (b_zeros_or_none.has_value()) { + b_zeros = b_zeros_or_none.value(); + TORCH_CHECK(b_zeros.device().is_cuda(), "b_zeros is not on GPU"); + 
TORCH_CHECK(b_zeros.is_contiguous(), "b_zeros is not contiguous"); + } else { + b_zeros = torch::empty({0}, options); + } + bool has_zp = b_zeros.size(-1) > 0; + + if (has_zp) { + TORCH_CHECK( + b_q_type == vllm::kU4, + "b_q_type must be u4 when has_zp = True. Got = ", b_q_type.str()); + } else { + TORCH_CHECK( + b_q_type == vllm::kU4B8 || b_q_type == vllm::kU8B128, + "b_q_type must be uint4b8 or uint8b128 when has_zp = False. Got = ", + b_q_type.str()); + } + + if (has_zp && is_zp_float) { + TORCH_CHECK(a.scalar_type() == at::ScalarType::Half, + "Computation type must be float16 (half) when using float zero " + "points."); + } + + // Verify b_zeros + if (has_zp) { + int rank = b_zeros.sizes().size(); + TORCH_CHECK(rank == 3, "b_zeros rank = ", rank, " is not 3"); + if (is_zp_float) { + TORCH_CHECK(b_zeros.size(2) == size_n, + "b_zeros dim 2 = ", b_zeros.size(2), + " is not size_n = ", size_n); + TORCH_CHECK(num_groups == b_zeros.size(1), + "b_zeros dim 1 = ", b_zeros.size(1), + " is not num_groups = ", num_groups); + TORCH_CHECK(num_groups != -1, "num_groups must be != -1"); + } else { + TORCH_CHECK(b_zeros.size(1) == num_groups, + "b_zeros dim 1 = ", b_zeros.size(1), + " is not num_groups = ", num_groups); + TORCH_CHECK(b_zeros.size(2) == size_n / pack_factor, + "b_zeros dim 2 = ", b_zeros.size(2), + " is not size_n / pack_factor = ", size_n / pack_factor); + } + } + + // Verify workspace size + TORCH_CHECK(size_n % MARLIN_NAMESPACE_NAME::min_thread_n == 0, + "size_n = ", size_n, ", is not divisible by min_thread_n = ", + MARLIN_NAMESPACE_NAME::min_thread_n); + + int max_n_tiles = size_n / MARLIN_NAMESPACE_NAME::min_thread_n; + int min_workspace_size = min( + max_n_tiles * (int)(sorted_token_ids.size(0) / moe_block_size), sms * 4); + TORCH_CHECK(workspace.numel() >= min_workspace_size, + "workspace.numel = ", workspace.numel(), + " is below min_workspace_size = ", min_workspace_size); + + int dev = a.get_device(); + if (a.scalar_type() == at::ScalarType::Half) { 
+ MARLIN_NAMESPACE_NAME::marlin_mm( + a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), + c_tmp.data_ptr(), b_scales.data_ptr(), + b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), + a_tmp.data_ptr(), sorted_token_ids.data_ptr(), + expert_ids.data_ptr(), num_tokens_past_padded.data_ptr(), + topk_weights.data_ptr(), moe_block_size, top_k, mul_topk_weights, is_ep, + size_m, size_n, size_k, workspace.data_ptr(), b_q_type, has_act_order, + is_k_full, has_zp, num_groups, group_size, dev, + at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, + use_atomic_add, use_fp32_reduce, is_zp_float); + } else if (a.scalar_type() == at::ScalarType::BFloat16) { + MARLIN_NAMESPACE_NAME::marlin_mm( + a.data_ptr(), b_q_weight.data_ptr(), + c.data_ptr(), c_tmp.data_ptr(), + b_scales.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(), + perm.data_ptr(), a_tmp.data_ptr(), + sorted_token_ids.data_ptr(), expert_ids.data_ptr(), + num_tokens_past_padded.data_ptr(), topk_weights.data_ptr(), + moe_block_size, top_k, mul_topk_weights, is_ep, size_m, size_n, size_k, + workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp, + num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), + thread_k, thread_n, sms, use_atomic_add, use_fp32_reduce, is_zp_float); + } else { + TORCH_CHECK(false, + "moe_wna16_marlin_gemm only supports bfloat16 and float16"); + } + + return c; +} + +#endif + +TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { + m.impl("moe_wna16_marlin_gemm", &moe_wna16_marlin_gemm); +} diff --git a/csrc/moe/moe_wna16.cu b/csrc/moe/moe_wna16.cu index 51ae76c1ec882..7b6a111c00adc 100644 --- a/csrc/moe/moe_wna16.cu +++ b/csrc/moe/moe_wna16.cu @@ -13,7 +13,6 @@ template __global__ void moe_wna16_gemm_kernel( const scalar_t* __restrict__ input, scalar_t* __restrict__ output, - const uint32_t* __restrict__ qweight, const scalar_t* __restrict__ scales, const uint32_t* __restrict__ qzeros, @@ -54,8 +53,6 @@ __global__ void moe_wna16_gemm_kernel( if 
(token_index / top_k >= size_m) break; num_valid_tokens = m + 1; - if (blockIdx.z == 0 && offset_n < size_n) - output[token_index * size_n + offset_n] = Dtype::int2num(0); if (expert_id != -1) { int k_per_thread = DIVIDE(BLOCK_SIZE_K, BLOCK_SIZE_N); @@ -284,8 +281,7 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N, int64_t BLOCK_SIZE_K, int64_t bit) { const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); - auto options = - torch::TensorOptions().dtype(input.dtype()).device(input.device()); + output.zero_(); const int num_experts = b_qweight.size(0); const int size_m = input.size(0); @@ -302,9 +298,9 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, const uint32_t* b_qzeros_ptr; if (b_qzeros.has_value()) b_qzeros_ptr = (const uint32_t*)b_qzeros.value().data_ptr(); - const float* topk_weights_ptr; + const float* topk_weights_ptr = nullptr; if (topk_weights.has_value()) - topk_weights_ptr = (const float*)topk_weights.value().data_ptr(); + topk_weights_ptr = (const float*)topk_weights.value().data_ptr(); int groups_per_block_row = BLOCK_SIZE_K / group_size; TORCH_CHECK(bit == 4 || bit == 8, "bit must be 4 or 8"); diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index 718418e6cd497..d0de42251f97a 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -43,14 +43,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { m.impl("moe_wna16_gemm", torch::kCUDA, &moe_wna16_gemm); m.def( - "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, " - "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! " - "b_zeros, Tensor! g_idx, Tensor! perm, Tensor! workspace, " - "int b_q_type, SymInt size_m, " - "SymInt size_n, SymInt size_k, bool is_k_full, int num_experts, int " - "topk, " - "int moe_block_size, bool replicate_input, bool apply_weights)" - " -> Tensor"); + "moe_wna16_marlin_gemm(Tensor! a, Tensor? 
c_or_none," + "Tensor! b_q_weight, Tensor! b_scales, Tensor? b_zeros_or_none," + "Tensor? g_idx_or_none, Tensor? perm_or_none, Tensor! workspace," + "Tensor sorted_token_ids," + "Tensor! expert_ids, Tensor! num_tokens_past_padded," + "Tensor! topk_weights, int moe_block_size, int top_k, " + "bool mul_topk_weights, bool is_ep, int b_q_type_id," + "int size_m, int size_n, int size_k," + "bool is_full_k, bool use_atomic_add," + "bool use_fp32_reduce, bool is_zp_float) -> Tensor"); + // conditionally compiled so impl registration is in source file #endif diff --git a/csrc/ops.h b/csrc/ops.h index 7434aead57f0e..86039a26041ba 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -52,6 +52,15 @@ void paged_attention_v2( const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step); +#ifndef USE_ROCM +void merge_attn_states(torch::Tensor& output, + std::optional output_lse, + const torch::Tensor& prefix_output, + const torch::Tensor& prefix_lse, + const torch::Tensor& suffix_output, + const torch::Tensor& suffix_lse); +#endif + void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, double epsilon); @@ -119,6 +128,8 @@ void advance_step_flashinfer( torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr, torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds); +torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor); + #ifndef USE_ROCM torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, const torch::Tensor& codebooks, @@ -143,7 +154,8 @@ torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm); #endif torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m, - int64_t n); + int64_t n, + std::optional const& dtype); torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X, int64_t type, int64_t row); @@ -164,6 +176,7 @@ int64_t ggml_moe_get_block_size(int64_t type); bool 
cutlass_scaled_mm_supports_fp4(int64_t cuda_device_capability); bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability); bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability); +bool cutlass_group_gemm_supported(int64_t cuda_device_capability); void cutlass_scaled_fp4_mm(torch::Tensor& D, torch::Tensor const& A, torch::Tensor const& B, torch::Tensor const& A_sf, @@ -175,6 +188,19 @@ void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b_scales, std::optional const& bias); +void cutlass_moe_mm( + torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides); + +void get_cutlass_moe_mm_data( + const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, + torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, + torch::Tensor& input_permutation, torch::Tensor& output_permutation, + const int64_t num_experts, const int64_t n, const int64_t k); + void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, @@ -251,10 +277,10 @@ void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight, const std::optional& has_initial_state, bool silu_activation, int64_t pad_slot_id); -#ifndef USE_ROCM using fptr_t = int64_t; fptr_t init_custom_ar(const std::vector& fake_ipc_ptrs, - torch::Tensor& rank_data, int64_t rank, bool full_nvlink); + torch::Tensor& rank_data, int64_t rank, + bool fully_connected); void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, fptr_t reg_buffer, int64_t reg_buffer_sz_bytes); void dispose(fptr_t _fa); @@ -265,4 +291,7 @@ get_graph_buffer_ipc_meta(fptr_t _fa); void register_graph_buffers(fptr_t _fa, const std::vector>& handles, 
const std::vector>& offsets); -#endif +std::tuple allocate_shared_buffer_and_handle( + int64_t size); +int64_t open_mem_handle(torch::Tensor& mem_handle); +void free_shared_buffer(int64_t buffer); diff --git a/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh b/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh new file mode 100644 index 0000000000000..6c6e89790847f --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh @@ -0,0 +1,80 @@ +#pragma once + +#include +#include +#include + +#include "core/scalar_type.hpp" +#include "cutlass/bfloat16.h" +#include "cutlass/float8.h" + +template +__global__ void get_group_gemm_starts( + int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets, + ElementC** out_offsets, ElementAccumulator** a_scales_offsets, + ElementAccumulator** b_scales_offsets, ElementAB* a_base_as_int, + ElementAB* b_base_as_int, ElementC* out_base_as_int, + ElementAccumulator* a_scales_base_as_int, + ElementAccumulator* b_scales_base_as_int, int64_t n, int64_t k, + bool per_act_token, bool per_out_ch) { + int expert_id = threadIdx.x; + + int64_t expert_offset = expert_offsets[expert_id]; + + a_offsets[expert_id] = a_base_as_int + expert_offset * k; + b_offsets[expert_id] = b_base_as_int + expert_id * k * n; + out_offsets[expert_id] = out_base_as_int + expert_offset * n; + a_scales_offsets[expert_id] = + a_scales_base_as_int + (per_act_token ? expert_offset : 0); + b_scales_offsets[expert_id] = + b_scales_base_as_int + (per_out_ch ? 
n * expert_id : expert_id); +} + +#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE) \ + else if (out_tensors.dtype() == TENSOR_C_TYPE) { \ + get_group_gemm_starts \ + <<<1, num_experts, 0, stream>>>( \ + static_cast(expert_offsets.data_ptr()), \ + static_cast(a_ptrs.data_ptr()), \ + static_cast(b_ptrs.data_ptr()), \ + static_cast(out_ptrs.data_ptr()), \ + static_cast(a_scales_ptrs.data_ptr()), \ + static_cast(b_scales_ptrs.data_ptr()), \ + static_cast(a_tensors.data_ptr()), \ + static_cast(b_tensors.data_ptr()), \ + static_cast(out_tensors.data_ptr()), \ + static_cast(a_scales.data_ptr()), \ + static_cast(b_scales.data_ptr()), out_tensors.size(1), \ + a_tensors.size(1), per_act_token, per_out_ch); \ + } + +namespace { + +void run_get_group_gemm_starts( + torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs, + torch::Tensor& b_ptrs, torch::Tensor& out_ptrs, + torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs, + torch::Tensor const& a_tensors, torch::Tensor const& b_tensors, + torch::Tensor& out_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(a_scales.dtype() == torch::kFloat32); + TORCH_CHECK(b_scales.dtype() == torch::kFloat32); + + int num_experts = static_cast(expert_offsets.size(0)); + bool per_act_token = a_scales.numel() != 1; + bool per_out_ch = b_scales.numel() != num_experts; + + auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index()); + + if (false) { + } + __CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t) + __CALL_GET_STARTS_KERNEL(torch::kFloat16, half) + else { + TORCH_CHECK(false, "Invalid output type (must be float16 or bfloat16)"); + } +} + +} // namespace \ No newline at end of file diff --git a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu new file mode 100644 index 
0000000000000..2b8bc3fb0b261 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu @@ -0,0 +1,160 @@ +#include + +#include +#include + +#include "cutlass/cutlass.h" +#include "grouped_mm_c3x.cuh" + +using namespace cute; + +namespace { + +template typename Epilogue> +struct sm90_fp8_config_default { + // M in (16, inf) + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum; + using EpilogueSchedule = + cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong; + using TileShape = cute::Shape; + using ClusterShape = cute::Shape; + + using Cutlass3xGemm = + cutlass_3x_group_gemm; +}; + +template typename Epilogue> +struct sm90_fp8_config_M16 { + // M in [1, 16] + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum; + using EpilogueSchedule = + cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong; + using TileShape = cute::Shape; + using ClusterShape = cute::Shape; + + using Cutlass3xGemm = + cutlass_3x_group_gemm; +}; + +template typename Epilogue> +struct sm90_fp8_config_K8192 { + // K in [8192, inf) + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum; + using EpilogueSchedule = + cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong; + using TileShape = cute::Shape; + using ClusterShape = cute::Shape; + + using Cutlass3xGemm = + cutlass_3x_group_gemm; +}; + +template typename Epilogue> +struct sm90_fp8_config_N8192 { + // N in [8192, inf) + static_assert(std::is_same()); + using KernelSchedule = + cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum; + using EpilogueSchedule = + cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong; + using TileShape = cute::Shape; + using ClusterShape = cute::Shape; + + using Cutlass3xGemm = + cutlass_3x_group_gemm; +}; + +template +void run_cutlass_moe_mm_sm90( + torch::Tensor& 
out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides) { + TORCH_CHECK(a_tensors.size(0) > 0, "No input A tensors provided."); + TORCH_CHECK(b_tensors.size(0) > 0, "No input B tensors provided."); + TORCH_CHECK(out_tensors.size(0) > 0, "No output tensors provided."); + + TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn, + "A tensors must be of type float8_e4m3fn."); + TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn, + "B tensors must be of type float8_e4m3fn."); + + TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn); + + using Cutlass3xGemmN8192 = typename sm90_fp8_config_N8192< + InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm; + using Cutlass3xGemmK8192 = typename sm90_fp8_config_K8192< + InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm; + using Cutlass3xGemmM16 = typename sm90_fp8_config_M16< + InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm; + using Cutlass3xGemmDefault = typename sm90_fp8_config_default< + InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm; + + uint32_t const m = a_tensors.size(0); + uint32_t const n = out_tensors.size(1); + uint32_t const k = a_tensors.size(1); + + if (n >= 8192) { + cutlass_group_gemm_caller( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, a_strides, b_strides, c_strides); + } else if (k >= 8192) { + cutlass_group_gemm_caller( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, a_strides, b_strides, c_strides); + } else if (m <= 16) { + cutlass_group_gemm_caller( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, 
a_strides, b_strides, c_strides); + } else { + cutlass_group_gemm_caller( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, a_strides, b_strides, c_strides); + } +} + +void dispatch_moe_mm_sm90( + torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides) { + if (out_tensors.dtype() == torch::kBFloat16) { + run_cutlass_moe_mm_sm90( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, a_strides, b_strides, c_strides); + } else { + run_cutlass_moe_mm_sm90( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, + problem_sizes, a_strides, b_strides, c_strides); + } +} + +} // namespace + +void cutlass_moe_mm_sm90( + torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides) { + dispatch_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales, + expert_offsets, problem_sizes, a_strides, b_strides, + c_strides); +} diff --git a/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh new file mode 100644 index 0000000000000..db827b7c5e186 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cuh @@ -0,0 +1,149 @@ +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" + +#include 
"cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" +#include "cutlass_extensions/common.hpp" +#include "get_group_starts.cuh" + +using namespace cute; + +namespace { + +using ProblemShape = + cutlass::gemm::GroupProblemShape>; + +using ElementAccumulator = float; +using ArchTag = cutlass::arch::Sm90; +using OperatorClass = cutlass::arch::OpClassTensorOp; + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using LayoutC = cutlass::layout::RowMajor; + +template typename Epilogue_, + typename TileShape, typename ClusterShape, typename KernelSchedule, + typename EpilogueSchedule> +struct cutlass_3x_group_gemm { + using ElementAB = ElementAB_; + using ElementC = void; + using ElementD = ElementC_; + using ElementAccumulator = float; + + using Epilogue = Epilogue_; + + using StrideC = + cute::remove_pointer_t, cute::Int<0>>>; + + static constexpr int AlignmentAB = + 128 / cutlass::sizeof_bits::value; + static constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; + + using EVTCompute = typename Epilogue::EVTCompute; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, OperatorClass, TileShape, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, + ElementAccumulator, ElementC, LayoutC*, AlignmentC, ElementD, + LayoutC*, AlignmentC, EpilogueSchedule, EVTCompute>::CollectiveOp; + + static constexpr size_t CEStorageSize = + sizeof(typename CollectiveEpilogue::SharedStorage); + using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout< + static_cast(CEStorageSize)>; + + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, ElementAB, LayoutA*, AlignmentAB, ElementAB, + LayoutB*, AlignmentAB, ElementAccumulator, TileShape, ClusterShape, + Stages, KernelSchedule>::CollectiveOp; + + using KernelType = enable_sm90_only>; + + struct GemmKernel : public KernelType {}; +}; + 
+template +void cutlass_group_gemm_caller( + torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides) { + using ElementAB = typename Gemm::ElementAB; + using ElementD = typename Gemm::ElementD; + + int num_experts = static_cast(expert_offsets.size(0)); + int k_size = a_tensors.size(1); + int n_size = out_tensors.size(1); + + bool per_act_token = a_scales.numel() != 1; + bool per_out_ch = b_scales.numel() != num_experts; + + auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index()); + + auto options_int = + torch::TensorOptions().dtype(torch::kInt64).device(a_tensors.device()); + + torch::Tensor a_ptrs = torch::empty(num_experts, options_int); + torch::Tensor b_ptrs = torch::empty(num_experts, options_int); + torch::Tensor out_ptrs = torch::empty(num_experts, options_int); + torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int); + torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int); + + run_get_group_gemm_starts(expert_offsets, a_ptrs, b_ptrs, out_ptrs, + a_scales_ptrs, b_scales_ptrs, a_tensors, b_tensors, + out_tensors, a_scales, b_scales); + + using GemmKernel = typename Gemm::GemmKernel; + using StrideA = Stride, Int<0>>; + using StrideB = Stride, Int<0>>; + using StrideC = typename GemmKernel::InternalStrideC; + + ProblemShape::UnderlyingProblemShape* problem_sizes_as_shapes = + static_cast( + problem_sizes.data_ptr()); + ProblemShape prob_shape{num_experts, problem_sizes_as_shapes, nullptr}; + + typename GemmKernel::MainloopArguments mainloop_args{ + static_cast(a_ptrs.data_ptr()), + static_cast(a_strides.data_ptr()), + static_cast(b_ptrs.data_ptr()), + static_cast(b_strides.data_ptr())}; + + // Currently, we are only able to do broadcast on 
either all or none a_scales + // and on either all or none b_scales + typename GemmKernel::EpilogueArguments epilogue_args{ + Gemm::Epilogue::prepare_args( + static_cast(a_scales_ptrs.data_ptr()), + static_cast(b_scales_ptrs.data_ptr()), + per_act_token, per_out_ch), + nullptr, static_cast(c_strides.data_ptr()), + static_cast(out_ptrs.data_ptr()), + static_cast(c_strides.data_ptr())}; + + typename GemmKernel::Arguments args{ + cutlass::gemm::GemmUniversalMode::kGrouped, prob_shape, mainloop_args, + epilogue_args}; + + using GemmOp = cutlass::gemm::device::GemmUniversalAdapter; + GemmOp gemm_op; + CUTLASS_CHECK(gemm_op.can_implement(args)); + + size_t workspace_size = gemm_op.get_workspace_size(args); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(a_tensors.device()); + auto workspace = torch::empty(workspace_size, workspace_options); + + cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream); + CUTLASS_CHECK(status); +} + +} // namespace diff --git a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu new file mode 100644 index 0000000000000..894727383a639 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu @@ -0,0 +1,103 @@ +#include + +#include +#include + +#include + +constexpr uint64_t THREADS_PER_EXPERT = 512; + +__global__ void compute_problem_sizes(const int* __restrict__ topk_ids, + int32_t* problem_sizes1, + int32_t* problem_sizes2, + int32_t* atomic_buffer, + const int topk_length, const int n, + const int k) { + int expert_id = blockIdx.x; + + int occurrences = 0; + for (int i = threadIdx.x; i < topk_length; i += THREADS_PER_EXPERT) { + occurrences += (topk_ids[i] == expert_id); + } + atomicAdd(&atomic_buffer[expert_id], occurrences); + __syncthreads(); + + if (threadIdx.x == 0) { + int final_occurrences = atomic_buffer[expert_id]; + problem_sizes1[expert_id * 3] = final_occurrences; + problem_sizes1[expert_id * 3 + 1] = 2 * n; + 
problem_sizes1[expert_id * 3 + 2] = k; + problem_sizes2[expert_id * 3] = final_occurrences; + problem_sizes2[expert_id * 3 + 1] = k; + problem_sizes2[expert_id * 3 + 2] = n; + } +} + +__global__ void compute_expert_offsets( + const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets, + int32_t* atomic_buffer, const int num_experts) { + int32_t tot_offset = 0; + expert_offsets[0] = 0; + for (int i = 0; i < num_experts; ++i) { + atomic_buffer[i] = tot_offset; + tot_offset += problem_sizes1[i * 3]; + expert_offsets[i + 1] = tot_offset; + } +} + +__global__ void compute_arg_sorts(const int* __restrict__ topk_ids, + const int32_t* __restrict__ expert_offsets, + int32_t* input_permutation, + int32_t* output_permutation, + int32_t* atomic_buffer, const int topk_length, + const int topk) { + int const blk_expert_id = blockIdx.x; + int const num_experts = gridDim.x; + int32_t const num_tokens = expert_offsets[num_experts]; + + for (int i = threadIdx.x; i < topk_length; i += THREADS_PER_EXPERT) { + int const expert_id = topk_ids[i]; + if (expert_id == -1 && blockIdx.x == 0) { + // output_permutation is used to re-order the moe outputs. It is + // used as c2 = c2[c_map], where c2 is a torch.tensor that is the + // output of the cutlass kernels and c_map is the output_permutation. + // c2 is initialized to zeros, therefore by setting the output_permutation + // to num_tokens, we are guaranteed to fill the moe outputs to zero + // for "invalid" topk_ids. 
+ output_permutation[i] = num_tokens; + } else if (expert_id == blk_expert_id) { + int start = atomicAdd(&atomic_buffer[expert_id], 1); + input_permutation[start] = i / topk; + output_permutation[i] = start; + } + } +} + +void get_cutlass_moe_mm_data_caller( + const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, + torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, + torch::Tensor& input_permutation, torch::Tensor& output_permutation, + const int64_t num_experts, const int64_t n, const int64_t k) { + auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index()); + auto options_int32 = + torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device()); + torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32); + + int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel()); + compute_problem_sizes<<>>( + static_cast(topk_ids.data_ptr()), + static_cast(problem_sizes1.data_ptr()), + static_cast(problem_sizes2.data_ptr()), + static_cast(atomic_buffer.data_ptr()), topk_ids.numel(), n, k); + compute_expert_offsets<<<1, 1, 0, stream>>>( + static_cast(problem_sizes1.data_ptr()), + static_cast(expert_offsets.data_ptr()), + static_cast(atomic_buffer.data_ptr()), num_experts); + compute_arg_sorts<<>>( + static_cast(topk_ids.data_ptr()), + static_cast(expert_offsets.data_ptr()), + static_cast(input_permutation.data_ptr()), + static_cast(output_permutation.data_ptr()), + static_cast(atomic_buffer.data_ptr()), topk_ids.numel(), + topk_ids.size(1)); +} diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index b08386459cbe2..54b63894e4cbc 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -29,6 +29,20 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, std::optional const& bias); + +void cutlass_moe_mm_sm90( + 
torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides); + +void get_cutlass_moe_mm_data_caller( + const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, + torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, + torch::Tensor& input_permutation, torch::Tensor& output_permutation, + const int64_t num_experts, const int64_t n, const int64_t k); + #endif #if defined ENABLE_SCALED_MM_SM100 && ENABLE_SCALED_MM_SM100 @@ -102,6 +116,19 @@ bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability) { return false; } +bool cutlass_group_gemm_supported(int64_t cuda_device_capability) { + // CUTLASS groped FP8 kernels need at least CUDA 12.3 + // and SM90 (Hopper) + +#if defined CUDA_VERSION + if (cuda_device_capability == 90) { + return CUDA_VERSION >= 12030; + } +#endif + + return false; +} + void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, @@ -168,6 +195,46 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a, version_num); } +void cutlass_moe_mm( + torch::Tensor& out_tensors, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, + torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, + torch::Tensor const& b_strides, torch::Tensor const& c_strides) { + int32_t version_num = get_sm_version_num(); +#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90 + cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales, + expert_offsets, problem_sizes, a_strides, b_strides, + c_strides); + return; +#endif + 
TORCH_CHECK_NOT_IMPLEMENTED( + false, + "No compiled cutlass_scaled_mm for CUDA device capability: ", version_num, + ". Required capability: 90"); +} + +void get_cutlass_moe_mm_data( + const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, + torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, + torch::Tensor& input_permutation, torch::Tensor& output_permutation, + const int64_t num_experts, const int64_t n, const int64_t k) { + // This function currently gets compiled only if we have a valid cutlass moe + // mm to run it for. + int32_t version_num = get_sm_version_num(); +#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90 + get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1, + problem_sizes2, input_permutation, + output_permutation, num_experts, n, k); + return; +#endif + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "No compiled get_cutlass_moe_mm_data: no cutlass_scaled_mm kernel for " + "CUDA device capability: ", + version_num, ". Required capability: 90"); +} + void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu index 8f9aa21aae4ee..eceb3a8ea05da 100644 --- a/csrc/quantization/fp8/common.cu +++ b/csrc/quantization/fp8/common.cu @@ -30,9 +30,6 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel( fp8_type* __restrict__ out, float* __restrict__ scale, scalar_t const* __restrict__ input, float const* __restrict__ scale_ub, const int hidden_size) { - float const min_scaling_factor = - 1.0f / (fp8_e4m3_adjusted_max_v * 512.f); - int const tid = threadIdx.x; int const token_idx = blockIdx.x; @@ -67,8 +64,8 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel( token_scale = block_absmax_val_maybe; } // token scale computation - token_scale = max(token_scale / fp8_e4m3_adjusted_max_v, - min_scaling_factor); + token_scale = max(token_scale / quant_type_max_v, + 
min_scaling_factor::val()); scale[token_idx] = token_scale; } __syncthreads(); diff --git a/csrc/quantization/fp8/common.cuh b/csrc/quantization/fp8/common.cuh index d331c63ae827f..def8b31b27546 100644 --- a/csrc/quantization/fp8/common.cuh +++ b/csrc/quantization/fp8/common.cuh @@ -1,20 +1,12 @@ #pragma once #include "quantization/vectorization.cuh" +#include "quantization/utils.cuh" #include -#include -#ifndef USE_ROCM - #include - #define MAYBE_HOST_DEVICE C10_HOST_DEVICE -#else - #include - #include - #include +#ifdef USE_ROCM #include "amd/quant_utils.cuh" - // ROCm doesn't seem to need C10_HOST_DEVICE for static constexpr - #define MAYBE_HOST_DEVICE #endif // Determines the preferred FP8 type for the current platform. @@ -31,29 +23,6 @@ static bool is_fp8_ocp() { #endif } -template -struct fp8_e4m3_adjusted_max; - -template <> -struct fp8_e4m3_adjusted_max { - static constexpr c10::Float8_e4m3fn val() { - return std::numeric_limits::max(); - } -}; - -// Using the default max value from pytorch (240.0 0x7F) will cause accuracy -// issues when running dynamic quantization. Here use 224.0 0x7E for rocm. 
-template <> -struct fp8_e4m3_adjusted_max { - static constexpr c10::Float8_e4m3fnuz val() { - return c10::Float8_e4m3fnuz(0x7E, c10::Float8_e4m3fnuz::from_bits()); - } -}; - -template -MAYBE_HOST_DEVICE static constexpr T fp8_e4m3_adjusted_max_v = - fp8_e4m3_adjusted_max::val(); - namespace vllm { __device__ __forceinline__ float atomicMaxFloat(float* addr, float value) { @@ -76,8 +45,8 @@ __device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val, x = val / scale; } - float r = fmax(-fp8_e4m3_adjusted_max_v, - fmin(x, fp8_e4m3_adjusted_max_v)); + float r = + fmax(-quant_type_max_v, fmin(x, quant_type_max_v)); #ifndef USE_ROCM return static_cast(r); #else @@ -123,7 +92,7 @@ __global__ void segmented_max_reduction(float* __restrict__ scale, // Finally, since cache[0] contains the maximum for this thread block, // atomically write the max to the target location if (threadIdx.x == 0) { - atomicMaxFloat(scale, cache[0] / fp8_e4m3_adjusted_max_v); + atomicMaxFloat(scale, cache[0] / quant_type_max_v); } } diff --git a/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu b/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu index 1be89c504bfeb..2b6ab7fcec902 100644 --- a/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu +++ b/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu @@ -14,8 +14,7 @@ __device__ void rms_norm_dynamic_per_token_quant_vec( float* __restrict__ scales, // [num_tokens] scalar_t const* __restrict__ input, // [..., hidden_size] scalar_t const* __restrict__ weight, // [hidden_size] - float const* scale_ub, float const var_epsilon, - float const min_scaling_factor, int32_t const hidden_size, + float const* scale_ub, float const var_epsilon, int32_t const hidden_size, scalar_t* __restrict__ residual = nullptr) { float rms = 0.0f; float token_scale = 0.0f; @@ -27,8 +26,8 @@ __device__ void rms_norm_dynamic_per_token_quant_vec( // Compute scale 
vllm::vectorized::compute_dynamic_per_token_scales( - &token_scale, scales, input, weight, rms, scale_ub, min_scaling_factor, - hidden_size, residual); + &token_scale, scales, input, weight, rms, scale_ub, hidden_size, + residual); // RMS Norm + Quant if constexpr (std::is_same_v) { @@ -50,8 +49,7 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel( float* __restrict__ scales, // [num_tokens] scalar_t const* __restrict__ input, // [..., hidden_size] scalar_t const* __restrict__ weight, // [hidden_size] - float const* scale_ub, float const var_epsilon, - float const min_scaling_factor, int32_t const hidden_size, + float const* scale_ub, float const var_epsilon, int32_t const hidden_size, scalar_t* __restrict__ residual = nullptr) { // For vectorization, token_input and token_output pointers need to be // aligned at 8-byte and 4-byte addresses respectively. @@ -60,8 +58,8 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel( if (can_vectorize) { return rms_norm_dynamic_per_token_quant_vec( - out, scales, input, weight, scale_ub, var_epsilon, min_scaling_factor, - hidden_size, residual); + out, scales, input, weight, scale_ub, var_epsilon, hidden_size, + residual); } float rms = 0.0f; @@ -72,8 +70,8 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel( var_epsilon, residual); // Compute Scale vllm::compute_dynamic_per_token_scales( - &token_scale, scales, input, weight, rms, scale_ub, min_scaling_factor, - hidden_size, residual); + &token_scale, scales, input, weight, rms, scale_ub, hidden_size, + residual); // RMS Norm + Quant if constexpr (std::is_same_v) { @@ -105,11 +103,6 @@ void rms_norm_dynamic_per_token_quant_dispatch( const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - const float min_scaling_factor = - out.dtype() == torch::kInt8 - ? 
std::numeric_limits::epsilon() - : 1.0f / (std::numeric_limits::max() * 512.f); - if (residual.has_value()) { VLLM_DISPATCH_QUANT_TYPES( out.scalar_type(), "rms_norm_dynamic_per_token_quant_kernel", [&] { @@ -119,8 +112,7 @@ void rms_norm_dynamic_per_token_quant_dispatch( out.data_ptr(), scales.data_ptr(), input.data_ptr(), weight.data_ptr(), scale_ub.has_value() ? scale_ub->data_ptr() : nullptr, - var_epsilon, min_scaling_factor, hidden_size, - residual->data_ptr()); + var_epsilon, hidden_size, residual->data_ptr()); }); } else { @@ -132,7 +124,7 @@ void rms_norm_dynamic_per_token_quant_dispatch( out.data_ptr(), scales.data_ptr(), input.data_ptr(), weight.data_ptr(), scale_ub.has_value() ? scale_ub->data_ptr() : nullptr, - var_epsilon, min_scaling_factor, hidden_size, nullptr); + var_epsilon, hidden_size, nullptr); }); } } diff --git a/csrc/quantization/fused_kernels/layernorm_utils.cuh b/csrc/quantization/fused_kernels/layernorm_utils.cuh index b5cea98f7706e..e6d23cd24e178 100644 --- a/csrc/quantization/fused_kernels/layernorm_utils.cuh +++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh @@ -5,6 +5,7 @@ */ #include "quantization/vectorization.cuh" +#include "quantization/utils.cuh" #include "quant_conversions.cuh" #ifndef USE_ROCM @@ -51,11 +52,11 @@ __device__ void compute_dynamic_per_token_scales( float* __restrict__ token_scale, float* __restrict__ all_token_scales, scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight, float const rms, float const* __restrict__ scale_ub, - float const min_scaling_factor, int32_t const hidden_size, + int32_t const hidden_size, scalar_t const* __restrict__ residual = nullptr) { int64_t const token_offset = blockIdx.x * static_cast(hidden_size); ; - constexpr scalar_out_t qmax{std::numeric_limits::max()}; + constexpr scalar_out_t qmax{quant_type_max_v}; float block_absmax_val_maybe = 0.0f; for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) { @@ -83,7 +84,7 @@ __device__ void 
compute_dynamic_per_token_scales( scale = block_absmax_val_maybe; } // token scale computation - scale = max(scale / qmax, min_scaling_factor); + scale = max(scale / qmax, min_scaling_factor::val()); s_token_scale = scale; // Shared memory store all_token_scales[blockIdx.x] = scale; // Global output store } @@ -184,7 +185,7 @@ __device__ void compute_dynamic_per_token_scales( float* __restrict__ token_scale, float* __restrict__ all_token_scales, scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight, float const rms, float const* __restrict__ scale_ub, - float const min_scaling_factor, int32_t const hidden_size, + int32_t const hidden_size, scalar_t const* __restrict__ residual = nullptr) { int64_t const token_offset = blockIdx.x * static_cast(hidden_size); ; @@ -200,7 +201,7 @@ __device__ void compute_dynamic_per_token_scales( reinterpret_cast const*>(&residual[token_offset]); } - constexpr scalar_out_t qmax{std::numeric_limits::max()}; + constexpr scalar_out_t qmax{quant_type_max_v}; int32_t const num_vec_elems = hidden_size >> 2; float block_absmax_val_maybe = 0.0f; @@ -248,7 +249,7 @@ __device__ void compute_dynamic_per_token_scales( scale = block_absmax_val_maybe; } // token scale computation - scale = max(scale / qmax, min_scaling_factor); + scale = max(scale / qmax, min_scaling_factor::val()); s_token_scale = scale; // shared memory store all_token_scales[blockIdx.x] = scale; // global output store } diff --git a/csrc/quantization/fused_kernels/quant_conversions.cuh b/csrc/quantization/fused_kernels/quant_conversions.cuh index 9ac7b188f5181..7c10aaa81cf7b 100644 --- a/csrc/quantization/fused_kernels/quant_conversions.cuh +++ b/csrc/quantization/fused_kernels/quant_conversions.cuh @@ -33,8 +33,8 @@ static __device__ __forceinline__ int8_t float_to_int8_rn(float const x) { template static __device__ __forceinline__ fp8_type float_to_fp8(float const x) { - float const r = fmax(-fp8_e4m3_adjusted_max_v, - fmin(x, fp8_e4m3_adjusted_max_v)); + 
float const r = + fmax(-quant_type_max_v, fmin(x, quant_type_max_v)); return static_cast(r); } diff --git a/csrc/quantization/gguf/dequantize.cuh b/csrc/quantization/gguf/dequantize.cuh index 41fc032ff1a56..9d355003ef91d 100644 --- a/csrc/quantization/gguf/dequantize.cuh +++ b/csrc/quantization/gguf/dequantize.cuh @@ -94,8 +94,8 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __ dfloat2 v; dequantize_kernel(vx, ib, iqs, v); - y[iybs + iqs + 0] = v.x; - y[iybs + iqs + y_offset] = v.y; + y[iybs + iqs + 0] = convert_from_half(v.x); + y[iybs + iqs + y_offset] = convert_from_half(v.y); } template @@ -114,10 +114,10 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t half dall = __low2half(x[i].dm); half dmin = __high2half(x[i].dm); - y[l+ 0] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+0] & 0xF) * ((q >> 0) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+0] >> 4))); - y[l+32] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+2] & 0xF) * ((q >> 2) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+2] >> 4))); - y[l+64] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+4] & 0xF) * ((q >> 4) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+4] >> 4))); - y[l+96] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+6] & 0xF) * ((q >> 6) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+6] >> 4))); + y[l+ 0] = convert_from_half(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+0] & 0xF) * ((q >> 0) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+0] >> 4)))); + y[l+32] = convert_from_half(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+2] & 0xF) * ((q >> 2) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+2] >> 4)))); + y[l+64] = convert_from_half(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+4] & 0xF) * ((q >> 4) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+4] >> 4)))); + y[l+96] = convert_from_half(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+6] & 0xF) * ((q >> 6) & 3))), 
__hmul(dmin, __int2half_rn(x[i].scales[is+6] >> 4)))); } template @@ -148,7 +148,9 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t const uint8_t * q = x[i].qs + 32*n; const uint8_t * hm = x[i].hmask; - for (int l = l0; l < l0+4; ++l) y[l] = __hmul(dl, __int2half_rn((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4))); + for (int l = l0; l < l0+4; ++l) { + y[l] = convert_from_half(__hmul(dl, __int2half_rn((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)))); + } } static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { @@ -188,8 +190,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t const half d2 = __hmul(dall, __int2half_rn(sc)); const half m2 = __hmul(dmin, __int2half_rn(m)); for (int l = 0; l < n; ++l) { - y[l + 0] = __hsub(__hmul(d1, __int2half_rn(q[l] & 0xF)), m1); - y[l +32] = __hsub(__hmul(d2, __int2half_rn(q[l] >> 4)), m2); + y[l + 0] = convert_from_half(__hsub(__hmul(d1, __int2half_rn(q[l] & 0xF)), m1)); + y[l +32] = convert_from_half(__hsub(__hmul(d2, __int2half_rn(q[l] >> 4)), m2)); } } @@ -220,11 +222,11 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t const half d2 = __hmul(dall, __int2half_rn(sc)); const half m2 = __hmul(dmin, __int2half_rn(m)); uint8_t hm = 1 << (2*il); - y[ 0] = __hsub(__hmul(d1, __int2half_rn((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0))), m1); - y[ 1] = __hsub(__hmul(d1, __int2half_rn((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0))), m1); + y[ 0] = convert_from_half(__hsub(__hmul(d1, __int2half_rn((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0))), m1)); + y[ 1] = convert_from_half(__hsub(__hmul(d1, __int2half_rn((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0))), m1)); hm <<= 1; - y[32] = __hsub(__hmul(d2, __int2half_rn((ql[0] >> 4) + (qh[0] & hm ? 16 : 0))), m2); - y[33] = __hsub(__hmul(d2, __int2half_rn((ql[1] >> 4) + (qh[1] & hm ? 
16 : 0))), m2); + y[32] = convert_from_half(__hsub(__hmul(d2, __int2half_rn((ql[0] >> 4) + (qh[0] & hm ? 16 : 0))), m2)); + y[33] = convert_from_half(__hsub(__hmul(d2, __int2half_rn((ql[1] >> 4) + (qh[1] & hm ? 16 : 0))), m2)); } template @@ -247,10 +249,10 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t const uint8_t qh = x[i].qh[32*ip + il]; const int8_t * sc = x[i].scales + is; - y[ 0] = __hmul(d, __int2half_rn(sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32))); - y[32] = __hmul(d, __int2half_rn(sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32))); - y[64] = __hmul(d, __int2half_rn(sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32))); - y[96] = __hmul(d, __int2half_rn(sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32))); + y[ 0] = convert_from_half(__hmul(d, __int2half_rn(sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32)))); + y[32] = convert_from_half(__hmul(d, __int2half_rn(sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32)))); + y[64] = convert_from_half(__hmul(d, __int2half_rn(sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32)))); + y[96] = convert_from_half(__hmul(d, __int2half_rn(sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32)))); } template @@ -269,7 +271,7 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds const uint32_t aux32 = q2[2] | (q2[3] << 16); const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.25f; const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; - for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f)); + for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? 
-1.f : 1.f); } template @@ -286,7 +288,7 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511)); const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; const uint8_t signs = ksigns_iq2xs[q2[il] >> 9]; - for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f)); + for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); } @@ -303,7 +305,7 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300))); const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; const uint8_t signs = x[i].qs[QK_K/8+4*ib+il]; - for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f)); + for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); } template @@ -324,8 +326,8 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.5f; const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; for (int j = 0; j < 4; ++j) { - y[j+0] = __float2half(d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f)); - y[j+4] = __float2half(d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f)); + y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); } } @@ -345,8 +347,8 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_ const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)) * 0.5f; const uint8_t signs = x[i].signs[4*ib + il]; for (int j = 0; j < 4; ++j) { - y[j+0] = __float2half(d * grid1[j] * (signs & kmask_iq2xs[j+0] ? 
-1.f : 1.f)); - y[j+4] = __float2half(d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f)); + y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); } } @@ -367,7 +369,7 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_ grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; grid32[0] &= 0x0f0f0f0f; for (int j = 0; j < 8; ++j) { - y[j] = __float2half(d * (q[j] + delta)); + y[j] = d * (q[j] + delta); } } @@ -392,7 +394,7 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_ grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; grid32[0] &= 0x0f0f0f0f; for (int j = 0; j < 8; ++j) { - y[j] = __float2half(d * (q[j] + delta)); + y[j] = d * (q[j] + delta); } } @@ -409,8 +411,8 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst const uint8_t * q4 = x[ib].qs + 4*il; const float d = __half2float(x[ib].d); for (int j = 0; j < 4; ++j) { - y[j+ 0] = __float2half(d * kvalues_iq4nl[q4[j] & 0xf]); - y[j+16] = __float2half(d * kvalues_iq4nl[q4[j] >> 4]); + y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; + y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; } } @@ -427,8 +429,8 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst const uint8_t * q4 = x[i].qs + 16*ib + 4*il; const float d = __half2float(x[i].d) * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32); for (int j = 0; j < 4; ++j) { - y[j+ 0] = __float2half(d * kvalues_iq4nl[q4[j] & 0xf]); - y[j+16] = __float2half(d * kvalues_iq4nl[q4[j] >> 4]); + y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; + y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; } } @@ -522,7 +524,8 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, dequantize_block_iq4_xs<<>>(vx, y); } -static to_fp16_cuda_t ggml_get_to_fp16_cuda(int64_t type) { +template +static to_cuda_ggml_t ggml_get_to_cuda(int64_t type) { switch (type) { case 2: 
return dequantize_block_cuda; diff --git a/csrc/quantization/gguf/ggml-common.h b/csrc/quantization/gguf/ggml-common.h index d42205a6571db..6bef5db3ccf15 100644 --- a/csrc/quantization/gguf/ggml-common.h +++ b/csrc/quantization/gguf/ggml-common.h @@ -1063,7 +1063,8 @@ static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, - typedef half dfloat; // dequantize float typedef half2 dfloat2; typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); -typedef void (*to_fp16_cuda_t)(const void * __restrict__ x, dfloat * __restrict__ y, int k, cudaStream_t stream); +template +using to_cuda_ggml_t = void (*)(const void * __restrict__ x, dst_t * __restrict__ y, int k, cudaStream_t stream); typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs); typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc); typedef void (*load_tiles_cuda_t)( @@ -1075,6 +1076,25 @@ typedef float (*vec_dot_q_mul_mat_cuda_t)( // Utility function +template +static __device__ __forceinline__ dst_t convert_from_half(half val) { + return val; +} + +template<> +__device__ __forceinline__ c10::BFloat16 convert_from_half(half val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + return __float2bfloat16(__half2float(val)); +#else + return __half2float(val); +#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +} + +template<> +__device__ __forceinline__ float convert_from_half(half val) { + return __half2float(val); +} + #if defined(USE_ROCM) #ifndef __has_builtin diff --git a/csrc/quantization/gguf/gguf_kernel.cu b/csrc/quantization/gguf/gguf_kernel.cu index dbbb97e6fb3a9..56b78f1834d15 100644 --- a/csrc/quantization/gguf/gguf_kernel.cu +++ b/csrc/quantization/gguf/gguf_kernel.cu @@ -71,14 +71,19 @@ static void quantize_row_q8_1_cuda(const scalar_t* x, void* vy, const int kx, } torch::Tensor ggml_dequantize(torch::Tensor W, // quant 
weight - int64_t type, int64_t m, int64_t n) { + int64_t type, int64_t m, int64_t n, + std::optional const& dtype) { const at::cuda::OptionalCUDAGuard device_guard(device_of(W)); - auto options = - torch::TensorOptions().dtype(torch::kFloat16).device(W.device()); + auto dtype_ = dtype.value_or(torch::kFloat16); + auto options = torch::TensorOptions().dtype(dtype_).device(W.device()); at::Tensor DW = torch::empty({m, n}, options); cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(type); - to_fp16_cuda((void*)W.data_ptr(), (half*)DW.data_ptr(), m * n, stream); + + VLLM_DISPATCH_FLOATING_TYPES(DW.scalar_type(), "ggml_dequantize", [&] { + auto to_cuda = ggml_get_to_cuda(type); + to_cuda((void*)W.data_ptr(), (scalar_t*)DW.data_ptr(), m * n, stream); + }); + return DW; } diff --git a/csrc/quantization/gguf/moe.cuh b/csrc/quantization/gguf/moe.cuh index c10c59d7a38a7..df9b84abcc134 100644 --- a/csrc/quantization/gguf/moe.cuh +++ b/csrc/quantization/gguf/moe.cuh @@ -129,7 +129,7 @@ static __device__ __forceinline__ void moe_q( } #if defined(USE_ROCM) - #define MOE_X_Q4_0 64 + #define MOE_X_Q4_0 8 #define MOE_Y_Q4_0 128 #define NWARPS_Q4_0 8 #else @@ -190,7 +190,7 @@ static void ggml_moe_q4_0_q8_1_cuda( } #if defined(USE_ROCM) - #define MOE_X_Q4_1 64 + #define MOE_X_Q4_1 8 #define MOE_Y_Q4_1 128 #define NWARPS_Q4_1 8 #else @@ -251,7 +251,7 @@ static void ggml_moe_q4_1_q8_1_cuda( } #if defined(USE_ROCM) - #define MOE_X_Q5_0 64 + #define MOE_X_Q5_0 8 #define MOE_Y_Q5_0 128 #define NWARPS_Q5_0 8 #else @@ -312,7 +312,7 @@ static void ggml_moe_q5_0_q8_1_cuda( } #if defined(USE_ROCM) - #define MOE_X_Q5_1 64 + #define MOE_X_Q5_1 8 #define MOE_Y_Q5_1 128 #define NWARPS_Q5_1 8 #else @@ -373,7 +373,7 @@ static void ggml_moe_q5_1_q8_1_cuda( } #if defined(USE_ROCM) - #define MOE_X_Q8_0 64 + #define MOE_X_Q8_0 8 #define MOE_Y_Q8_0 128 #define NWARPS_Q8_0 8 #else @@ -434,7 +434,7 @@ static void 
ggml_moe_q8_0_q8_1_cuda( } #if defined(USE_ROCM) - #define MOE_X_Q2_K 64 + #define MOE_X_Q2_K 8 #define MOE_Y_Q2_K 128 #define NWARPS_Q2_K 8 #else @@ -495,7 +495,7 @@ static void ggml_moe_q2_K_q8_1_cuda( } #if defined(USE_ROCM) - #define MOE_X_Q3_K 64 + #define MOE_X_Q3_K 8 #define MOE_Y_Q3_K 128 #define NWARPS_Q3_K 8 #else @@ -556,7 +556,7 @@ static void ggml_moe_q3_K_q8_1_cuda( } #if defined(USE_ROCM) - #define MOE_X_Q4_K 64 + #define MOE_X_Q4_K 8 #define MOE_Y_Q4_K 128 #define NWARPS_Q4_K 8 #else @@ -617,7 +617,7 @@ static void ggml_moe_q4_K_q8_1_cuda( } #if defined(USE_ROCM) - #define MOE_X_Q5_K 64 + #define MOE_X_Q5_K 8 #define MOE_Y_Q5_K 128 #define NWARPS_Q5_K 8 #else @@ -678,7 +678,7 @@ static void ggml_moe_q5_K_q8_1_cuda( } #if defined(USE_ROCM) - #define MOE_X_Q6_K 64 + #define MOE_X_Q6_K 8 #define MOE_Y_Q6_K 128 #define NWARPS_Q6_K 8 #else diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 14d397d03e135..83bbd1e6816a8 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -1785,7 +1785,7 @@ __global__ void Marlin( <<>>( \ A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr, \ num_groups, prob_m, prob_n, prob_k, lda, locks, \ - use_atomic_add, use_fp32_reduce); \ + part_use_atomic_add, use_fp32_reduce); \ } \ } @@ -2215,6 +2215,10 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, thread_m_blocks = exec_cfg.max_m_blocks; } + // atomic add reduce have better performance only when m * n is small + bool part_use_atomic_add = + use_atomic_add && div_ceil(prob_m, 64) * prob_n <= 2048; + if (false) { } GPTQ_CALL_IF(vllm::kU4B8, 16, 4, 256) diff --git a/csrc/quantization/gptq_marlin/marlin.cuh b/csrc/quantization/gptq_marlin/marlin.cuh index 74ccbac57bd3c..f3b44641e77ee 100644 --- a/csrc/quantization/gptq_marlin/marlin.cuh +++ b/csrc/quantization/gptq_marlin/marlin.cuh @@ -9,7 +9,11 @@ #include #include -namespace 
marlin { +#ifndef MARLIN_NAMESPACE_NAME + #define MARLIN_NAMESPACE_NAME marlin +#endif + +namespace MARLIN_NAMESPACE_NAME { // Marlin params @@ -23,6 +27,7 @@ static constexpr int pipe_stages = static constexpr int min_thread_n = 64; static constexpr int min_thread_k = 64; +static constexpr int max_thread_n = 256; static constexpr int tile_size = 16; static constexpr int max_par = 16; @@ -84,4 +89,4 @@ __device__ inline void cp_async_wait() { #endif -} // namespace marlin +} // namespace MARLIN_NAMESPACE_NAME diff --git a/csrc/quantization/gptq_marlin/marlin_dtypes.cuh b/csrc/quantization/gptq_marlin/marlin_dtypes.cuh index be06c09bee331..cc16054814342 100644 --- a/csrc/quantization/gptq_marlin/marlin_dtypes.cuh +++ b/csrc/quantization/gptq_marlin/marlin_dtypes.cuh @@ -5,7 +5,11 @@ #include #include -namespace marlin { +#ifndef MARLIN_NAMESPACE_NAME + #define MARLIN_NAMESPACE_NAME marlin +#endif + +namespace MARLIN_NAMESPACE_NAME { template class ScalarType {}; @@ -54,7 +58,7 @@ class ScalarType { using FragS = Vec; using FragZP = Vec; -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 static __device__ float inline num2float(const nv_bfloat16 x) { return __bfloat162float(x); } @@ -74,6 +78,6 @@ class ScalarType { #endif }; -} // namespace marlin +} // namespace MARLIN_NAMESPACE_NAME #endif diff --git a/csrc/quantization/utils.cuh b/csrc/quantization/utils.cuh new file mode 100644 index 0000000000000..73055a1528744 --- /dev/null +++ b/csrc/quantization/utils.cuh @@ -0,0 +1,59 @@ +#pragma once + +/** + * Quantization utilities including: + * Adjusted maximum values for qtypes. + * Minimum scaling factors for qtypes. 
+ */ + +#include +#include + +#ifndef USE_ROCM + #include + #define MAYBE_HOST_DEVICE C10_HOST_DEVICE +#else + #include + #include + #include + // ROCm doesn't seem to need C10_HOST_DEVICE for static constexpr + #define MAYBE_HOST_DEVICE +#endif + +template || + std::is_same_v || + std::is_same_v>> +struct quant_type_max { + static constexpr T val() { return std::numeric_limits::max(); } +}; + +// Using the default max value from pytorch (240.0 0x7F) will cause accuracy +// issues when running dynamic quantization. Here use 224.0 0x7E for rocm. +template <> +struct quant_type_max { + static constexpr c10::Float8_e4m3fnuz val() { + return c10::Float8_e4m3fnuz(0x7E, c10::Float8_e4m3fnuz::from_bits()); + } +}; + +template +MAYBE_HOST_DEVICE static constexpr T quant_type_max_v = + quant_type_max::val(); + +template || + std::is_same_v || + std::is_same_v>> +struct min_scaling_factor { + C10_DEVICE C10_ALWAYS_INLINE static float val() { + return 1.0f / (quant_type_max_v * 512.0f); + } +}; + +template <> +struct min_scaling_factor { + C10_DEVICE C10_ALWAYS_INLINE static float val() { + return std::numeric_limits::epsilon(); + } +}; \ No newline at end of file diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index 8ab2af22f4d0c..2c3cae95e7f55 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -272,6 +272,7 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, @@ -291,6 +292,13 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( const int rowid = laneid / 16; const auto seq_idx = blockIdx.x; + // NOTE queries with sequence len > 1 are prefills and taken care by another + // 
kernel. + if (query_start_loc_ptr != nullptr && + (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx]) != 1) { + return; + } + const auto partition_idx = blockIdx.y; constexpr int T_PAR_SIZE = 256; // token partition size set to 256 @@ -377,9 +385,10 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( // fetch Q in shared across warps and then write to registers const int local_qhead_idx = 4 * warpid + rowid; const int global_qhead_idx = wg_start_head_idx + local_qhead_idx; - const int64_t seq_idx64 = static_cast(seq_idx); + const int64_t query_start_off = static_cast( + query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx); const scalar_t* q_ptr = - q + seq_idx64 * q_stride + global_qhead_idx * HEAD_SIZE; + q + query_start_off * q_stride + global_qhead_idx * HEAD_SIZE; const int qhead_element = lane16id * CONTIGUOUS_SCALAR_ELEMS_16B; if ((local_qhead_idx < GQA_RATIO) && (qhead_element < HEAD_SIZE)) { @@ -777,6 +786,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, @@ -794,6 +804,12 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( const int lane4id = laneid % 4; const auto seq_idx = blockIdx.x; + // NOTE queries with sequence len > 1 are prefills and taken care by another + // kernel. 
+ if (query_start_loc_ptr != nullptr && + (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) { + return; + } const auto partition_idx = blockIdx.y; const auto partition_size = blockDim.x; const auto max_num_partitions = gridDim.y; @@ -882,9 +898,11 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( } // fetch q elements - // every 4 lanes fetch 8 elems, so warp fetches 8*16 = 128 elems + // every 4 lanes fetch 8 elems, so warp fetches 8*16 = 128 elemsc + const int64_t query_start_off = static_cast( + query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx); const scalar_t* q_ptr = - q + seq_idx * q_stride + wg_start_head_idx * HEAD_SIZE; + q + query_start_off * q_stride + wg_start_head_idx * HEAD_SIZE; const _B16x8* q_ptrh8 = reinterpret_cast(q_ptr); const int qhead_elemh8 = laneid / 4; @@ -1267,10 +1285,19 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, // max_num_partitions, head_size] const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_partitions) { const auto num_heads = gridDim.x; const auto head_idx = blockIdx.x; const auto seq_idx = blockIdx.y; + + // NOTE queries with sequence len > 1 are prefills and taken care by another + // kernel. 
+ if (query_start_loc_ptr != nullptr && + (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) { + return; + } + const int context_len = context_lens[seq_idx]; const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; @@ -1439,7 +1466,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( __fdividef(1.0f, shared_global_exp_sum + 1e-6f); acc *= inv_global_exp_sum; - OUTT* out_ptr = out + static_cast(seq_idx) * num_heads * HEAD_SIZE + + const int64_t query_start_off = static_cast( + query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx); + OUTT* out_ptr = out + query_start_off * num_heads * HEAD_SIZE + static_cast(head_idx) * HEAD_SIZE; if constexpr (std::is_same::value) { out_ptr[threadIdx.x] = @@ -1466,6 +1495,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma16_kernel( const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, @@ -1492,6 +1522,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, @@ -1515,6 +1546,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] const int* __restrict__ context_lens, // [num_seqs] + const 
int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_partitions) { UNREACHABLE_CODE } @@ -1522,34 +1554,34 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( #endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support -#define LAUNCH_CUSTOM_ATTENTION_MFMA16(GQA_RATIO) \ - paged_attention_ll4mi_QKV_mfma16_kernel \ - <<>>( \ - query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ - block_tables_ptr, context_lens_ptr, max_num_blocks_per_seq, \ - alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \ - exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, max_ctx_blocks, \ - k_scale_ptr, v_scale_ptr); +#define LAUNCH_CUSTOM_ATTENTION_MFMA16(GQA_RATIO) \ + paged_attention_ll4mi_QKV_mfma16_kernel \ + <<>>( \ + query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ + block_tables_ptr, context_lens_ptr, query_start_loc_ptr, \ + max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \ + kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, \ + max_ctx_blocks, k_scale_ptr, v_scale_ptr); -#define LAUNCH_CUSTOM_ATTENTION_MFMA4(GQA_RATIO) \ - paged_attention_ll4mi_QKV_mfma4_kernel \ - <<>>( \ - query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ - block_tables_ptr, context_lens_ptr, max_num_blocks_per_seq, \ - alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \ - exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, max_ctx_blocks, \ - k_scale_ptr, v_scale_ptr); +#define LAUNCH_CUSTOM_ATTENTION_MFMA4(GQA_RATIO) \ + paged_attention_ll4mi_QKV_mfma4_kernel \ + <<>>( \ + query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ + block_tables_ptr, context_lens_ptr, query_start_loc_ptr, \ + max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \ + kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, \ + max_ctx_blocks, k_scale_ptr, v_scale_ptr); #define LAUNCH_CUSTOM_REDUCTION(NPAR_LOOPS) \ 
paged_attention_ll4mi_reduce_kernel \ <<>>( \ out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, \ - context_lens_ptr, max_num_partitions); + context_lens_ptr, query_start_loc_ptr, max_num_partitions); template & alibi_slopes, - torch::Tensor& k_scale, torch::Tensor& v_scale) { - int num_seqs = query.size(0); + const std::optional& query_start_loc, int max_context_len, + const std::optional& alibi_slopes, torch::Tensor& k_scale, + torch::Tensor& v_scale) { + int num_seqs = block_tables.size(0); int num_heads = query.size(1); int head_size = query.size(2); int max_num_blocks_per_seq = block_tables.size(1); @@ -1569,6 +1602,13 @@ void paged_attention_custom_launcher( int kv_block_stride = key_cache.stride(0); int kv_head_stride = key_cache.stride(1); + // NOTE: query start location is optional for V0 decode should not be used. + // If batch contains mix of prefills and decode, prefills should be skipped. + const int* query_start_loc_ptr = + query_start_loc + ? reinterpret_cast(query_start_loc.value().data_ptr()) + : nullptr; + // NOTE: alibi_slopes is optional. 
const float* alibi_slopes_ptr = alibi_slopes @@ -1700,8 +1740,8 @@ void paged_attention_custom_launcher( paged_attention_custom_launcher( \ out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ - num_kv_heads, scale, block_tables, context_lens, max_context_len, \ - alibi_slopes, k_scale, v_scale); + num_kv_heads, scale, block_tables, context_lens, query_start_loc, \ + max_context_len, alibi_slopes, k_scale, v_scale); #define CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \ PSIZE) \ @@ -1750,6 +1790,7 @@ void paged_attention( double scale, torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] torch::Tensor& context_lens, // [num_seqs] + const std::optional& query_start_loc, // [num_seqs] int64_t block_size, int64_t max_context_len, const std::optional& alibi_slopes, const std::string& kv_cache_dtype, torch::Tensor& k_scale, diff --git a/csrc/rocm/ops.h b/csrc/rocm/ops.h index ba161951772ad..b90cfdc617afd 100644 --- a/csrc/rocm/ops.h +++ b/csrc/rocm/ops.h @@ -2,13 +2,23 @@ #include +torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b, + const int64_t rows_per_block); + +torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b, + const int64_t CuCount); + +void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c, + at::Tensor& scale_a, at::Tensor& scale_b, const int64_t CuCount); + void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, - torch::Tensor& context_lens, int64_t block_size, - int64_t max_context_len, + torch::Tensor& context_lens, + const std::optional& query_start_loc, + int64_t block_size, int64_t max_context_len, const std::optional& alibi_slopes, const std::string& kv_cache_dtype, torch::Tensor& k_scale, torch::Tensor& v_scale); diff --git a/csrc/rocm/skinny_gemms.cu 
b/csrc/rocm/skinny_gemms.cu new file mode 100644 index 0000000000000..29dbbe8e35e8f --- /dev/null +++ b/csrc/rocm/skinny_gemms.cu @@ -0,0 +1,1600 @@ +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include "cuda_compat.h" +#include "dispatch_utils.h" +#include "quantization/fp8/common.cuh" + +#if defined(__HIPCC__) && (defined(__gfx90a__) || defined(__gfx942__)) + #define __HIP__MI300_MI250__ +#endif + +#if defined(__HIPCC__) && defined(__gfx942__) + #define __HIP__MI300__ +#endif + +#if defined(NDEBUG) + #undef NDEBUG + #include + #define UNREACHABLE_CODE assert(false); + #define NDEBUG +#else + #define UNREACHABLE_CODE assert(false); +#endif + +template +struct scalar {}; + +template +struct scalar2 {}; + +template +__device__ __forceinline__ float2 __s22float2(T v); + +template +__device__ __forceinline__ T __float2s(float v); + +template +__device__ __forceinline__ T __float22s2_rn(float2 v); + +// Definitions and cvt functions for fp16 +template <> +struct scalar { + using type = half; +}; + +template <> +struct scalar2 { + using type = __half2; +}; + +template <> +__device__ __forceinline__ half __float2s(float v) { + return __float2half(v); +} + +template <> +__device__ __forceinline__ float2 __s22float2(__half2 v) { + return __half22float2(v); +} + +template <> +__device__ __forceinline__ __half2 __float22s2_rn(float2 v) { + return __float22half2_rn(v); +} + +// Definitions and cvt functions for bf16 +template <> +struct scalar { + using type = __hip_bfloat16; +}; + +template <> +struct scalar2 { + using type = __hip_bfloat162; +}; + +template <> +__device__ __forceinline__ __hip_bfloat16 __float2s(float v) { + return __float2bfloat16(v); +} + +template <> +__device__ __forceinline__ float2 __s22float2(__hip_bfloat162 v) { + return __bfloat1622float2(v); +} + +template <> +__device__ __forceinline__ __hip_bfloat162 __float22s2_rn(float2 v) { + return __float22bfloat162_rn(v); +} + +template +__device__ __forceinline__ 
T loadnt(T* addr) { + return __builtin_nontemporal_load(addr); +} + +__device__ __forceinline__ float4 load_ntmprl(const float4* addr) { + auto addr_alias = reinterpret_cast(addr); + auto dat0 = loadnt(addr_alias); + auto dat1 = loadnt(addr_alias + 1); + auto dat2 = loadnt(addr_alias + 2); + auto dat3 = loadnt(addr_alias + 3); + return make_float4(dat0, dat1, dat2, dat3); +} + +// TBlock fetches entire rows of A, and entire col of B (K dimension); assume +// N=1 for time being grid is M/A_NUM_ROWS blocks +template +__global__ void LLGemm1_kernel(const scalar_t* in_a, const scalar_t* in_b, + scalar_t* out_c, const int K) { + using scalar2_t = typename scalar2::type; + auto af4 = reinterpret_cast(in_a); + auto bf4 = reinterpret_cast(in_b); + auto c = reinterpret_cast(out_c); + __shared__ float red_smem[NUM_A_ROWS_PER_BLOCK][WARP_SIZE]; + const int row_addr = blockIdx.x * NUM_A_ROWS_PER_BLOCK * K / 8; + const int threadid = threadIdx.x; + const int warp = threadIdx.x / WARP_SIZE; + const int lane = threadIdx.x % WARP_SIZE; + const int num_warps = blockDim.x / WARP_SIZE; + const int qwarpid = threadid / num_warps; + const int qthreadid = threadid % num_warps; + float4 rowA_elem4[NUM_A_ROWS_PER_BLOCK]; + scalar2_t colB_elem4x, colB_elem4y, colB_elem4z, colB_elem4w; + float acc[NUM_A_ROWS_PER_BLOCK]; + scalar2_t acch2; + scalar2_t oval; + + // As we later use warp shuffle operations, we may have more threads in the + // block than the actual available data, hence the if guard here. + if (threadid * 8 < K) { +#pragma unroll + for (int i = 0; i < NUM_A_ROWS_PER_BLOCK; i++) { + // rowA_elem4[i] holds 8 * half numbers seen as a single float4. 
+ rowA_elem4[i] = load_ntmprl(&af4[row_addr + threadid + K / 8 * i]); + } + } + + colB_elem4x = bf4[threadid * 4 + 0]; + colB_elem4y = bf4[threadid * 4 + 1]; + colB_elem4z = bf4[threadid * 4 + 2]; + colB_elem4w = bf4[threadid * 4 + 3]; + + scalar2_t Af2; + scalar2_t Bf2; + float2 S; + + auto Ah2ptr = reinterpret_cast(&rowA_elem4); + scalar2_t* ah2lptr; + +#pragma unroll + for (int i = 0; i < NUM_A_ROWS_PER_BLOCK; i++) { + // Multiply-add on 8 scalar_t. + ah2lptr = Ah2ptr + i * 4; + Af2 = *(ah2lptr); + acch2 = __hmul2(Af2, colB_elem4x); + Af2 = *(ah2lptr + 1); + acch2 = __hfma2(Af2, colB_elem4y, acch2); + Af2 = *(ah2lptr + 2); + acch2 = __hfma2(Af2, colB_elem4z, acch2); + Af2 = *(ah2lptr + 3); + acch2 = __hfma2(Af2, colB_elem4w, acch2); + S = __s22float2(acch2); + + // See comment above concerning the if guard. + acc[i] = (threadid * 8 < K ? S.x + S.y : 0.f); + } + +// all reduce across warp. +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { +#pragma unroll + for (int i = 0; i < NUM_A_ROWS_PER_BLOCK; i++) { + acc[i] += __shfl_xor(acc[i], mask); + } + } + + // Warp leaders store the data to shared memory. + if (lane < NUM_A_ROWS_PER_BLOCK) { + red_smem[lane][warp] = acc[lane]; + } + + // Make sure the data is in shared memory. + __syncthreads(); + + if (qwarpid < NUM_A_ROWS_PER_BLOCK) { + acc[qwarpid] = qthreadid < num_warps ? 
red_smem[qwarpid][qthreadid] : 0.f; + for (int mask = num_warps / 2; mask >= 1; mask /= 2) { + acc[qwarpid] += __shfl_xor(acc[qwarpid], mask); + } + float oval2 = __shfl_xor(acc[qwarpid], num_warps); + + if (lane % (num_warps * 2) == 0) { + oval = __float22s2_rn(make_float2(acc[qwarpid], oval2)); + c[blockIdx.x * NUM_A_ROWS_PER_BLOCK / 2 + qwarpid / 2] = oval; + } + } +} + +torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b, + const int64_t rows_per_block) { + auto M = in_a.size(0); + auto K = in_a.size(1); + auto N = in_b.size(0); + + TORCH_CHECK(N == 1, "Row number of activation tensor must be 1."); + TORCH_CHECK(in_a.dtype() == in_b.dtype()); + TORCH_CHECK(in_b.dtype() == torch::kFloat16 || + in_b.dtype() == torch::kBFloat16); + + auto out_c = torch::empty( + {N, M}, torch::TensorOptions().dtype(in_b.dtype()).device(in_b.device())); + + // NUM_TREADS need to be a multiple of WARP_SIZE, as we are using warp shuffle + // operations. + const int NUM_THREADS = + K * 2 / 16 % WARP_SIZE == 0 + ? K * 2 / 16 + : K * 2 / 16 + (WARP_SIZE - K * 2 / 16 % WARP_SIZE); + + int NUM_BLOCKS = M / rows_per_block; + + const at::cuda::OptionalCUDAGuard device_guard(device_of(in_b)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + // call the kernel function... 
+ AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "LLGemm1", [&] { + auto a_ptr = in_a.data_ptr(); + auto b_ptr = in_b.data_ptr(); + auto c_ptr = out_c.data_ptr(); + if (rows_per_block == 2) { + LLGemm1_kernel + <<>>(a_ptr, b_ptr, c_ptr, K); + } else if (rows_per_block == 4) { + LLGemm1_kernel + <<>>(a_ptr, b_ptr, c_ptr, K); + } else if (rows_per_block == 8) { + LLGemm1_kernel + <<>>(a_ptr, b_ptr, c_ptr, K); + } else if (rows_per_block == 16) { + LLGemm1_kernel + <<>>(a_ptr, b_ptr, c_ptr, K); + } else { + NUM_BLOCKS = M / 4; + LLGemm1_kernel + <<>>(a_ptr, b_ptr, c_ptr, K); + } + }); + + return out_c; +} + +#define DOT2C(V0, V2, V3) \ + if constexpr (std::is_same_v) { \ + asm("v_dot2c_f32_f16 %0, %2, %3" : "=v"(V0) : "0"(V0), "v"(V2), "v"(V3)); \ + } else if constexpr (std::is_same_v) { \ + float2 s = __bfloat1622float2(*((__hip_bfloat162*)(&(V2)))) * \ + __bfloat1622float2(*((__hip_bfloat162*)(&(V3)))); \ + V0 += (s.x + s.y); \ + } + +#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support +// This version targets cases where A[] fits LDS capacity +template +__global__ void __launch_bounds__(WvPrGrp* THRDS) + wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B, + const scalar_t* __restrict__ A, scalar_t* C, + const int _WvPrGrp, const int CuCount) { + using scalar8 = + __attribute__((__vector_size__((A_CHUNK / 2) * sizeof(float)))) float; + union bigType { + scalar_t h[A_CHUNK]; + float f[A_CHUNK / 2]; + float2 f2[A_CHUNK / 4]; + double d[A_CHUNK / 4]; + scalar8 h8; + }; + + //---------------------------------------------------- + // Reserving 64 KB of LDS to have 1 WG / CU + // Goal is to bring the activation matrix A to the LDS + // and use it across the lifetime of the work group + // TODO: When activation matrix is larger than 64 KB + // then this is not goint to work! 
+ //---------------------------------------------------- + __shared__ scalar_t s[1024 * 32]; + + //---------------------------------------------------- + // Fetch the activation matrix to LDS + // Loop iteration: + // - Each thread (lane) is fetching 8 elements (A_Chunk) + // - Each wave will fetch 64*8=> 512 elements + // - Each WG will fetch 512 * 16 => 8K elements + // - Then the WG will move to another 8 K elements + // TODO: Logic below will only work when K is multiple of 8 + //---------------------------------------------------- + for (uint32_t k = 0; k < min(K * N, 32 * 1024); + k += THRDS * WvPrGrp * A_CHUNK) { + uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK); + + if (k_in >= min(K * N, 32 * 1024)) break; + + *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in])); + } + __syncthreads(); + + if (threadIdx.y >= _WvPrGrp) return; + + uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE; + + float sum[N][YTILE]; + + //---------------------------------------------------- + // Each wave works on a single column of weight matrix. + // There are 16 waves per WG, and hence, each WG is + // working on 16 columns of weight matrix. Moreover, + // we tile in column direction by YTILE, so when YTILE=1 + // the above math is right, however, when YTILE=2 then + // each wave will be working on 2 columns and WG will + // be working on 32 columns. + // + // Top level loop that makes WGs persistent! + // - WGs iterates across columns of weight matrix + // - Each wave within WG works on a given column(s) + // - After completing first set of columns, WGs start + // working on the next set of available columns + //---------------------------------------------------- + while (m < M) { + //---------------------------------------------------- + // 'sum' accumulates the matrix A x B computation + // split across 64 lanes. + // + // YTILE represents how many column of weight matrix + // are being worked on by each wave. 
+ //---------------------------------------------------- + for (int i = 0; i < YTILE; i++) + for (int n = 0; n < N; n++) sum[n][i] = 0; + + bigType bigA[N][UNRL]; + bigType bigB[YTILE][UNRL]; + //---------------------------------------------------- + // Fetch weight matrix B in interleaved K-split! + // - Each thread (lane) is fetching 8 elements (A_Chunk) + // - Each wave will fetch 64*8=> 512 elements (1024B) + // - YTILE represents the number of column being serviced + // by wave + // - Loop for fetching weight matrix (B) are unrolled + // + // Fetch activation matrix A from LDS + // - Loop for fetching activation matrix (A) are unrolled + // + // Finally, do the matrix multiplication in an unrolled + // fashion. This provides lot of food for compiler + // scheduling. + // + // TODO: Logic below will only work when K is multiple of 8 + //---------------------------------------------------- + // for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) { + for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) { + // Fetch the weight matrix from memory! 
+ #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + const scalar_t* B_ = &B[(m + 0) * K + k_]; + bigB[0][k2].h8 = (loadnt((scalar8*)(&B_[0 * K]))); + //---------------------------------------------------- + // The following code with YTILE > 1 has to be deleted + //---------------------------------------------------- + if constexpr (YTILE >= 2) + bigB[1][k2].h8 = (loadnt((scalar8*)(&B_[1 * K]))); + if constexpr (YTILE >= 3) + bigB[2][k2].h8 = (loadnt((scalar8*)(&B_[2 * K]))); + if constexpr (YTILE >= 4) + bigB[3][k2].h8 = (loadnt((scalar8*)(&B_[3 * K]))); + if constexpr (YTILE >= 5) + bigB[4][k2].h8 = (loadnt((scalar8*)(&B_[4 * K]))); + if constexpr (YTILE >= 6) + bigB[5][k2].h8 = (loadnt((scalar8*)(&B_[5 * K]))); + if constexpr (YTILE >= 7) + bigB[6][k2].h8 = (loadnt((scalar8*)(&B_[6 * K]))); + if constexpr (YTILE >= 8) + bigB[7][k2].h8 = (loadnt((scalar8*)(&B_[7 * K]))); + } + + // Fetch activation matrix from either just LDS or from both LDS / memory + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + // Fetch A activation matrix in interleaved fashion from LDS or memory + + for (int n = 0; n < N; n++) { + bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n]))); + } + } + + // Do the matrix multiplication in interleaved manner + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + // Do the matrix multiplication of activation and weight matrix + // - Remember the accumulation is happening for K-split of 64! 
+ #pragma unroll + for (uint32_t n = 0; n < N; n++) { + #pragma unroll + for (uint32_t b = 0; b < A_CHUNK / 2; b++) { + DOT2C(sum[n][0], bigA[n][k2].f[b], bigB[0][k2].f[b]) + //---------------------------------------------------- + // The following code with YTILE > 1 + //---------------------------------------------------- + if constexpr (YTILE >= 2) { + DOT2C(sum[n][1], bigA[n][k2].f[b], bigB[1][k2].f[b]); + } + if constexpr (YTILE >= 3) { + DOT2C(sum[n][2], bigA[n][k2].f[b], bigB[2][k2].f[b]); + } + if constexpr (YTILE >= 4) { + DOT2C(sum[n][3], bigA[n][k2].f[b], bigB[3][k2].f[b]); + } + if constexpr (YTILE >= 5) { + DOT2C(sum[n][4], bigA[n][k2].f[b], bigB[4][k2].f[b]); + } + if constexpr (YTILE >= 6) { + DOT2C(sum[n][5], bigA[n][k2].f[b], bigB[5][k2].f[b]); + } + if constexpr (YTILE >= 7) { + DOT2C(sum[n][6], bigA[n][k2].f[b], bigB[6][k2].f[b]); + } + if constexpr (YTILE >= 8) { + DOT2C(sum[n][7], bigA[n][k2].f[b], bigB[7][k2].f[b]); + } + } + } + } + } + + //---------------------------------------------------- + // Final reduction step using shuffle + //---------------------------------------------------- + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:8 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:4 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:2 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 wave_shr:1 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:15 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:31 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), 
"v"(sum[n][y]), "v"(sum[n][y])); + } + } + if (threadIdx.x == 63) { + for (int n = 0; n < N; n++) { + for (int i = 0; i < YTILE; i++) { + // if (commitColumn[i]) C[m + i + n * M] = __float2half(sum[n][i]); + C[m + i + n * M] = __float2s(sum[n][i]); + } + } + } + + m += CuCount * _WvPrGrp * YTILE; + } +} +#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support +template +__global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B, + const scalar_t* __restrict__ A, scalar_t* C, + const int _WvPrGrp, const int CuCount) { + UNREACHABLE_CODE +} +#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support + +#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support +// This version targets cases where A[] marginally exceeds LDS capacity +template +__global__ void __launch_bounds__(WvPrGrp* THRDS) + wvSplitK_hf_(const int K, const int M, const scalar_t* B, + const scalar_t* __restrict__ A, scalar_t* C, + const int _WvPrGrp, const int CuCount) { + using scalar8 = + __attribute__((__vector_size__((A_CHUNK / 2) * sizeof(float)))) float; + union bigType { + scalar_t h[A_CHUNK]; + float f[A_CHUNK / 2]; + float2 f2[A_CHUNK / 4]; + double d[A_CHUNK / 4]; + scalar8 h8; + }; + + //---------------------------------------------------- + // Reserving 64 KB of LDS to have 1 WG / CU + // Goal is to bring the activation matrix A to the LDS + // and use it across the lifetime of the work group + // TODO: When activation matrix is larger than 64 KB + // then this is not goint to work! + //---------------------------------------------------- + __shared__ scalar_t s[1024 * 32]; + + //---------------------------------------------------- + // Computation of columns that need to be committed to memory! 
+ //---------------------------------------------------- + uint32_t commitColumn[YTILE]; + for (uint32_t i = 0; i < YTILE; i++) { + commitColumn[i] = 1; + } + + //---------------------------------------------------- + // Indexing function into the column of weight matrix B + // Algorithm does 64 lane k-splitting / wave and uses + // WG ID and Thread ID to find the index. + //---------------------------------------------------- + // int _WvPrGrp = mindiv(N, CuCount * YTILE, WvPrGrp); + uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE; + + // Check whether there will be fragmenation! + // This will happen only for the last wave! + if (m < M && (m + YTILE) >= M) { + uint32_t startColumn = M - YTILE; + for (uint32_t i = 0; i < (m - startColumn); i++) { + commitColumn[i] = 0; + } + m = startColumn; + } + + //---------------------------------------------------- + // Fetch the activation matrix to LDS + // Loop iteration: + // - Each thread (lane) is fetching 8 elements (A_Chunk) + // - Each wave will fetch 64*8=> 512 elements + // - Each WG will fetch 512 * 16 => 8K elements + // - Then the WG will move to another 8 K elements + // TODO: Logic below will only work when K is multiple of 8 + //---------------------------------------------------- + for (uint32_t k = 0; k < min(K * N, 32 * 1024); + k += THRDS * WvPrGrp * A_CHUNK) { + uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK); + + if (k_in >= min(K * N, 32 * 1024)) break; + + *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in])); + } + + __syncthreads(); + + if (threadIdx.y >= _WvPrGrp) return; + + float sum[N][YTILE]; + + //---------------------------------------------------- + // Each wave works on a single column of weight matrix. + // There are 16 waves per WG, and hence, each WG is + // working on 16 columns of weight matrix. 
Moreover, + // we tile in column direction by YTILE, so when YTILE=1 + // the above math is right, however, when YTILE=2 then + // each wave will be working on 2 columns and WG will + // be working on 32 columns. + // + // Top level loop that makes WGs persistent! + // - WGs iterates across columns of weight matrix + // - Each wave within WG works on a given column(s) + // - After completing first set of columns, WGs start + // working on the next set of available columns + //---------------------------------------------------- + while (m < M) { + //---------------------------------------------------- + // 'sum' accumulates the matrix A x B computation + // split across 64 lanes. + // + // YTILE represents how many column of weight matrix + // are being worked on by each wave. + //---------------------------------------------------- + for (int i = 0; i < YTILE; i++) + for (int n = 0; n < N; n++) sum[n][i] = 0; + + bigType bigA[N][UNRL]; + bigType bigB[YTILE][UNRL]; + //---------------------------------------------------- + // Fetch weight matrix B in interleaved K-split! + // - Each thread (lane) is fetching 8 elements (A_Chunk) + // - Each wave will fetch 64*8=> 512 elements (1024B) + // - YTILE represents the number of column being serviced + // by wave + // - Loop for fetching weight matrix (B) are unrolled + // + // Fetch activation matrix A from LDS + // - Loop for fetching activation matrix (A) are unrolled + // + // Finally, do the matrix multiplication in an unrolled + // fashion. This provides lot of food for compiler + // scheduling. + // + // TODO: Logic below will only work when K is multiple of 8 + //---------------------------------------------------- + for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) { + // Fetch the weight matrix from memory! 
+ #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + const scalar_t* B_ = &B[(m + 0) * K + k_]; + bigB[0][k2].h8 = (loadnt((scalar8*)(&B_[0 * K]))); + //---------------------------------------------------- + // The following code with YTILE > 1 has to be deleted + //---------------------------------------------------- + if constexpr (YTILE >= 2) + bigB[1][k2].h8 = (loadnt((scalar8*)(&B_[1 * K]))); + if constexpr (YTILE >= 3) + bigB[2][k2].h8 = (loadnt((scalar8*)(&B_[2 * K]))); + if constexpr (YTILE >= 4) + bigB[3][k2].h8 = (loadnt((scalar8*)(&B_[3 * K]))); + if constexpr (YTILE >= 5) + bigB[4][k2].h8 = (loadnt((scalar8*)(&B_[4 * K]))); + if constexpr (YTILE >= 6) + bigB[5][k2].h8 = (loadnt((scalar8*)(&B_[5 * K]))); + if constexpr (YTILE >= 7) + bigB[6][k2].h8 = (loadnt((scalar8*)(&B_[6 * K]))); + if constexpr (YTILE >= 8) + bigB[7][k2].h8 = (loadnt((scalar8*)(&B_[7 * K]))); + } + + // Fetch activation matrix from either just LDS or from both LDS / memory + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + // Fetch A activation matrix in interleaved fashion from LDS or memory + + for (int n = 0; n < N; n++) { + if (k_ + K * n < 32 * 1024) + bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n]))); + else + bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n]))); + } + } + + // Do the matrix multiplication in interleaved manner + #pragma unroll + for (uint32_t n = 0; n < N; n++) { + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + // Do the matrix multiplication of activation and weight matrix + // - Remember the accumulation is happening for K-split of 64! 
+ #pragma unroll + for (uint32_t b = 0; b < A_CHUNK / 2; b++) { + DOT2C(sum[n][0], bigA[n][k2].f[b], bigB[0][k2].f[b]); + //---------------------------------------------------- + // The following code with YTILE > 1 + //---------------------------------------------------- + if constexpr (YTILE >= 2) { + DOT2C(sum[n][1], bigA[n][k2].f[b], bigB[1][k2].f[b]); + } + if constexpr (YTILE >= 3) { + DOT2C(sum[n][2], bigA[n][k2].f[b], bigB[2][k2].f[b]); + } + if constexpr (YTILE >= 4) { + DOT2C(sum[n][3], bigA[n][k2].f[b], bigB[3][k2].f[b]); + } + if constexpr (YTILE >= 5) { + DOT2C(sum[n][4], bigA[n][k2].f[b], bigB[4][k2].f[b]); + } + if constexpr (YTILE >= 6) { + DOT2C(sum[n][5], bigA[n][k2].f[b], bigB[5][k2].f[b]); + } + if constexpr (YTILE >= 7) { + DOT2C(sum[n][6], bigA[n][k2].f[b], bigB[6][k2].f[b]); + } + if constexpr (YTILE >= 8) { + DOT2C(sum[n][7], bigA[n][k2].f[b], bigB[7][k2].f[b]); + } + } + } + } + } + + //---------------------------------------------------- + // Final reduction step using shuffle + //---------------------------------------------------- + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:8 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:4 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:2 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 wave_shr:1 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:15 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:31 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + } + } + + if (threadIdx.x == 
63) { + for (int n = 0; n < N; n++) { + for (int i = 0; i < YTILE; i++) { + if (commitColumn[i]) + C[m + i + n * M] = __float2s(sum[n][i]); + } + } + } + + m += CuCount * _WvPrGrp * YTILE; + + // Check whether there will be fragmenation! + // This will happen only for the last wave! + if (m < M && (m + YTILE) >= M) { + uint32_t startColumn = M - YTILE; + for (uint32_t i = 0; i < (m - startColumn); i++) { + commitColumn[i] = 0; + } + m = startColumn; + } + } +} + +#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support +template +__global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B, + const scalar_t* __restrict__ A, scalar_t* C, + const int _WvPrGrp, const int CuCount) { + UNREACHABLE_CODE +} +#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support + +#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support +// This version targets big A[] cases, where it is much larger than LDS capacity +template +__global__ void __launch_bounds__(WvPrGrp* THRDS) + wvSplitK_hf_big_(const int K, const int M, const scalar_t* B, + const scalar_t* __restrict__ A, scalar_t* C, + const int _WvPrGrp, const int CuCount) { + using scalar8 = + __attribute__((__vector_size__((A_CHUNK / 2) * sizeof(float)))) float; + + union bigType { + scalar_t h[A_CHUNK]; + float f[A_CHUNK / 2]; + float2 f2[A_CHUNK / 4]; + double d[A_CHUNK / 4]; + scalar8 h8; + }; + + //---------------------------------------------------- + // Reserving 64 KB of LDS to have 1 WG / CU + // Goal is to bring the activation matrix A to the LDS + // and use it across the lifetime of the work group + // TODO: When activation matrix is larger than 64 KB + // then this is not goint to work! + //---------------------------------------------------- + __shared__ scalar_t s[1024 * 32]; + + //---------------------------------------------------- + // Computation of columns that need to be committed to memory! 
+ //---------------------------------------------------- + uint32_t commitColumn[YTILE]; + for (uint32_t i = 0; i < YTILE; i++) { + commitColumn[i] = 1; + } + + // int _WvPrGrp = mindiv(N, CuCount * YTILE, WvPrGrp); + if (threadIdx.y >= _WvPrGrp) return; + + //---------------------------------------------------- + // Indexing function into the column of weight matrix B + // Algorithm does 64 lane k-splitting / wave and uses + // WG ID and Thread ID to find the index. + //---------------------------------------------------- + uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE; + + // Check whether there will be fragmenation! + // This will happen only for the last wave! + if (m < M && (m + YTILE) >= M) { + uint32_t startColumn = M - YTILE; + for (uint32_t i = 0; i < (m - startColumn); i++) { + commitColumn[i] = 0; + } + m = startColumn; + } + + //---------------------------------------------------- + // Fetch the activation matrix to LDS + // Loop iteration: + // - Each thread (lane) is fetching 8 elements (A_Chunk) + // - Each wave will fetch 64*8=> 512 elements + // - Each WG will fetch 512 * 16 => 8K elements + // - Then the WG will move to another 8 K elements + // TODO: Logic below will only work when K is multiple of 8 + //---------------------------------------------------- + #define PCML + #ifndef PCML + for (uint32_t k = 0; k < min(K * N, 32 * 1024); + k += THRDS * WvPrGrp * A_CHUNK) { + uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK); + + if (k_in >= min(K * N, 32 * 1024)) break; + + *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in])); + } + __syncthreads(); + #endif + + #define TUC (THRDS * UNRL * A_CHUNK) + uint32_t kBase = 0; + // find biggest k size that fits in LDS + uint32_t kFit = (32 * 1024) / N; + // kFit = (kFit%TWC==0) ? kFit : (kFit-kFit%TWC+TWC); //round up to multiple + // of TUC + kFit = (kFit % TUC == 0) + ? 
kFit + : (kFit - kFit % TUC); // round up to multiple of TUC + // if (kFit == 0) kFit = TUC; + kFit = min(kFit, K); + + float sum[N][YTILE]; + + //---------------------------------------------------- + // Each wave works on a single column of weight matrix. + // There are 16 waves per WG, and hence, each WG is + // working on 16 columns of weight matrix. Moreover, + // we tile in column direction by YTILE, so when YTILE=1 + // the above math is right, however, when YTILE=2 then + // each wave will be working on 2 columns and WG will + // be working on 32 columns. + // + // Top level loop that makes WGs persistent! + // - WGs iterates across columns of weight matrix + // - Each wave within WG works on a given column(s) + // - After completing first set of columns, WGs start + // working on the next set of available columns + //---------------------------------------------------- + #ifdef PCML + int YW = (YTILE * _WvPrGrp); + uint32_t Mrndp = (M % YW == 0) ? M : (M - M % YW + YW); + while (m < Mrndp) { + #else + while (m < M) { + #endif + //---------------------------------------------------- + // 'sum' accumulates the matrix A x B computation + // split across 64 lanes. + // + // YTILE represents how many column of weight matrix + // are being worked on by each wave. + //---------------------------------------------------- + for (int i = 0; i < YTILE; i++) + for (int n = 0; n < N; n++) sum[n][i] = 0; + + bigType bigA[N][UNRL]; + bigType bigB[YTILE][UNRL]; + //---------------------------------------------------- + // Fetch weight matrix B in interleaved K-split! 
+ // - Each thread (lane) is fetching 8 elements (A_Chunk) + // - Each wave will fetch 64*8=> 512 elements (1024B) + // - YTILE represents the number of column being serviced + // by wave + // - Loop for fetching weight matrix (B) are unrolled + // + // Fetch activation matrix A from LDS + // - Loop for fetching activation matrix (A) are unrolled + // + // Finally, do the matrix multiplication in an unrolled + // fashion. This provides lot of food for compiler + // scheduling. + // + // TODO: Logic below will only work when K is multiple of 8 + //---------------------------------------------------- + for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) { + #ifdef PCML + if ((k1 == 0) || (k1 == kBase + kFit)) { // load next chunk of A[] to LDS + if (k1 != 0) kBase += kFit; + __syncthreads(); + for (uint32_t k = 0; k < kFit; k += THRDS * _WvPrGrp * A_CHUNK) { + uint32_t kOff = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK); + if (kBase + kOff >= K) break; + if (kOff >= kFit) break; + for (uint32_t n = 0; n < N; n++) { + uint32_t k_in = kBase + n * K + kOff; + uint32_t k_ot = n * kFit + kOff; + *((bigType*)(&s[k_ot])) = *((bigType*)(&A[k_in])); + } + } + __syncthreads(); + } + if (m >= M) continue; + #endif + + // Fetch the weight matrix from memory! 
+ #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + const scalar_t* B_ = &B[(m + 0) * K + k_]; + bigB[0][k2].h8 = (loadnt((scalar8*)(&B_[0 * K]))); + //---------------------------------------------------- + // The following code with YTILE > 1 has to be deleted + //---------------------------------------------------- + if constexpr (YTILE >= 2) + bigB[1][k2].h8 = (loadnt((scalar8*)(&B_[1 * K]))); + if constexpr (YTILE >= 3) + bigB[2][k2].h8 = (loadnt((scalar8*)(&B_[2 * K]))); + if constexpr (YTILE >= 4) + bigB[3][k2].h8 = (loadnt((scalar8*)(&B_[3 * K]))); + if constexpr (YTILE >= 5) + bigB[4][k2].h8 = (loadnt((scalar8*)(&B_[4 * K]))); + if constexpr (YTILE >= 6) + bigB[5][k2].h8 = (loadnt((scalar8*)(&B_[5 * K]))); + if constexpr (YTILE >= 7) + bigB[6][k2].h8 = (loadnt((scalar8*)(&B_[6 * K]))); + if constexpr (YTILE >= 8) + bigB[7][k2].h8 = (loadnt((scalar8*)(&B_[7 * K]))); + } + + // Fetch activation matrix from either just LDS or from both LDS / memory + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + // Fetch A activation matrix in interleaved fashion from LDS or memory + + for (int n = 0; n < N; n++) { + #ifdef PCML + bigA[n][k2] = *((const bigType*)(&(s[k_ - kBase + kFit * n]))); + #else + if (k_ + K * n < 32 * 1024) + bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n]))); + else + bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n]))); + #endif + } + } + + // Do the matrix multiplication in interleaved manner + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + #pragma unroll + for (uint32_t n = 0; n < N; n++) { + // Do the matrix multiplication of activation and weight matrix + // - Remember the accumulation is 
happening for K-split of 64! + #pragma unroll + for (uint32_t b = 0; b < A_CHUNK / 2; b++) { + DOT2C(sum[n][0], bigA[n][k2].f[b], bigB[0][k2].f[b]); + //---------------------------------------------------- + // The following code with YTILE > 1 + //---------------------------------------------------- + if constexpr (YTILE >= 2) { + DOT2C(sum[n][1], bigA[n][k2].f[b], bigB[1][k2].f[b]); + } + if constexpr (YTILE >= 3) { + DOT2C(sum[n][2], bigA[n][k2].f[b], bigB[2][k2].f[b]); + } + if constexpr (YTILE >= 4) { + DOT2C(sum[n][3], bigA[n][k2].f[b], bigB[3][k2].f[b]); + } + if constexpr (YTILE >= 5) { + DOT2C(sum[n][4], bigA[n][k2].f[b], bigB[4][k2].f[b]); + } + if constexpr (YTILE >= 6) { + DOT2C(sum[n][5], bigA[n][k2].f[b], bigB[5][k2].f[b]); + } + if constexpr (YTILE >= 7) { + DOT2C(sum[n][6], bigA[n][k2].f[b], bigB[6][k2].f[b]); + } + if constexpr (YTILE >= 8) { + DOT2C(sum[n][7], bigA[n][k2].f[b], bigB[7][k2].f[b]); + } + } + } + } + } + + #ifdef PCML + if (m >= M) { + m += CuCount * _WvPrGrp * YTILE; + kBase = 0; + continue; + } + #endif + + //---------------------------------------------------- + // Final reduction step using shuffle + //---------------------------------------------------- + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:8 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:4 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:2 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 wave_shr:1 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:15 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, 
%3 row_bcast:31 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + } + } + + if (threadIdx.x == 63) { + for (int n = 0; n < N; n++) { + for (int i = 0; i < YTILE; i++) { + if (commitColumn[i]) + C[m + i + n * M] = __float2s(sum[n][i]); + } + } + } + + m += CuCount * _WvPrGrp * YTILE; + kBase = 0; + + // Check whether there will be fragmenation! + // This will happen only for the last wave! + if (m < M && (m + YTILE) >= M) { + uint32_t startColumn = M - YTILE; + for (uint32_t i = 0; i < (m - startColumn); i++) { + commitColumn[i] = 0; + } + m = startColumn; + } + } +} +#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support +template +__global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B, + const scalar_t* __restrict__ A, scalar_t* C, + const int _WvPrGrp, const int CuCount) { + UNREACHABLE_CODE +} +#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support + +int mindiv(int N, int div1, int div2) { + int nPrRnd = div1 * div2; + int rnds0 = N / nPrRnd; + nPrRnd -= div1 * 3; + int rnds3 = N / nPrRnd; + nPrRnd -= div1; + int rnds4 = N / nPrRnd; + nPrRnd -= div1; + int rnds5 = N / nPrRnd; + nPrRnd -= div1; + int rnds6 = N / nPrRnd; + nPrRnd -= div1; + int rnds7 = N / nPrRnd; + nPrRnd -= div1; + int rnds8 = N / nPrRnd; + nPrRnd -= div1; + int rnds9 = N / nPrRnd; + nPrRnd -= div1; + int rtn = div2; + if (rnds0 == rnds3) rtn = div2 - 3; + if (rnds0 == rnds4) rtn = div2 - 4; + if (rnds0 == rnds5) rtn = div2 - 5; + if (rnds0 == rnds6) rtn = div2 - 6; + if (rnds0 == rnds7) rtn = div2 - 7; + if (rnds0 == rnds8) rtn = div2 - 8; + if (rnds0 == rnds9) rtn = div2 - 9; + return rtn; +} + +torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b, + const int64_t CuCount) { + auto M_in = in_a.size(0); + auto K_in = in_a.size(1); + auto N_in = in_b.size(0); + + TORCH_CHECK(in_a.dtype() == in_b.dtype()); + TORCH_CHECK(K_in % 8 == 0, "k % 8 == 0"); + TORCH_CHECK(in_a.dtype() == torch::kFloat16 || + in_a.dtype() == 
torch::kBFloat16); + + auto out_c = torch::empty( + {N_in, M_in}, + torch::TensorOptions().dtype(in_b.dtype()).device(in_b.device())); + + dim3 grid(CuCount); + + const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + +#define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \ + _N) \ + { \ + dim3 block(64, _WvPrGrp); \ + if ((K_in * N_in <= 32 * 1024) && (M_in % _YTILEs == 0)) { \ + int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \ + wvSplitK_hf_sml_ \ + <<>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \ + CuCount); \ + } else if (K_in * N_in <= 32 * 1024 * 1.2) { \ + int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \ + wvSplitK_hf_ \ + <<>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \ + CuCount); \ + } else { \ + int __wvPrGrp = mindiv(M_in, CuCount * _YTILEb, _WvPrGrp); \ + wvSplitK_hf_big_ \ + <<>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \ + CuCount); \ + } \ + } + + AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitK", [&] { + using fptype = typename scalar::type; + fptype* af4 = reinterpret_cast(in_a.data_ptr()); + const fptype* bf4 = reinterpret_cast(in_b.data_ptr()); + fptype* c = reinterpret_cast(out_c.data_ptr()); + switch (N_in) { + case 1: + WVSPLITK(16, 2, 2, 2, 2, 2, 2, 1) + break; + case 2: + WVSPLITK(16, 2, 2, 2, 2, 2, 2, 2) + break; + case 3: + WVSPLITK(16, 4, 7, 7, 1, 1, 1, 3) + break; + case 4: + WVSPLITK(16, 4, 7, 7, 1, 1, 1, 4) + break; + default: + throw std::runtime_error( + "Unsupported N value: " + std::to_string(M_in) + "," + + std::to_string(K_in) + "," + std::to_string(N_in)); + } + }); + return out_c; +} + +#if defined(__HIP__MI300__) // TODO: Add NAVI support +template +__global__ void __launch_bounds__(WvPrGrp* THRDS) + wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, const fp8_t* B, + const fp8_t* __restrict__ A, scalar_t* C, + const float* __restrict__ s_A, + const float* __restrict__ s_B, const int _WvPrGrp, + 
const int CuCount) { + using scalar8 = + __attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float; + using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int; + using intx4 = __attribute__((__vector_size__(4 * sizeof(int)))) int; + union bigType { + char f8[A_CHUNK]; + char2 c2[A_CHUNK / 2]; + scalar_t h[A_CHUNK / 2]; + float f[A_CHUNK / 4]; + int i[A_CHUNK / 4]; + long l[A_CHUNK / 8]; + intx4 l2[A_CHUNK / 16]; + scalar8 h8; + }; + + __shared__ fp8_t s[1024 * 64]; + + for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK; + k < min(K * N, 64 * 1024); k += THRDS * WvPrGrp * A_CHUNK) { + *((bigType*)(&s[k])) = *((bigType*)(&A[k])); + } + __syncthreads(); + + if (threadIdx.y >= _WvPrGrp) return; + + uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE; + + using floatx16 = __attribute__((__vector_size__(16 * sizeof(float)))) float; + floatx16 sum[N][YTILE]; + float sA = *s_A; + float sB = *s_B; + + while (m < M) { + for (int i = 0; i < YTILE; i++) + for (int n = 0; n < N; n++) sum[n][i] = {0.f}; + + bigType bigA[N][UNRL]; + bigType bigB[YTILE][UNRL]; + + for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) { + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + #pragma unroll + for (uint32_t n = 0; n < N; ++n) bigA[n][k2].h8 = {0.f}; + #pragma unroll + for (uint32_t y = 0; y < YTILE; ++y) bigB[y][k2].h8 = {0.f}; + } + + // Fetch the weight matrix from memory! 
+ #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + const fp8_t* B_ = &B[(m + 0) * Kp + k_]; + #pragma unroll + for (uint32_t y = 0; y < YTILE; ++y) { + bigB[y][k2].h8 = (loadnt((scalar8*)(&B_[y * Kp]))); + } + } + + // Fetch activation matrix from either just LDS or from both LDS / memory + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + for (int n = 0; n < N; n++) { + bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n]))); + } + } + + // Do the matrix multiplication in interleaved manner + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + if (k >= K) break; + + for (uint32_t n = 0; n < N; n++) { + for (int i = 0; i < A_CHUNK; i += 8) { + for (int y = 0; y < YTILE; ++y) { + sum[n][y] = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8( + bigA[n][k2].l[i / 8], bigB[y][k2].l[i / 8], sum[n][y], 0, 0, + 0); + } + } + } + } + } + + // Final reduction + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + float accm0 = sum[n][y][0]; + float accm16 = sum[n][y][8]; + asm("v_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][1]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][9]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][2]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][10]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][3]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][11]), "v"(accm16)); + asm("v_add_f32 
%0, %2, %3 row_shl:8 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][4]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:8 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][12]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:9 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][5]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:9 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][13]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:10 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][6]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:10 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][14]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:11 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][7]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:11 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][15]), "v"(accm16)); + accm0 += __shfl(accm0, 36); + accm16 += __shfl(accm16, 52); + sum[n][y][0] = accm0 + __shfl(accm16, 16); + } + } + + if (threadIdx.x == 0) { + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + C[m + y + n * M] = __float2s(sum[n][y][0] * sA * sB); + } + } + } + + m += CuCount * _WvPrGrp * YTILE; + } +} +#else // !defined(__HIP__MI300__) TODO: Add NAVI support +template +__global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, + const fp8_t* B, const fp8_t* __restrict__ A, + scalar_t* C, const float* __restrict__ s_A, + const float* __restrict__ s_B, + const int _WvPrGrp, const int CuCount) { + UNREACHABLE_CODE +} +#endif // defined(__HIP__MI300__) TODO: Add NAVI support + +#if defined(__HIP__MI300__) // TODO: Add NAVI support +template +__global__ void __launch_bounds__(WvPrGrp* THRDS) + wvSplitKQ_hf_(const int K, const int Kp, const int M, const fp8_t* B, + const fp8_t* __restrict__ A, scalar_t* C, + const float* __restrict__ s_A, const float* __restrict__ s_B, + const int _WvPrGrp, const int CuCount) { + 
using scalar8 = + __attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float; + using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int; + using intx4 = __attribute__((__vector_size__(4 * sizeof(int)))) int; + union bigType { + char f8[A_CHUNK]; + char2 c2[A_CHUNK / 2]; + scalar_t h[A_CHUNK / 2]; + float f[A_CHUNK / 4]; + int i[A_CHUNK / 4]; + long l[A_CHUNK / 8]; + intx4 l2[A_CHUNK / 16]; + scalar8 h8; + }; + + __shared__ fp8_t s[1024 * 64]; + + for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK; + k < min(K * N, 64 * 1024); k += THRDS * WvPrGrp * A_CHUNK) { + *((bigType*)(&s[k])) = *((bigType*)(&A[k])); + } + __syncthreads(); + + if (threadIdx.y >= _WvPrGrp) return; + + uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE; + + using floatx16 = __attribute__((__vector_size__(16 * sizeof(float)))) float; + floatx16 sum[N][YTILE]; + float sA = *s_A; + float sB = *s_B; + + while (m < M) { + for (int i = 0; i < YTILE; i++) + for (int n = 0; n < N; n++) sum[n][i] = {0}; + + bigType bigA[N][UNRL]; + bigType bigB[YTILE][UNRL]; + + for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) { + // Fetch the weight matrix from memory! + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + const fp8_t* B_ = &B[(m + 0) * Kp + k_]; + for (int y = 0; y < YTILE; ++y) { + if (y + m >= M) break; // To avoid mem access fault. 
+ bigB[y][k2].h8 = (loadnt((scalar8*)(&B_[y * Kp]))); + } + } + + // Fetch activation matrix from either just LDS or from both LDS / memory + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + for (int n = 0; n < N; n++) { + if (k_ + K * n < 64 * 1024) + bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n]))); + else + bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n]))); + } + } + + // Do the matrix multiplication in interleaved manner + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + for (uint32_t n = 0; n < N; n++) { + for (int i = 0; i < A_CHUNK; i += 8) { + for (int y = 0; y < YTILE; ++y) { + sum[n][y] = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8( + bigA[n][k2].l[i / 8], bigB[y][k2].l[i / 8], sum[n][y], 0, 0, + 0); + } + } + } + } + } + + // Final reduction + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + float accm0 = sum[n][y][0]; + float accm16 = sum[n][y][8]; + asm("v_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][1]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][9]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][2]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][10]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][3]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][11]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:8 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][4]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 
row_shl:8 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][12]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:9 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][5]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:9 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][13]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:10 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][6]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:10 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][14]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:11 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][7]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:11 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][15]), "v"(accm16)); + accm0 += __shfl(accm0, 36); + accm16 += __shfl(accm16, 52); + sum[n][y][0] = accm0 + __shfl(accm16, 16); + } + } + + if (threadIdx.x == 0) { + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + if (y + m >= M) break; // To avoid mem access fault. + C[m + y + n * M] = __float2s(sum[n][y][0] * sA * sB); + } + } + } + + m += CuCount * _WvPrGrp * YTILE; + } +} +#else // !defined(__HIP__MI300__) TODO: Add NAVI support +template +__global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M, + const fp8_t* B, const fp8_t* __restrict__ A, + scalar_t* C, const float* __restrict__ s_A, + const float* __restrict__ s_B, const int _WvPrGrp, + const int CuCount) { + UNREACHABLE_CODE +} +#endif // defined(__HIP__MI300__) TODO: Add NAVI support + +void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c, + at::Tensor& scale_a, at::Tensor& scale_b, + const int64_t CuCount) { + static c10::ScalarType kFp8Type = is_fp8_ocp() + ? 
c10::ScalarType::Float8_e4m3fn + : c10::ScalarType::Float8_e4m3fnuz; + auto M_in = in_a.size(0); + auto K_in = in_a.size(1); + auto N_in = in_b.size(0); + auto Kp_in = in_a.stride(0); + TORCH_CHECK(K_in % 16 == 0, "k % 16 == 0"); + TORCH_CHECK(in_a.dtype() == in_b.dtype() && in_a.dtype() == kFp8Type); + TORCH_CHECK(out_c.dtype() == torch::kFloat16 || + out_c.dtype() == torch::kBFloat16); + + dim3 grid(CuCount); + const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + +#define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \ + _N) \ + { \ + dim3 block(64, _WvPrGrp); \ + if ((K_in * N_in <= 64 * 1024) && (M_in % _YTILEs == 0)) { \ + int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \ + wvSplitKQ_hf_sml_ \ + <<>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \ + s_a, s_b, __wvPrGrp, CuCount); \ + } else { \ + int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \ + wvSplitKQ_hf_ \ + <<>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \ + s_a, s_b, __wvPrGrp, CuCount); \ + } \ + } + + AT_DISPATCH_REDUCED_FLOATING_TYPES(out_c.scalar_type(), "wvSplitKQ", [&] { + using fptype = typename scalar::type; + auto c_ptr = reinterpret_cast(out_c.data_ptr()); + auto s_a = scale_a.data_ptr(); + auto s_b = scale_b.data_ptr(); + VLLM_DISPATCH_FP8_TYPES(in_a.scalar_type(), "wvSplitKQ", [&] { + auto a_ptr = in_a.data_ptr(); + auto b_ptr = in_b.data_ptr(); + switch (N_in) { + case 1: + WVSPLITKQ(16, 2, 2, 2, 2, 2, 2, 1) + break; + case 2: + WVSPLITKQ(16, 2, 2, 2, 2, 2, 2, 2) + break; + case 3: + WVSPLITKQ(16, 4, 7, 7, 1, 1, 1, 3) + break; + case 4: + WVSPLITKQ(16, 4, 7, 7, 1, 1, 1, 4) + break; + default: + throw std::runtime_error( + "Unsupported N value: " + std::to_string(M_in) + "," + + std::to_string(K_in) + "," + std::to_string(N_in)); + } + }); + }); +} \ No newline at end of file diff --git a/csrc/rocm/torch_bindings.cpp b/csrc/rocm/torch_bindings.cpp index 
a5d2e2f97a3ed..4ac6fd1e99408 100644 --- a/csrc/rocm/torch_bindings.cpp +++ b/csrc/rocm/torch_bindings.cpp @@ -14,6 +14,24 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) { // vLLM custom ops for rocm + // Custom gemm op for matrix-vector multiplication + rocm_ops.def( + "LLMM1(Tensor in_a, Tensor in_b, int rows_per_block) -> " + "Tensor"); + rocm_ops.impl("LLMM1", torch::kCUDA, &LLMM1); + + // Custom gemm op for skinny matrix-matrix multiplication + rocm_ops.def( + "wvSplitK(Tensor in_a, Tensor in_b, int CuCount) -> " + "Tensor"); + rocm_ops.impl("wvSplitK", torch::kCUDA, &wvSplitK); + + // wvSplitK for fp8 + rocm_ops.def( + "wvSplitKQ(Tensor in_a, Tensor in_b, Tensor! out_c, Tensor scale_a, " + " Tensor scale_b, int CuCount) -> ()"); + rocm_ops.impl("wvSplitKQ", torch::kCUDA, &wvSplitKQ); + // Custom attention op // Compute the attention between an input query and the cached // keys/values using PagedAttention. @@ -23,7 +41,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) { " Tensor query, Tensor key_cache," " Tensor value_cache, int num_kv_heads," " float scale, Tensor block_tables," - " Tensor context_lens, int block_size," + " Tensor context_lens," + " Tensor? query_start_loc," + " int block_size," " int max_context_len," " Tensor? alibi_slopes," " str kv_cache_dtype," diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index eb3a2c911d55e..b6ff6a006c028 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -31,6 +31,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("weak_ref_tensor(Tensor input) -> Tensor"); ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor); + ops.def("get_cuda_view_from_cpu_tensor(Tensor cpu_tensor) -> Tensor"); + ops.impl("get_cuda_view_from_cpu_tensor", torch::kCPU, + &get_cuda_view_from_cpu_tensor); + // Attention ops // Compute the attention between an input query and the cached // keys/values using PagedAttention. 
@@ -60,6 +64,21 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " int blocksparse_head_sliding_step) -> ()"); ops.impl("paged_attention_v2", torch::kCUDA, &paged_attention_v2); +#ifndef USE_ROCM + // Merge attn states + // Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005 + // can be used to combine partial attention results (in the split-KV case) + ops.def( + "merge_attn_states(" + " Tensor! output," + " Tensor!? output_lse," + " Tensor prefix_output," + " Tensor prefix_lse," + " Tensor suffix_output," + " Tensor suffix_lse) -> ()"); + ops.impl("merge_attn_states", torch::kCUDA, &merge_attn_states); +#endif + // Activation ops // Activation function used in SwiGLU. ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()"); @@ -291,7 +310,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { #endif // Dequantization for GGML. - ops.def("ggml_dequantize(Tensor W, int type, SymInt m, SymInt n) -> Tensor"); + ops.def( + "ggml_dequantize(Tensor W, int type, SymInt m, SymInt n, ScalarType? " + "dtype) -> Tensor"); ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize); // mmvq kernel for GGML. @@ -365,6 +386,35 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("cutlass_scaled_mm_supports_fp8(int cuda_device_capability) -> bool"); ops.impl("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8); + // Check if cutlass grouped gemm is supported for CUDA devices of the given + // capability + ops.def("cutlass_group_gemm_supported(int cuda_device_capability) -> bool"); + ops.impl("cutlass_group_gemm_supported", &cutlass_group_gemm_supported); + + // CUTLASS w8a8 grouped GEMM + ops.def( + "cutlass_moe_mm(Tensor! 
out_tensors, Tensor a_tensors, Tensor b_tensors, " + " Tensor a_scales, Tensor b_scales, Tensor expert_offsets, " + " Tensor problem_sizes, Tensor a_strides, " + " Tensor b_strides, Tensor c_strides) -> ()", + {stride_tag}); + ops.impl("cutlass_moe_mm", torch::kCUDA, &cutlass_moe_mm); + + // A function that computes data required to run fused MoE with w8a8 grouped + // GEMM. It takes topk_ids as an input, and computes expert_offsets + // (token start indices of each expert). In addition to this, it computes + // problem sizes for each expert's multiplication used by the two mms called + // from fused MoE operation, and arrays with permutations required to shuffle + // and de-shuffle the input/output of the fused operation. + ops.def( + "get_cutlass_moe_mm_data(Tensor topk_ids, Tensor! expert_offsets, " + " Tensor! problem_sizes1, Tensor! problem_sizes2, " + " Tensor! input_permutation, " + " Tensor! output_permutation, int num_experts, " + " int n, int k) -> ()", + {stride_tag}); + ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data); + // Check if cutlass scaled_mm supports block quantization (used by DeepSeekV3) ops.def( "cutlass_scaled_mm_supports_block_fp8(int cuda_device_capability) -> " @@ -581,12 +631,11 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) { &get_max_shared_memory_per_block_device_attribute); } -#ifndef USE_ROCM TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) { // Custom all-reduce kernels custom_ar.def( "init_custom_ar(int[] ipc_tensors, Tensor rank_data, " - "int rank, bool full_nvlink) -> int"); + "int rank, bool fully_connected) -> int"); custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar); custom_ar.def( "all_reduce(int fa, Tensor inp, Tensor! 
out, int reg_buffer, " @@ -599,7 +648,13 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) { custom_ar.def("register_buffer", ®ister_buffer); custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta); custom_ar.def("register_graph_buffers", ®ister_graph_buffers); + + custom_ar.def("allocate_shared_buffer_and_handle", + &allocate_shared_buffer_and_handle); + custom_ar.def("open_mem_handle(Tensor mem_handle) -> int", &open_mem_handle); + custom_ar.impl("open_mem_handle", torch::kCPU, &open_mem_handle); + + custom_ar.def("free_shared_buffer", &free_shared_buffer); } -#endif REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/Dockerfile b/docker/Dockerfile similarity index 99% rename from Dockerfile rename to docker/Dockerfile index d1ecef586d50b..e8e18df1bb496 100644 --- a/Dockerfile +++ b/docker/Dockerfile @@ -240,6 +240,8 @@ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \ fi COPY examples examples +COPY benchmarks benchmarks +COPY ./vllm/collect_env.py . # Although we build Flashinfer with AOT mode, there's still # some issues w.r.t. JIT compilation. Therefore we need to diff --git a/Dockerfile.arm b/docker/Dockerfile.arm similarity index 100% rename from Dockerfile.arm rename to docker/Dockerfile.arm diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu new file mode 100644 index 0000000000000..c647d9036f400 --- /dev/null +++ b/docker/Dockerfile.cpu @@ -0,0 +1,142 @@ +# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform. 
+# +# Build targets: +# vllm-openai (default): used for serving deployment +# vllm-test: used for CI tests +# vllm-dev: used for development +# +# Build arguments: +# PYTHON_VERSION=3.12 (default)|3.11|3.10|3.9 +# VLLM_CPU_DISABLE_AVX512=false (default)|true +# + +######################### BASE IMAGE ######################### +FROM ubuntu:22.04 AS base + +WORKDIR /workspace/ + +ARG PYTHON_VERSION=3.12 +ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" + +ENV LD_PRELOAD="" + +# Install minimal dependencies and uv +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ + --mount=type=cache,target=/var/lib/apt,sharing=locked \ + apt-get update -y \ + && apt-get install -y --no-install-recommends ccache git curl wget ca-certificates \ + gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 \ + && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \ + && curl -LsSf https://astral.sh/uv/install.sh | sh + +ENV CCACHE_DIR=/root/.cache/ccache +ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache + +ENV PATH="/root/.local/bin:$PATH" +ENV VIRTUAL_ENV="/opt/venv" +ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python +RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV} +ENV PATH="$VIRTUAL_ENV/bin:$PATH" + +ENV UV_HTTP_TIMEOUT=500 + +# Install Python dependencies +ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} +ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} +ENV UV_INDEX_STRATEGY="unsafe-best-match" +ENV UV_LINK_MODE="copy" +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \ + --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \ + uv pip install --upgrade pip && \ + uv pip install -r requirements/cpu.txt + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install intel-openmp==2024.2.1 intel_extension_for_pytorch==2.6.0 + +ENV 
LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD" + +RUN echo 'ulimit -c 0' >> ~/.bashrc + +######################### BUILD IMAGE ######################### +FROM base AS vllm-build + +ARG GIT_REPO_CHECK=0 +# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ... +ARG VLLM_CPU_DISABLE_AVX512 +ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} + +WORKDIR /workspace/vllm + +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \ + uv pip install -r requirements/build.txt + +COPY . . +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi + +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=bind,source=.git,target=.git \ + VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel + +######################### DEV IMAGE ######################### +FROM vllm-build AS vllm-dev + +WORKDIR /workspace/vllm + +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ + --mount=type=cache,target=/var/lib/apt,sharing=locked \ + apt-get install -y --no-install-recommends vim numactl + +# install development dependencies (for testing) +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install -e tests/vllm_test_utils + +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=bind,source=.git,target=.git \ + VLLM_TARGET_DEVICE=cpu python3 setup.py develop + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install -r requirements/dev.txt && \ + pre-commit install --hook-type pre-commit --hook-type commit-msg + +ENTRYPOINT ["bash"] + +######################### TEST IMAGE ######################### +FROM base AS vllm-test + +WORKDIR /workspace/ + +RUN --mount=type=cache,target=/root/.cache/uv \ + 
--mount=type=bind,src=requirements/test.txt,target=requirements/test.txt \ + uv pip install -r requirements/test.txt + +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \ + uv pip install dist/*.whl + +ADD ./tests/ ./tests/ +ADD ./examples/ ./examples/ +ADD ./benchmarks/ ./benchmarks/ +ADD ./vllm/collect_env.py . + +# install development dependencies (for testing) +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install -e tests/vllm_test_utils + +ENTRYPOINT ["bash"] + +######################### RELEASE IMAGE ######################### +FROM base AS vllm-openai + +WORKDIR /workspace/ + +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \ + uv pip install dist/*.whl + +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/Dockerfile.hpu b/docker/Dockerfile.hpu similarity index 84% rename from Dockerfile.hpu rename to docker/Dockerfile.hpu index 48211c88f872b..224f142b5ff44 100644 --- a/Dockerfile.hpu +++ b/docker/Dockerfile.hpu @@ -1,4 +1,4 @@ -FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest +FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest COPY ./ /workspace/vllm diff --git a/Dockerfile.neuron b/docker/Dockerfile.neuron similarity index 90% rename from Dockerfile.neuron rename to docker/Dockerfile.neuron index 067645906366e..2b63fe301bac6 100644 --- a/Dockerfile.neuron +++ b/docker/Dockerfile.neuron @@ -1,6 +1,6 @@ # default base image # https://gallery.ecr.aws/neuron/pytorch-inference-neuronx -ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04" +ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04" FROM $BASE_IMAGE @@ -21,9 +21,9 @@ VOLUME [ 
${APP_MOUNT} ] WORKDIR ${APP_MOUNT}/vllm RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas -RUN python3 -m pip install sentencepiece transformers==4.45.2 -U -RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U +RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity +RUN python3 -m pip install sentencepiece transformers==4.48.0 -U +RUN python3 -m pip install neuronx-cc==2.17.194.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install pytest # uninstall transformers-neuronx package explicitly to avoid version conflict diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch new file mode 100644 index 0000000000000..0063712e47818 --- /dev/null +++ b/docker/Dockerfile.nightly_torch @@ -0,0 +1,307 @@ +# The vLLM Dockerfile is used to construct vLLM image against torch nightly that can be directly used for testing + +# for torch nightly, cuda >=12.6 is required, +# use 12.8 due to FlashAttention issue with cuda 12.6 (https://github.com/vllm-project/vllm/issues/15435#issuecomment-2775924628) +ARG CUDA_VERSION=12.8.0 +# +#################### BASE BUILD IMAGE #################### +# prepare basic build environment +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base +ARG CUDA_VERSION=12.8.0 +ARG PYTHON_VERSION=3.12 +ARG TARGETPLATFORM +ENV DEBIAN_FRONTEND=noninteractive +# Install Python and other dependencies +RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ + && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ + && apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl sudo \ + && add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + && update-alternatives 
--install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ + && python3 --version \ + && python3 -m pip --version +# Install uv for faster pip installs +RUN --mount=type=cache,target=/root/.cache/uv \ + python3 -m pip install uv + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + +# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 +# as it was causing spam when compiling the CUTLASS kernels +RUN apt-get install -y gcc-10 g++-10 +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 +RUN < torch_build_versions.txt +RUN cat torch_build_versions.txt + +# cuda arch list used by torch +# can be useful for `test` +# explicitly set the list to avoid issues with torch 2.2 +# see https://github.com/pytorch/pytorch/pull/123243 + +# Override the arch list for flash-attn to reduce the binary size +ARG vllm_fa_cmake_gpu_arches='80-real;90-real' +ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches} +#################### BASE BUILD IMAGE #################### + +#################### WHEEL BUILD IMAGE #################### +FROM base AS build +ARG TARGETPLATFORM + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + +COPY . . 
+ +RUN python3 use_existing_torch.py + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/build.txt + +ARG GIT_REPO_CHECK=0 +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi + +# Max jobs used by Ninja to build extensions +ARG max_jobs=16 +ENV MAX_JOBS=${max_jobs} +ARG nvcc_threads=2 +ENV NVCC_THREADS=$nvcc_threads + +ARG USE_SCCACHE +ARG SCCACHE_BUCKET_NAME=vllm-build-sccache +ARG SCCACHE_REGION_NAME=us-west-2 +ARG SCCACHE_S3_NO_CREDENTIALS=0 + +# if USE_SCCACHE is set, use sccache to speed up compilation +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,source=.git,target=.git \ + if [ "$USE_SCCACHE" = "1" ]; then \ + echo "Installing sccache..." \ + && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \ + && tar -xzf sccache.tar.gz \ + && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \ + && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \ + && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \ + && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \ + && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \ + && export SCCACHE_IDLE_TIMEOUT=0 \ + && export CMAKE_BUILD_TYPE=Release \ + && sccache --show-stats \ + && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \ + && sccache --show-stats; \ + fi + +ENV CCACHE_DIR=/root/.cache/ccache +RUN --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,source=.git,target=.git \ + if [ "$USE_SCCACHE" != "1" ]; then \ + # Clean any existing CMake artifacts + rm -rf .deps && \ + mkdir -p .deps && \ + python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ + fi + +#################### WHEEL BUILD IMAGE #################### + +################### VLLM INSTALLED IMAGE 
#################### +# Setup clean environment for vLLM and its dependencies for test and api server using ubuntu22.04 with AOT flashinfer +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base +# prepare for environment starts +ARG CUDA_VERSION=12.8.0 +ARG PYTHON_VERSION=3.12 +WORKDIR /vllm-workspace +ENV DEBIAN_FRONTEND=noninteractive +ARG TARGETPLATFORM + +RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ + echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment + +# Install Python and other dependencies +RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ + && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ + && apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \ + && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ + && add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ + && python3 --version && python3 -m pip --version + +RUN --mount=type=cache,target=/root/.cache/uv \ + python3 -m pip install uv + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + +# Workaround for https://github.com/openai/triton/issues/2507 and +# https://github.com/pytorch/pytorch/issues/107960 -- hopefully +# this won't be needed for future versions of this docker image +# or future versions of triton. 
+RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ + +# get the nightly torch version used in the build to make sure the version is the same +COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu128 + +# install the vllm wheel +RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm-dist \ + --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system vllm-dist/*.whl --verbose + +# install xformers again for the new environment +RUN --mount=type=bind,from=base,src=/workspace/xformers-dist,target=/vllm-workspace/xformers-dist \ + --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system /vllm-workspace/xformers-dist/*.whl --verbose + +ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0' + +# install package for build flashinfer +# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738 +RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.post1 + + +# build flashinfer for torch nightly from source around 10 mins +# release version: v0.2.2.post1 +# todo(elainewy): cache flashinfer build result for faster build +ENV CCACHE_DIR=/root/.cache/ccache +RUN --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=cache,target=/root/.cache/uv \ + echo "git clone flashinfer..." \ + && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \ + && cd flashinfer \ + && git checkout v0.2.2.post1 \ + && git submodule update --init --recursive \ + && echo "finish git clone flashinfer..." \ + && rm -rf build \ + && export TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} \ + && FLASHINFER_ENABLE_AOT=1 python3 setup.py bdist_wheel --dist-dir=../flashinfer-dist --verbose \ + && cd .. 
\ + && rm -rf flashinfer + +# install flashinfer +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system flashinfer-dist/*.whl --verbose + +# install common packages +COPY requirements/common.txt requirements/common.txt +COPY use_existing_torch.py use_existing_torch.py +COPY pyproject.toml pyproject.toml + +COPY examples examples +COPY benchmarks benchmarks +COPY ./vllm/collect_env.py . + +RUN python3 use_existing_torch.py +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/common.txt + +################### VLLM INSTALLED IMAGE #################### + + +#################### UNITTEST IMAGE ############################# +FROM vllm-base as test +COPY tests/ tests/ + +# install build and runtime dependencies without stable torch version +COPY requirements/nightly_torch_test.txt requirements/nightly_torch_test.txt + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + +# install development dependencies (for testing) +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -e tests/vllm_test_utils + +# enable fast downloads from hf (for testing) +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system hf_transfer +ENV HF_HUB_ENABLE_HF_TRANSFER 1 + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/nightly_torch_test.txt + +#################### UNITTEST IMAGE ############################# + diff --git a/Dockerfile.ppc64le b/docker/Dockerfile.ppc64le similarity index 93% rename from Dockerfile.ppc64le rename to docker/Dockerfile.ppc64le index 913c289adc01e..ec979227871c6 100644 --- a/Dockerfile.ppc64le +++ b/docker/Dockerfile.ppc64le @@ -38,7 +38,7 @@ RUN microdnf install -y openssl-devel dnf \ && ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \ && python${PYTHON_VERSION} -m venv 
${VIRTUAL_ENV} \ && python -m pip install -U pip uv \ - && uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python cmake ninja cython scikit_build_core scikit_build \ + && uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python 'cmake<4' ninja cython scikit_build_core scikit_build \ && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \ && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \ && cd /tmp && touch control @@ -126,13 +126,16 @@ RUN --mount=type=cache,target=/root/.cache/uv \ FROM base-builder AS cv-builder ARG MAX_JOBS -ARG OPENCV_VERSION=84 +ARG OPENCV_VERSION=86 +# patch for version 4.11.0.86 +ARG OPENCV_PATCH=97f3f39 ARG ENABLE_HEADLESS=1 RUN --mount=type=cache,target=/root/.cache/uv \ source /opt/rh/gcc-toolset-13/enable && \ git clone --recursive https://github.com/opencv/opencv-python.git -b ${OPENCV_VERSION} && \ cd opencv-python && \ - sed -i 's/"setuptools==59.2.0",/"setuptools<70.0",/g' pyproject.toml && \ + sed -i -E -e 's/"setuptools.+",/"setuptools",/g' pyproject.toml && \ + cd opencv && git cherry-pick --no-commit $OPENCV_PATCH && cd .. 
&& \ python -m build --wheel --installer=uv --outdir /opencvwheels/ ############################################################### @@ -148,9 +151,15 @@ COPY --from=arrow-builder /tmp/control /dev/null COPY --from=cv-builder /tmp/control /dev/null ARG VLLM_TARGET_DEVICE=cpu +ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1 # this step installs vllm and populates uv cache # with all the transitive dependencies +RUN --mount=type=cache,target=/root/.cache/uv \ + source /opt/rh/gcc-toolset-13/enable && \ + git clone https://github.com/huggingface/xet-core.git && cd xet-core/hf_xet/ && \ + uv pip install maturin && \ + uv build --wheel --out-dir /hf_wheels/ RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \ --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \ @@ -159,7 +168,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ source /opt/rh/gcc-toolset-13/enable && \ uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \ sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \ - uv pip install pandas pythran pybind11 && \ + uv pip install pandas pythran pybind11 /hf_wheels/*.whl && \ # sentencepiece.pc is in some pkgconfig inside uv cache export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \ uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \ @@ -238,7 +247,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && python -m pip install -U pip uv --no-cache \ && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \ && make -C /numactl install \ - && uv pip install cmake \ + && uv pip install 'cmake<4' \ && cmake --install /lapack/build \ && uv pip uninstall cmake @@ -247,8 +256,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \ 
--mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \ --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \ --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \ + --mount=type=bind,from=vllmcache-builder,source=/hf_wheels/,target=/hf_wheels/,ro \ --mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \ - HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /vllmwheel/*.whl + HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl COPY ./ /workspace/vllm WORKDIR /workspace/vllm diff --git a/Dockerfile.rocm b/docker/Dockerfile.rocm similarity index 98% rename from Dockerfile.rocm rename to docker/Dockerfile.rocm index 841e7978a424f..f9ebb10ca8731 100644 --- a/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -12,7 +12,8 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}} # Install some basic utilities RUN apt-get update -q -y && apt-get install -q -y \ - sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev + sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \ + apt-transport-https ca-certificates wget curl # Remove sccache RUN python3 -m pip install --upgrade pip && pip install setuptools_scm RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" diff --git a/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base similarity index 84% rename from Dockerfile.rocm_base rename to docker/Dockerfile.rocm_base index 38d6a33636eba..1776b26d445ce 100644 --- a/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -1,18 +1,18 @@ ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete -ARG HIPBLASLT_BRANCH="4d40e36" +ARG HIPBLASLT_BRANCH="db8e93b4" ARG HIPBLAS_COMMON_BRANCH="7c1566b" ARG LEGACY_HIPBLASLT_OPTION= ARG RCCL_BRANCH="648a58d" ARG RCCL_REPO="https://github.com/ROCm/rccl" ARG 
TRITON_BRANCH="e5be006" ARG TRITON_REPO="https://github.com/triton-lang/triton.git" -ARG PYTORCH_BRANCH="3a585126" -ARG PYTORCH_VISION_BRANCH="v0.19.1" +ARG PYTORCH_BRANCH="295f2ed4" +ARG PYTORCH_VISION_BRANCH="v0.21.0" ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" -ARG FA_BRANCH="b7d29fb" -ARG FA_REPO="https://github.com/ROCm/flash-attention.git" -ARG AITER_BRANCH="21d47a9" +ARG FA_BRANCH="1a7f4dfa" +ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" +ARG AITER_BRANCH="7e1ed08" ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base @@ -20,7 +20,7 @@ FROM ${BASE_IMAGE} AS base ENV PATH=/opt/rocm/llvm/bin:$PATH ENV ROCM_PATH=/opt/rocm ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib: -ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942 +ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201 ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} ARG PYTHON_VERSION=3.12 @@ -31,7 +31,7 @@ ENV DEBIAN_FRONTEND=noninteractive # Install Python and other dependencies RUN apt-get update -y \ - && apt-get install -y software-properties-common git curl sudo vim less \ + && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \ && add-apt-repository ppa:deadsnakes/ppa \ && apt-get update -y \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ @@ -42,7 +42,7 @@ RUN apt-get update -y \ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ && python3 --version && python3 -m pip --version -RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython +RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython FROM base AS build_hipblaslt ARG HIPBLASLT_BRANCH @@ -60,7 +60,8 @@ RUN cd hipBLAS-common \ RUN git clone https://github.com/ROCm/hipBLASLt RUN cd hipBLASLt \ && git checkout ${HIPBLASLT_BRANCH} \ - && ./install.sh -d --architecture 
${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \ + && apt-get install -y llvm-dev \ + && ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \ && cd build/release \ && make package RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install @@ -110,11 +111,24 @@ RUN git clone ${FA_REPO} RUN cd flash-attention \ && git checkout ${FA_BRANCH} \ && git submodule update --init \ - && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist + && GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \ && cp /app/vision/dist/*.whl /app/install \ && cp /app/flash-attention/dist/*.whl /app/install +FROM base AS build_aiter +ARG AITER_BRANCH +ARG AITER_REPO +RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ + pip install /install/*.whl +RUN git clone --recursive ${AITER_REPO} +RUN cd aiter \ + && git checkout ${AITER_BRANCH} \ + && git submodule update --init --recursive \ + && pip install -r requirements.txt +RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl +RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install + FROM base AS final RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ dpkg -i /install/*deb \ @@ -130,19 +144,12 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ pip install /install/*.whl RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ pip install /install/*.whl - -ARG AITER_REPO -ARG AITER_BRANCH -RUN git clone --recursive ${AITER_REPO} -RUN cd aiter \ - && git checkout ${AITER_BRANCH} \ - && git submodule update --init --recursive \ - && pip install -r requirements.txt \ - && PREBUILD_KERNELS=1 
GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter +RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \ + pip install /install/*.whl ARG BASE_IMAGE -ARG HIPBLASLT_BRANCH ARG HIPBLAS_COMMON_BRANCH +ARG HIPBLASLT_BRANCH ARG LEGACY_HIPBLASLT_OPTION ARG RCCL_BRANCH ARG RCCL_REPO @@ -154,6 +161,8 @@ ARG PYTORCH_REPO ARG PYTORCH_VISION_REPO ARG FA_BRANCH ARG FA_REPO +ARG AITER_BRANCH +ARG AITER_REPO RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \ && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \ @@ -167,6 +176,5 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \ && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ - && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \ && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt diff --git a/Dockerfile.s390x b/docker/Dockerfile.s390x similarity index 85% rename from Dockerfile.s390x rename to docker/Dockerfile.s390x index 5a84dc12d8f71..128929ac33311 100644 --- a/Dockerfile.s390x +++ b/docker/Dockerfile.s390x @@ -58,7 +58,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ cd ../../python && \ export PYARROW_PARALLEL=4 && \ export ARROW_BUILD_TYPE=release && \ - uv pip install -r requirements/build.txt && \ + uv pip install -r requirements-build.txt && \ python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --bundle-arrow-cpp bdist_wheel FROM python-install AS numa-build @@ -96,6 +96,22 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \ python setup.py bdist_wheel +FROM python-install AS hf-xet-builder +# Install hf-xet +WORKDIR /tmp +ENV 
CARGO_HOME=/root/.cargo +ENV RUSTUP_HOME=/root/.rustup +ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \ + --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \ + git clone https://github.com/huggingface/xet-core.git && \ + cd xet-core/hf_xet/ && \ + uv pip install maturin patchelf && \ + python -m maturin build --release --out dist && \ + mkdir -p /tmp/hf-xet/dist && \ + cp dist/*.whl /tmp/hf-xet/dist/ + # Final build stage FROM python-install AS vllm-cpu ARG PYTHON_VERSION @@ -120,12 +136,15 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \ --mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \ --mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \ + --mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \ sed -i '/^torch/d' requirements/build.txt && \ ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \ VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \ + HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl | head -n 1) && \ uv pip install -v \ $ARROW_WHL_FILE \ $VISION_WHL_FILE \ + $HF_XET_WHL_FILE \ --extra-index-url https://download.pytorch.org/whl/nightly/cpu \ --index-strategy unsafe-best-match \ -r requirements/build.txt \ @@ -149,4 +168,5 @@ USER 2000 WORKDIR /home/vllm # Set the default entrypoint -ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"] \ No newline at end of file +ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"] + diff --git a/Dockerfile.tpu b/docker/Dockerfile.tpu similarity index 100% rename from Dockerfile.tpu rename to docker/Dockerfile.tpu diff --git a/Dockerfile.xpu b/docker/Dockerfile.xpu similarity index 100% rename from Dockerfile.xpu rename to 
docker/Dockerfile.xpu diff --git a/docs/README.md b/docs/README.md index 74e05ce02636b..dcd5e759dfa88 100644 --- a/docs/README.md +++ b/docs/README.md @@ -2,19 +2,42 @@ ## Build the docs -```bash -# Install dependencies. -pip install -r ../requirements/docs.txt +- Make sure in `docs` directory -# Build the docs. +```bash +cd docs +``` + +- Install the dependencies: + +```bash +pip install -r ../requirements/docs.txt +``` + +- Clean the previous build (optional but recommended): + +```bash make clean +``` + +- Generate the HTML documentation: + +```bash make html ``` ## Open the docs with your browser +- Serve the documentation locally: + ```bash python -m http.server -d build/html/ ``` -Launch your browser and open localhost:8000. +This will start a local server at http://localhost:8000. You can now open your browser and view the documentation. + +If port 8000 is already in use, you can specify a different port, for example: + +```bash +python -m http.server 3000 -d build/html/ +``` diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index be0b2a388e404..58bc2ebb9614b 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -10,8 +10,8 @@ document.addEventListener("DOMContentLoaded", function () { script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. 
script.setAttribute("runllm-name", "vLLM"); script.setAttribute("runllm-position", "BOTTOM_RIGHT"); - script.setAttribute("runllm-position-y", "20%"); - script.setAttribute("runllm-position-x", "3%"); + script.setAttribute("runllm-position-y", "120px"); + script.setAttribute("runllm-position-x", "20px"); script.setAttribute("runllm-assistant-id", "207"); script.async = true; diff --git a/docs/source/assets/deployment/open_webui.png b/docs/source/assets/deployment/open_webui.png new file mode 100644 index 0000000000000..fe9a7e15ea71d Binary files /dev/null and b/docs/source/assets/deployment/open_webui.png differ diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-1.png b/docs/source/assets/design/v1/prefix_caching/example-time-1.png index 8849ca0237c39..d5a165ff6944b 100644 Binary files a/docs/source/assets/design/v1/prefix_caching/example-time-1.png and b/docs/source/assets/design/v1/prefix_caching/example-time-1.png differ diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-3.png b/docs/source/assets/design/v1/prefix_caching/example-time-3.png index 71b9e9b60ab9a..d753a406bdb9a 100644 Binary files a/docs/source/assets/design/v1/prefix_caching/example-time-3.png and b/docs/source/assets/design/v1/prefix_caching/example-time-3.png differ diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-4.png b/docs/source/assets/design/v1/prefix_caching/example-time-4.png index 017df1657c22e..d463248a3b1e9 100644 Binary files a/docs/source/assets/design/v1/prefix_caching/example-time-4.png and b/docs/source/assets/design/v1/prefix_caching/example-time-4.png differ diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-5.png b/docs/source/assets/design/v1/prefix_caching/example-time-5.png index b80dd5b9949dc..231ebc6199faf 100644 Binary files a/docs/source/assets/design/v1/prefix_caching/example-time-5.png and b/docs/source/assets/design/v1/prefix_caching/example-time-5.png differ diff --git 
a/docs/source/assets/design/v1/prefix_caching/example-time-6.png b/docs/source/assets/design/v1/prefix_caching/example-time-6.png index fbd7138596e89..6ded9170e8e83 100644 Binary files a/docs/source/assets/design/v1/prefix_caching/example-time-6.png and b/docs/source/assets/design/v1/prefix_caching/example-time-6.png differ diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-7.png b/docs/source/assets/design/v1/prefix_caching/example-time-7.png index fc33ef50d4fdb..0b536de5a53f2 100644 Binary files a/docs/source/assets/design/v1/prefix_caching/example-time-7.png and b/docs/source/assets/design/v1/prefix_caching/example-time-7.png differ diff --git a/docs/source/community/meetups.md b/docs/source/community/meetups.md index efb4f692972b5..085918bed2b09 100644 --- a/docs/source/community/meetups.md +++ b/docs/source/community/meetups.md @@ -4,6 +4,9 @@ We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: +- [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). +- [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama), March 27th 2025. [[Slides]](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing). +- [The first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg), March 16th 2025. [[Slides]](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing). - [The East Coast vLLM Meetup](https://lu.ma/7mu4k4xx), March 11th 2025. 
[[Slides]](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0) - [The ninth vLLM meetup](https://lu.ma/h7g3kuj9), with Meta, February 27th 2025. [[Slides]](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) - [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing) diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md index fb93e65673dff..b8a1ddbe38794 100644 --- a/docs/source/community/sponsors.md +++ b/docs/source/community/sponsors.md @@ -22,6 +22,7 @@ Compute Resources: - Databricks - DeepInfra - Google Cloud +- Intel - Lambda Lab - Nebius - Novita AI diff --git a/docs/source/conf.py b/docs/source/conf.py index b02b84826c9f2..a83ad764125c5 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -103,6 +103,11 @@ myst_url_schemes = { "title": "Pull Request #{{path}}", "classes": ["github"], }, + "gh-project": { + "url": "https://github.com/orgs/vllm-project/projects/{{path}}", + "title": "Project #{{path}}", + "classes": ["github"], + }, "gh-dir": { "url": "https://github.com/vllm-project/vllm/tree/main/{{path}}", "title": "{{path}}", diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md index 96674805df534..90b9a33cfbe62 100644 --- a/docs/source/contributing/dockerfile/dockerfile.md +++ b/docs/source/contributing/dockerfile/dockerfile.md @@ -1,6 +1,6 @@ # Dockerfile -We provide a to construct the image for running an OpenAI compatible server with vLLM. +We provide a to construct the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found [here](#deployment-docker). Below is a visual representation of the multi-stage Dockerfile. 
The build graph contains the following nodes: @@ -28,7 +28,7 @@ The edges of the build graph represent: > Commands to regenerate the build graph (make sure to run it **from the \`root\` directory of the vLLM repository** where the dockerfile is present): > > ```bash - > dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile + > dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename docker/Dockerfile > ``` > > or in case you want to run it directly with the docker image: @@ -43,7 +43,7 @@ The edges of the build graph represent: > --output png \ > --dpi 200 \ > --max-label-length 50 \ - > --filename Dockerfile \ + > --filename docker/Dockerfile \ > --legend > ``` > diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md index 9cbfc32991f09..b42536f054d76 100644 --- a/docs/source/contributing/model/multimodal.md +++ b/docs/source/contributing/model/multimodal.md @@ -79,6 +79,17 @@ Further update the model as follows: return inputs_embeds ``` +- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model` getter to provide stable access to the underlying language model. + + ```python + class YourModelForImage2Seq(nn.Module): + ... + + def get_language_model(self) -> torch.nn.Module: + # Change `language_model` according to your implementation. + return self.language_model + ``` + - Once the above steps are done, update the model class with the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. ```diff @@ -110,17 +121,19 @@ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": 1} ``` -### Maximum number of placeholder feature tokens +## 3. Specify dummy inputs -Also, override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item` -to return the maximum number of placeholder feature tokens per input item for each modality. 
+Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for +HF processing as well as memory profiling. -When calling the model, the output embeddings from the visual encoder are assigned to the input positions -containing placeholder feature tokens. Therefore, the number of placeholder feature tokens should be equal -to the size of the output embeddings. +### For memory profiling -:::::{tab-set} -::::{tab-item} Basic example: LLaVA +Override the abstract methods {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text` and {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_mm_data` to construct dummy inputs for memory profiling. These dummy inputs should result in the worst-case memory usage of the model so that vLLM can reserve the correct amount of memory for it. + +Assuming that the memory usage increases with the number of tokens, the dummy inputs can be constructed to maximize the number of output embeddings, which is the same number as placeholder feature tokens. + +::::{tab-set} +:::{tab-item} Basic example: LLaVA :sync: llava Looking at the code of HF's `LlavaForConditionalGeneration`: @@ -229,41 +242,50 @@ def get_num_image_tokens( ``` Notice that the number of image tokens doesn't depend on the image width and height. -So, we can calculate the maximum number of image tokens using any image size: +We can simply use a dummy `image_size` to calculate the multimodal profiling data: ```python +# NOTE: In actuality, this is usually implemented as part of the +# model's subclass of `BaseProcessingInfo`, but we show it as is +# here for simplicity. 
def get_image_size_with_most_features(self) -> ImageSize: hf_config = self.get_hf_config() width = height = hf_config.image_size return ImageSize(width=width, height=height) -def get_max_image_tokens(self) -> int: - target_width, target_height = self.get_image_size_with_most_features() - - return self.get_num_image_tokens( - image_width=target_width, - image_height=target_height, - ) -``` - -And thus, we can override the method as: - -```python -def get_mm_max_tokens_per_item( +def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], -) -> Mapping[str, int]: - return {"image": self.get_max_image_tokens()} +) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + + target_width, target_height = \ + self.info.get_image_size_with_most_features() + + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } +``` + +For the text, we simply expand the multimodal image token from the model config to match the desired number of images. + +```python +def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + + processor = self.info.get_hf_processor() + image_token = processor.image_token + + return image_token * num_images ``` -:::{note} -Our [actual code](gh-file:vllm/model_executor/models/llava.py) is more abstracted to support vision encoders other than CLIP. 
::: -:::: - -::::{tab-item} Non-consecutive feature tokens: Fuyu +:::{tab-item} No input placeholders: Fuyu :sync: fuyu Looking at the code of HF's `FuyuForCausalLM`: @@ -383,213 +405,42 @@ num_patches_per_dim_w = image_width // patch_width num_patches = num_patches_per_dim_h * num_patches_per_dim_w ``` -We can calculate this in vLLM using this code: - -```python -def get_num_image_patches( - self, - *, - image_width: int, - image_height: int, -) -> int: - image_processor = self.get_image_processor() - target_width = image_processor.size["width"] - target_height = image_processor.size["height"] - patch_width = image_processor.patch_size["width"] - patch_height = image_processor.patch_size["height"] - - if not (image_width <= target_width and image_height <= target_height): - height_scale_factor = target_height / image_height - width_scale_factor = target_width / image_width - optimal_scale_factor = min(height_scale_factor, width_scale_factor) - - image_height = int(image_height * optimal_scale_factor) - image_width = int(image_width * optimal_scale_factor) - - ncols = math.ceil(image_width / patch_width) - nrows = math.ceil(image_height / patch_height) - return ncols * nrows -``` - -These image patches correspond to placeholder tokens (`|SPEAKER|`). However, the processor also -inserts newline tokens (`|NEWLINE|`) as shown here: - -```python -# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L654-L670 -tensor_of_image_ids = torch.full( - [num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device -) -patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0) -assert num_patches == patches.shape[0] - -if variable_sized: - # Now terminate each line with |NEWLINE|. 
- tensor_of_image_ids = tensor_of_image_ids.reshape(-1, image_width // patch_width) - newline_ids = torch.full( - [tensor_of_image_ids.shape[0], 1], - image_newline_id, - dtype=torch.int32, - device=image_input.device, - ) - tensor_of_image_ids = torch.cat([tensor_of_image_ids, newline_ids], dim=1) - tensor_of_image_ids = tensor_of_image_ids.reshape(-1) -``` - -So, the layout of tokens for an image is: - -``` -|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE| -|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE| -... -|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE| -``` - -This makes the placeholder tokens non-consecutive in the prompt. -Since vLLM requires the feature tokens to be consecutive, **we also treat the newline tokens as feature tokens**. - -So overall, the total number of feature tokens is - -```python -def get_num_image_tokens( - self, - *, - image_width: int, - image_height: int, -) -> int: - image_processor = self.get_image_processor() - target_width = image_processor.size["width"] - target_height = image_processor.size["height"] - patch_width = image_processor.patch_size["width"] - patch_height = image_processor.patch_size["height"] - - if not (image_width <= target_width and image_height <= target_height): - height_scale_factor = target_height / image_height - width_scale_factor = target_width / image_width - optimal_scale_factor = min(height_scale_factor, width_scale_factor) - - image_height = int(image_height * optimal_scale_factor) - image_width = int(image_width * optimal_scale_factor) - - ncols = math.ceil(image_width / patch_width) - nrows = math.ceil(image_height / patch_height) - return (ncols + 1) * nrows -``` - -To calculate the maximum number of image tokens, recall that input images are first resized -to fit within `image_processor.size`. The maximum possible dimensions of the image before -being converted into patches is therefore equal to `image_processor.size`. +These image patches correspond to placeholder tokens (`|SPEAKER|`). 
So, we just need to maximize the number of image patches. Since input images are first resized +to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`. ```python def get_image_size_with_most_features(self) -> ImageSize: image_processor = self.get_image_processor() return ImageSize(width=image_processor.size["width"], height=image_processor.size["height"]) - -def get_max_image_tokens(self) -> int: - target_width, target_height = self.get_image_size_with_most_features() - - return self.get_num_image_tokens( - image_width=target_width, - image_height=target_height, - ) ``` -And thus, we can override the method as: +Fuyu does not expect image placeholders in the inputs to HF processor, so +the dummy prompt text is empty regardless of the number of images. ```python -def get_mm_max_tokens_per_item( +def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + return "" +``` + +For the multimodal image profiling data, the logic is very similar to LLaVA: + +```python +def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], -) -> Mapping[str, int]: - return {"image": self.get_max_image_tokens()} -``` - -:::{note} -Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) returns `ncols` and `nrows` directly instead of the total token count. -This is because `ncols` and `nrows` are used to specify the layout of the feature tokens (as shown in Step 4 of this guide). -::: - -:::: -::::: - -## 3. Specify dummy inputs - -Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for -HF processing as well as memory profiling. - -### For memory profiling - -Override the abstract method {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs` -to construct dummy inputs for memory profiling. 
This dummy input should result in the worst-case memory usage of -the model so that vLLM can reserve the correct amount of memory for it. - -Assuming that the memory usage increases with the number of tokens, the dummy input can be constructed based -on the code for {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item`. - -::::{tab-set} -:::{tab-item} Basic example: LLaVA -:sync: llava - -Making use of the `get_image_size_with_most_features` method implemented in Step 2: - -```python -def get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], -) -> ProcessorInputs: +) -> MultiModalDataDict: + target_width, target_height = \ + self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) - processor = self.info.get_hf_processor() - image_token = processor.image_token - - hf_config = self.get_hf_config() - target_width, target_height = self.info.get_image_size_with_most_features() - - mm_data = { + return { "image": self._get_dummy_images(width=target_width, height=target_height, num_images=num_images) } - - return ProcessorInputs( - prompt_text=image_token * num_images, - mm_data=mm_data, - ) -``` - -::: - -:::{tab-item} No input placeholders: Fuyu -:sync: fuyu - -Fuyu does not expect image placeholders in the inputs to HF processor, so -the dummy prompt text is empty regardless of the number of images. 
-Otherwise, the logic of this method is very similar to LLaVA: - -```python -def get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], -) -> ProcessorInputs: - target_width, target_height = \ - self.info.get_image_size_with_most_features() - num_images = mm_counts.get("image", 0) - - mm_data = { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) - } - - return ProcessorInputs( - prompt_text="", - mm_data=mm_data, - ) ``` ::: @@ -860,8 +711,8 @@ prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch( ) ``` -To accommodate this, instead of a string you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails` -with different `full` and `feature` attributes: +To assign the vision embeddings to only the image tokens, instead of a string +you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`: ```python hf_config = self.info.get_hf_config() @@ -879,9 +730,9 @@ def get_replacement_fuyu(item_idx: int): image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows - return PromptUpdateDetails( - full=image_tokens + [bos_token_id], - features=image_tokens, + return PromptUpdateDetails.select_token_id( + image_tokens + [bos_token_id], + embed_token_id=_IMAGE_TOKEN_ID, ) ``` @@ -914,9 +765,9 @@ def _get_prompt_updates( image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows - return PromptUpdateDetails( - full=image_tokens + [bos_token_id], - features=image_tokens, + return PromptUpdateDetails.select_token_id( + image_tokens + [bos_token_id], + embed_token_id=_IMAGE_TOKEN_ID, ) return [ diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md index a414118316692..31c7059fda364 100644 --- a/docs/source/contributing/overview.md +++ b/docs/source/contributing/overview.md @@ -11,6 +11,15 @@ We also believe in the power of community support; thus, answering queries, 
offe Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository! +## Job Board + +Unsure on where to start? Check out the following links for tasks to work on: + +- [Good first issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22) + - [Selected onboarding tasks](gh-project:6) +- [New model requests](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22new%20model%22) + - [Models with multi-modal capabilities](gh-project:10) + ## License See . @@ -35,6 +44,12 @@ pre-commit run --all-files pytest tests/ ``` +:::{tip} +Since the `docker/Dockerfile` ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12. + +Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment. +::: + :::{note} Currently, the repository is not fully checked by `mypy`. ::: diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md index 1f60faf40879e..ca56710bc2ef2 100644 --- a/docs/source/deployment/docker.md +++ b/docs/source/deployment/docker.md @@ -19,6 +19,18 @@ $ docker run --runtime nvidia --gpus all \ --model mistralai/Mistral-7B-v0.1 ``` +This image can also be used with other container engines such as [Podman](https://podman.io/). + +```console +$ podman run --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + -p 8000:8000 \ + --ipc=host \ + vllm/vllm-openai:latest \ + --model mistralai/Mistral-7B-v0.1 +``` + You can add any other engine arguments you need after the image tag (`vllm/vllm-openai:latest`). 
:::{note} @@ -34,11 +46,11 @@ If you need to use those dependencies (having accepted the license terms), create a custom Dockerfile on top of the base image with an extra layer that installs them: ```Dockerfile -FROM vllm/vllm-openai:v0.8.0 +FROM vllm/vllm-openai:v0.8.3 -# e.g. install the `audio` and `video` optional dependencies +# e.g. install the `audio` optional dependencies # NOTE: Make sure the version of vLLM matches the base image! -RUN uv pip install vllm[audio,video]==0.8.0 +RUN uv pip install --system vllm[audio]==0.8.3 ``` ::: @@ -52,7 +64,7 @@ with an extra layer that installs their code from source: ```Dockerfile FROM vllm/vllm-openai:latest -RUN uv pip install git+https://github.com/huggingface/transformers.git +RUN uv pip install --system git+https://github.com/huggingface/transformers.git ``` ::: @@ -61,11 +73,11 @@ RUN uv pip install git+https://github.com/huggingface/transformers.git ## Building vLLM's Docker Image from Source -You can build and run vLLM from source via the provided . To build vLLM: +You can build and run vLLM from source via the provided . To build vLLM: ```console # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 -DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai +DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai --file docker/Dockerfile ``` :::{note} @@ -92,6 +104,7 @@ Keep an eye on memory usage with parallel jobs as it can be substantial (see exa # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) $ python3 use_existing_torch.py $ DOCKER_BUILDKIT=1 docker build . 
\ + --file docker/Dockerfile \ + --target vllm-openai \ + --platform "linux/arm64" \ + -t vllm/vllm-gh200-openai:latest \ diff --git a/docs/source/deployment/frameworks/index.md b/docs/source/deployment/frameworks/index.md index cb758d3e6d2e4..8be38d56cd286 100644 --- a/docs/source/deployment/frameworks/index.md +++ b/docs/source/deployment/frameworks/index.md @@ -9,6 +9,7 @@ dstack helm lws modal +open-webui skypilot triton ::: diff --git a/docs/source/deployment/frameworks/open-webui.md b/docs/source/deployment/frameworks/open-webui.md new file mode 100644 index 0000000000000..83e5303a00ef2 --- /dev/null +++ b/docs/source/deployment/frameworks/open-webui.md @@ -0,0 +1,29 @@ +(deployment-open-webui)= + +# Open WebUI + +1. Install [Docker](https://docs.docker.com/engine/install/) + +2. Start the vLLM server with the supported chat completion model, e.g. + +```console +vllm serve qwen/Qwen1.5-0.5B-Chat +``` + +3. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vllm serve host and vllm serve port): + +```console +docker run -d -p 3000:8080 \ +--name open-webui \ +-v open-webui:/app/backend/data \ +-e OPENAI_API_BASE_URL=http://<vllm-serve-host>:<vllm-serve-port>/v1 \ +--restart always \ +ghcr.io/open-webui/open-webui:main +``` + +4. Open it in the browser: + +On the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`. + +:::{image} /assets/deployment/open_webui.png +::: diff --git a/docs/source/deployment/integrations/production-stack.md b/docs/source/deployment/integrations/production-stack.md index e66e8e6a16b29..05f1568306cc9 100644 --- a/docs/source/deployment/integrations/production-stack.md +++ b/docs/source/deployment/integrations/production-stack.md @@ -16,7 +16,7 @@ Ensure that you have a running Kubernetes environment with GPU (you can follow [ ## Deployment using vLLM production stack -The standard vLLM production stack install uses a Helm chart. 
You can run this [bash script](https://github.com/vllm-project/production-stack/blob/main/tutorials/install-helm.sh) to install Helm on your GPU server. +The standard vLLM production stack is installed using a Helm chart. You can run this [bash script](https://github.com/vllm-project/production-stack/blob/main/utils/install-helm.sh) to install Helm on your GPU server. To install the vLLM production stack, run the following commands on your desktop: diff --git a/docs/source/deployment/k8s.md b/docs/source/deployment/k8s.md index 3885956791365..9079cfa8e1b66 100644 --- a/docs/source/deployment/k8s.md +++ b/docs/source/deployment/k8s.md @@ -46,6 +46,7 @@ metadata: type: Opaque data: token: $(HF_TOKEN) +EOF ``` Next, start the vLLM server as a Kubernetes Deployment and Service: diff --git a/docs/source/deployment/nginx.md b/docs/source/deployment/nginx.md index 62816f514c00e..bf404f1098c3b 100644 --- a/docs/source/deployment/nginx.md +++ b/docs/source/deployment/nginx.md @@ -69,14 +69,14 @@ server { ```console cd $vllm_root -docker build -f Dockerfile . --tag vllm +docker build -f docker/Dockerfile . --tag vllm ``` If you are behind proxy, you can pass the proxy settings to the docker build command as shown below: ```console cd $vllm_root -docker build -f Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy +docker build -f docker/Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy ``` (nginxloadbalancer-nginx-docker-network)= diff --git a/docs/source/design/mm_processing.md b/docs/source/design/mm_processing.md index 2a4dac786d4bc..dc92a3c2c511e 100644 --- a/docs/source/design/mm_processing.md +++ b/docs/source/design/mm_processing.md @@ -8,7 +8,7 @@ Here are the main features of {class}`~vllm.multimodal.processing.BaseMultiModal ## Prompt Update Detection -One of the main responsibilies of HF processor is to update the prompt with placeholder tokens. 
For example: +One of the main responsibilities of HF processor is to update the prompt with placeholder tokens. For example: - Insert feature placeholder tokens (e.g. `...`, the number of which equals to the feature size) at the start of the string. - Replace existing input placeholder tokens (e.g. `` for a single image) with feature placeholder tokens (e.g. `...`, the number of which equals to the feature size). @@ -47,7 +47,7 @@ Moreover, since the tokenized text has not passed through the HF processor, we h ### Dummy text -We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs`. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data. +We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text`. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data. (mm-automatic-prompt-updating)= diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md index 55dae0bb92d4e..43fe5fe2e5e94 100644 --- a/docs/source/design/multiprocessing.md +++ b/docs/source/design/multiprocessing.md @@ -24,7 +24,7 @@ This document describes how vLLM deals with these challenges. [Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include: - `spawn` - spawn a new Python process. This will be the default as of Python - 3.14. + 3.14. In macOS, this is already the default. - `fork` - Use `os.fork()` to fork the Python interpreter. This is the default in Python versions prior to 3.14. 
@@ -34,7 +34,7 @@ This document describes how vLLM deals with these challenges. ### Tradeoffs `fork` is the fastest method, but is incompatible with dependencies that use -threads. +threads. If you are under macOS, using `fork` may cause the process to crash. `spawn` is more compatible with dependencies, but can be problematic when vLLM is used as a library. If the consuming code does not use a `__main__` guard (`if diff --git a/docs/source/design/v1/metrics.md b/docs/source/design/v1/metrics.md index b3981b2dc24a7..3f96290798a33 100644 --- a/docs/source/design/v1/metrics.md +++ b/docs/source/design/v1/metrics.md @@ -66,8 +66,8 @@ vLLM also provides [a reference example](https://docs.vllm.ai/en/latest/getting_ The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important: - `vllm:e2e_request_latency_seconds_bucket` - End to end request latency measured in seconds -- `vllm:prompt_tokens_total` - Prompt Tokens/Sec -- `vllm:generation_tokens_total` - Generation Tokens/Sec +- `vllm:prompt_tokens_total` - Prompt Tokens +- `vllm:generation_tokens_total` - Generation Tokens - `vllm:time_per_output_token_seconds` - Inter token latency (Time Per Output Token, TPOT) in second. - `vllm:time_to_first_token_seconds` - Time to First Token (TTFT) latency in seconds. - `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in RUNNING, WAITING, and SWAPPED state @@ -86,6 +86,17 @@ See [the PR which added this Dashboard](gh-pr:2316) for interesting and useful b Prometheus support was initially added [using the aioprometheus library](gh-pr:1890), but a switch was made quickly to [prometheus_client](gh-pr:2730). The rationale is discussed in both linked PRs. 
+With the switch to `aioprometheus`, we lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](gh-pr:15657): + +```bash +$ curl http://0.0.0.0:8000/metrics 2>/dev/null | grep -P '^http_(?!.*(_bucket|_created|_sum)).*' +http_requests_total{handler="/v1/completions",method="POST",status="2xx"} 201.0 +http_request_size_bytes_count{handler="/v1/completions"} 201.0 +http_response_size_bytes_count{handler="/v1/completions"} 201.0 +http_request_duration_highr_seconds_count 201.0 +http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201.0 +``` + ### Multi-process Mode In v0, metrics are collected in the engine core process and we use multi-process mode to make them available in the API server process. See . diff --git a/docs/source/design/v1/prefix_caching.md b/docs/source/design/v1/prefix_caching.md index 3d14a76840d45..ec1f3cb8d64a8 100644 --- a/docs/source/design/v1/prefix_caching.md +++ b/docs/source/design/v1/prefix_caching.md @@ -15,12 +15,13 @@ Block 3: |<------------------ prefix -------------------->| |<--- block tokens - In the example above, the KV cache in the first block can be uniquely identified with the token “A gentle breeze stirred”. The third block can be uniquely identified with the tokens in the block “laughed in the distance”, along with the prefix tokens “A gentle breeze stirred the leaves as children”. Therefore, we can build the block hash of `hash(tuple[components])`, where components are: * Parent hash value: The hash value of the parent hash block. -* Block tokens: A tuple of tokens in this block. The reason to include the exact tokens is to reduce potential hash value collision. +* Block tokens: A tuple of tokens in this block. The reason to include the exact tokens is to reduce potential hash value collision. * Extra hashes: Other values required to make this block unique, such as LoRA IDs and multi-modality input hashes (see the example below). 
-Note 1: We only cache full blocks. +> **Note 1:** We only cache full blocks. -Note 2: The above hash key structure is not 100% collision free. Theoretically it’s still possible for the different prefix tokens to have the same hash value, but this should be nearly impossible to happen. Of course, contributions are welcome if you have an awesome idea to eliminate collusion entirely. +> **Note 2:** The above hash key structure is not 100% collision free. Theoretically it’s still possible for the different prefix tokens to have the same hash value. To avoid any hash collisions **in a multi-tenant setup, we advise to use SHA256** as hash function instead of the default builtin hash. +SHA256 is supported since vLLM v0.8.3 and must be enabled with a command line argument. It comes with a performance impact of about 100-200ns per token (~6ms for 50k tokens of context). **A hashing example with multi-modality inputs** In this example, we illustrate how prefix caching works with multi-modality inputs (e.g., images). Assuming we have a request with the following messages: diff --git a/docs/source/design/v1/torch_compile.md b/docs/source/design/v1/torch_compile.md index 0dadc8089991c..7920131643c26 100644 --- a/docs/source/design/v1/torch_compile.md +++ b/docs/source/design/v1/torch_compile.md @@ -99,7 +99,7 @@ This time, Inductor compilation is completely bypassed, and we will load from di The above example just uses Inductor to compile for a general shape (i.e. symbolic shape). We can also use Inductor to compile for some of the specific shapes, for example: -`VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.2-1B --compilation_config "{'compile_sizes': [1, 2, 4, 8]}"` +`vllm serve meta-llama/Llama-3.2-1B --compilation_config "{'compile_sizes': [1, 2, 4, 8]}"` Then it will also compile a specific kernel just for batch size `1, 2, 4, 8`. At this time, all of the shapes in the computation graph are static and known, and we will turn on auto-tuning to tune for max performance. 
This can be slow when you run it for the first time, but the next time you run it, we can directly bypass the tuning and run the tuned kernel. @@ -126,7 +126,7 @@ Unfortunately, because auto-tuning takes quite a long time (from seconds to minu ## Cudagraph Capture -vLLM's V1 architecture uses piecewise cudagraph. The full computation graph is split as mentioned above, and we only capture the cudagraph for the piece of graph between attention operations (including the first graph before any attention operation, and the last graph after all the attention operation). This is based on a common observation: computation between attentions are usually token-wise and easy to deal with for cudagraph; while the attention operation is non-trival to be cudagraph compatible. Thus, by running the attention operation in eager mode while the rest operations in cudagraph, we keep the flexibility of the attention operation. +vLLM's V1 architecture uses piecewise cudagraph. The full computation graph is split as mentioned above, and we only capture the cudagraph for the piece of graph between attention operations (including the first graph before any attention operation, and the last graph after all the attention operation). This is based on a common observation: computation between attentions are usually token-wise and easy to deal with for cudagraph; while the attention operation is non-trivial to be cudagraph compatible. Thus, by running the attention operation in eager mode while the rest operations in cudagraph, we keep the flexibility of the attention operation. The piecewise cudagraph also has fine-grained memory management. The purpose is to only exclude the attention kernel from cudagraph, while keeping all the rest modules and the memory allocation operations in the cudagraph. This is why the attention operation in V1 has the output tensor as the input of the attention. 
@@ -134,6 +134,6 @@ The cudagraphs are captured and managed by the compiler backend, and replayed wh By default, vLLM will try to determine a set of sizes to capture cudagraph. You can also override it using the config `cudagraph_capture_sizes`: -`VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.2-1B --compilation_config "{'cudagraph_capture_sizes': [1, 2, 4, 8]}"` +`vllm serve meta-llama/Llama-3.2-1B --compilation-config "{'cudagraph_capture_sizes': [1, 2, 4, 8]}"` Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture. diff --git a/docs/source/features/lora.md b/docs/source/features/lora.md index a71da72e4360a..b5b51095b3a75 100644 --- a/docs/source/features/lora.md +++ b/docs/source/features/lora.md @@ -106,19 +106,18 @@ curl http://localhost:8000/v1/completions \ ## Dynamically serving LoRA Adapters -In addition to serving LoRA adapters at server startup, the vLLM server now supports dynamically loading and unloading -LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility -to change models on-the-fly is needed. +In addition to serving LoRA adapters at server startup, the vLLM server supports dynamically configuring LoRA adapters at runtime through dedicated API endpoints and plugins. This feature can be particularly useful when the flexibility to change models on-the-fly is needed. Note: Enabling this feature in production environments is risky as users may participate in model adapter management. -To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING` -is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active. +To enable dynamic LoRA configuration, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING` +is set to `True`. 
```bash export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True ``` +### Using API Endpoints Loading a LoRA Adapter: To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary @@ -153,6 +152,58 @@ curl -X POST http://localhost:8000/v1/unload_lora_adapter \ }' ``` +### Using Plugins +Alternatively, you can use the LoRAResolver plugin to dynamically load LoRA adapters. LoRAResolver plugins enable you to load LoRA adapters from both local and remote sources such as local file system and S3. On every request, when there's a new model name that hasn't been loaded yet, the LoRAResolver will try to resolve and load the corresponding LoRA adapter. + +You can set up multiple LoRAResolver plugins if you want to load LoRA adapters from different sources. For example, you might have one resolver for local files and another for S3 storage. vLLM will load the first LoRA adapter that it finds. + +You can either install existing plugins or implement your own. + +Steps to implement your own LoRAResolver plugin: +1. Implement the LoRAResolver interface. 
+ + Example of a simple S3 LoRAResolver implementation: + + ```python + import os + import s3fs + from vllm.lora.request import LoRARequest + from vllm.lora.resolver import LoRAResolver + + class S3LoRAResolver(LoRAResolver): + def __init__(self): + self.s3 = s3fs.S3FileSystem() + self.s3_path_format = os.getenv("S3_PATH_TEMPLATE") + self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE") + + async def resolve_lora(self, base_model_name, lora_name): + s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name) + local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name) + + # Download the LoRA from S3 to the local path + await self.s3._get( + s3_path, local_path, recursive=True, maxdepth=1 + ) + + lora_request = LoRARequest( + lora_name=lora_name, + lora_path=local_path, + lora_int_id=abs(hash(lora_name)) + ) + return lora_request + ``` + +2. Register LoRAResolver plugin. + + ```python + from vllm.lora.resolver import LoRAResolverRegistry + + s3_resolver = S3LoRAResolver() + LoRAResolverRegistry.register_resolver("s3_resolver", s3_resolver) + ``` + + For more details, refer to the [vLLM's Plugins System](../design/plugin_system.md). + ## New format for `--lora-modules` In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. For example: diff --git a/docs/source/features/quantization/bitblas.md b/docs/source/features/quantization/bitblas.md new file mode 100644 index 0000000000000..2901f760d3e4c --- /dev/null +++ b/docs/source/features/quantization/bitblas.md @@ -0,0 +1,48 @@ +(bitblas)= + +# BitBLAS + +vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more efficient and flexible model inference. Compared to other quantization frameworks, BitBLAS provides more precision combinations. + +:::{note} +Ensure your hardware supports the selected `dtype` (`torch.bfloat16` or `torch.float16`). 
+Most recent NVIDIA GPUs support `float16`, while `bfloat16` is more common on newer architectures like Ampere or Hopper. +For details see [supported hardware](https://docs.vllm.ai/en/latest/features/quantization/supported_hardware.html). +::: + +Below are the steps to utilize BitBLAS with vLLM. + +```console +pip install bitblas>=0.1.0 +``` + +vLLM reads the model's config file and supports pre-quantized checkpoints. + +You can find pre-quantized models on: + +- [Hugging Face (BitBLAS)](https://huggingface.co/models?other=bitblas) +- [Hugging Face (GPTQ)](https://huggingface.co/models?other=gptq) + +Usually, these repositories have a `quantize_config.json` file that includes a `quantization_config` section. + +## Read bitblas format checkpoint + +```python +from vllm import LLM +import torch + +# "hxbgsyxh/llama-13b-4bit-g-1-bitblas" is a pre-quantized checkpoint. +model_id = "hxbgsyxh/llama-13b-4bit-g-1-bitblas" +llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, quantization="bitblas") +``` + +## Read gptq format checkpoint + +```python +from vllm import LLM +import torch + +# "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint. +model_id = "hxbgsyxh/llama-13b-4bit-g-1" +llm = LLM(model=model_id, dtype=torch.float16, trust_remote_code=True, quantization="bitblas", max_model_len=1024) +``` diff --git a/docs/source/features/quantization/bnb.md b/docs/source/features/quantization/bnb.md index fc499e7692d98..e356b99d85cdf 100644 --- a/docs/source/features/quantization/bnb.md +++ b/docs/source/features/quantization/bnb.md @@ -19,17 +19,20 @@ And usually, these repositories have a config.json file that includes a quantiza ## Read quantized checkpoint +For pre-quantized checkpoints, vLLM will try to infer the quantization method from the config file, so you don't need to explicitly specify the quantization argument. + ```python from vllm import LLM import torch # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint. 
model_id = "unsloth/tinyllama-bnb-4bit" -llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ -quantization="bitsandbytes") +llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True) ``` ## Inflight quantization: load as 4bit quantization +For inflight 4bit quantization with BitsAndBytes, you need to explicitly specify the quantization argument. + ```python from vllm import LLM import torch @@ -40,7 +43,7 @@ quantization="bitsandbytes") ## OpenAI Compatible Server -Append the following to your 4bit model arguments: +Append the following to your model arguments for 4bit inflight quantization: ```console --quantization bitsandbytes diff --git a/docs/source/features/quantization/gguf.md b/docs/source/features/quantization/gguf.md index 4b1ff4a22a23b..e93e4dcd3b578 100644 --- a/docs/source/features/quantization/gguf.md +++ b/docs/source/features/quantization/gguf.md @@ -29,7 +29,7 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlam We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size. ::: -GGUF assumes that huggingface can convert the metadata to a config file. In case huggingface doesn't support your model you can manually create a config and pass it as hf-confing-path +GGUF assumes that huggingface can convert the metadata to a config file. 
In case huggingface doesn't support your model you can manually create a config and pass it as hf-config-path ```console # If your model is not supported by huggingface you can manually provide a huggingface compatible config path diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md index 65f438f599f19..c7c8aeb662a56 100644 --- a/docs/source/features/quantization/index.md +++ b/docs/source/features/quantization/index.md @@ -11,10 +11,13 @@ Quantization trades off model precision for smaller memory footprint, allowing l supported_hardware auto_awq bnb +bitblas gguf gptqmodel int4 int8 fp8 +quark quantized_kvcache +torchao ::: diff --git a/docs/source/features/quantization/quark.md b/docs/source/features/quantization/quark.md new file mode 100644 index 0000000000000..935ee37a815ff --- /dev/null +++ b/docs/source/features/quantization/quark.md @@ -0,0 +1,217 @@ +(quark)= + +# AMD QUARK + +Quantization can effectively reduce memory and bandwidth usage, accelerate computation and improve +throughput with minimal accuracy loss. vLLM can leverage [Quark](https://quark.docs.amd.com/latest/), +the flexible and powerful quantization toolkit, to produce performant quantized models to run on AMD GPUs. Quark has specialized support for quantizing large language models with weight, +activation and kv-cache quantization and cutting-edge quantization algorithms like +AWQ, GPTQ, Rotation and SmoothQuant. + +## Quark Installation + +Before quantizing models, you need to install Quark. The latest release of Quark can be installed with pip: + +```console +pip install amd-quark +``` + +You can refer to [Quark installation guide](https://quark.docs.amd.com/latest/install.html) +for more installation details. + +## Quantization Process + +After installing Quark, we will use an example to illustrate how to use Quark. +The Quark quantization process can be listed in the 5 steps below: + +1. Load the model +2. 
Prepare the calibration dataloader +3. Set the quantization configuration +4. Quantize the model and export +5. Evaluation in vLLM + +### 1. Load the Model + +Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index) +to fetch model and tokenizer. + +```python +from transformers import AutoTokenizer, AutoModelForCausalLM + +MODEL_ID = "meta-llama/Llama-2-70b-chat-hf" +MAX_SEQ_LEN = 512 + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto", +) +model.eval() + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN) +tokenizer.pad_token = tokenizer.eos_token +``` + +### 2. Prepare the Calibration Dataloader + +Quark uses the [PyTorch Dataloader](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html) +to load calibration data. For more details about how to use calibration datasets efficiently, please refer +to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html). + +```python +from datasets import load_dataset +from torch.utils.data import DataLoader + +BATCH_SIZE = 1 +NUM_CALIBRATION_DATA = 512 + +# Load the dataset and get calibration data. +dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation") +text_data = dataset["text"][:NUM_CALIBRATION_DATA] + +tokenized_outputs = tokenizer(text_data, return_tensors="pt", + padding=True, truncation=True, max_length=MAX_SEQ_LEN) +calib_dataloader = DataLoader(tokenized_outputs['input_ids'], + batch_size=BATCH_SIZE, drop_last=True) +``` + +### 3. Set the Quantization Configuration + +We need to set the quantization configuration, you can check +[quark config guide](https://quark.docs.amd.com/latest/pytorch/user_guide_config_description.html) +for further details. Here we use FP8 per-tensor quantization on weight, activation, +kv-cache and the quantization algorithm is AutoSmoothQuant. 
+ +:::{note} +Note the quantization algorithm needs a JSON config file and the config file is located in +[Quark Pytorch examples](https://quark.docs.amd.com/latest/pytorch/pytorch_examples.html), +under the directory `examples/torch/language_modeling/llm_ptq/models`. For example, +AutoSmoothQuant config file for Llama is +`examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`. +::: + +```python +from quark.torch.quantization import (Config, QuantizationConfig, + FP8E4M3PerTensorSpec, + load_quant_algo_config_from_file) + +# Define fp8/per-tensor/static spec. +FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max", + is_dynamic=False).to_quantization_spec() + +# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC. +global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC, + weight=FP8_PER_TENSOR_SPEC) + +# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC. +KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC +kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"] +kv_cache_quant_config = {name : + QuantizationConfig(input_tensors=global_quant_config.input_tensors, + weight=global_quant_config.weight, + output_tensors=KV_CACHE_SPEC) + for name in kv_cache_layer_names_for_llama} +layer_quant_config = kv_cache_quant_config.copy() + +# Define algorithm config by config file. +LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = + 'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json' +algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE) + +EXCLUDE_LAYERS = ["lm_head"] +quant_config = Config( + global_quant_config=global_quant_config, + layer_quant_config=layer_quant_config, + kv_cache_quant_config=kv_cache_quant_config, + exclude=EXCLUDE_LAYERS, + algo_config=algo_config) +``` + +### 4. Quantize the Model and Export + +Then we can apply the quantization. 
After quantizing, we need to freeze the +quantized model first before exporting. Note that we need to export model with format of +HuggingFace `safetensors`, you can refer to +[HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html) +for more exporting format details. + +```python +import torch +from quark.torch import ModelQuantizer, ModelExporter +from quark.torch.export import ExporterConfig, JsonExporterConfig + +# Apply quantization. +quantizer = ModelQuantizer(quant_config) +quant_model = quantizer.quantize_model(model, calib_dataloader) + +# Freeze quantized model to export. +freezed_model = quantizer.freeze(model) + +# Define export config. +LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"] +export_config = ExporterConfig(json_export_config=JsonExporterConfig()) +export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP + +EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant" +exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR) +with torch.no_grad(): + exporter.export_safetensors_model(freezed_model, + quant_config=quant_config, tokenizer=tokenizer) +``` + +### 5. Evaluation in vLLM + +Now, you can load and run the Quark quantized model directly through the LLM entrypoint: + +```python +from vllm import LLM, SamplingParams + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# Create an LLM. +llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant", + kv_cache_dtype='fp8',quantization='quark') +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. 
+print("\nGenerated Outputs:\n" + "-" * 60) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}") + print(f"Output: {generated_text!r}") + print("-" * 60) +``` + +Or, you can use `lm_eval` to evaluate accuracy: + +```console +$ lm_eval --model vllm \ + --model_args pretrained=Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant,kv_cache_dtype='fp8',quantization='quark' \ + --tasks gsm8k +``` + +## Quark Quantization Script +In addition to the example of Python API above, Quark also offers a +[quantization script](https://quark.docs.amd.com/latest/pytorch/example_quark_torch_llm_ptq.html) +to quantize large language models more conveniently. It supports quantizing models with variety +of different quantization schemes and optimization algorithms. It can export the quantized model +and run evaluation tasks on the fly. With the script, the example above can be: + +```console +python3 quantize_quark.py --model_dir meta-llama/Llama-2-70b-chat-hf \ + --output_dir /path/to/output \ + --quant_scheme w_fp8_a_fp8 \ + --kv_cache_dtype fp8 \ + --quant_algo autosmoothquant \ + --num_calib_data 512 \ + --model_export hf_format \ + --tasks gsm8k +``` diff --git a/docs/source/features/quantization/supported_hardware.md b/docs/source/features/quantization/supported_hardware.md index a5bd8caf77cd0..984e6626e2417 100644 --- a/docs/source/features/quantization/supported_hardware.md +++ b/docs/source/features/quantization/supported_hardware.md @@ -62,7 +62,7 @@ The table below shows the compatibility of various quantization implementations * ❌ * ✅︎ * ❌ - * ❌ + * ✅︎ - * FP8 (W8A8) * ❌ * ❌ @@ -74,6 +74,17 @@ The table below shows the compatibility of various quantization implementations * ❌ * ❌ * ❌ +- * BitBLAS (GPTQ) + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ❌ + * ❌ + * ❌ + * ❌ - * AQLM * ✅︎ * ✅︎ diff --git a/docs/source/features/quantization/torchao.md b/docs/source/features/quantization/torchao.md 
new file mode 100644 index 0000000000000..9a85f0bab9ec7 --- /dev/null +++ b/docs/source/features/quantization/torchao.md @@ -0,0 +1,34 @@ +# TorchAO + +TorchAO is an architecture optimization library for PyTorch; it provides high performance dtypes, optimization techniques and kernels for inference and training, featuring composability with native PyTorch features like torch.compile, FSDP etc. Some benchmark numbers can be found [here](https://github.com/pytorch/ao/tree/main/torchao/quantization#benchmarks). + +We recommend installing the latest torchao nightly with + +```console +# Install the latest TorchAO nightly build +# Choose the CUDA version that matches your system (cu126, cu128, etc.) +pip install --pre "torchao>=10.0.0" --index-url https://download.pytorch.org/whl/nightly/cu126 +``` + +## Quantizing HuggingFace Models +You can quantize your own huggingface model with torchao, e.g. [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to huggingface hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) with the following example code: + +```Python +import torch +from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer +from torchao.quantization import Int8WeightOnlyConfig + +model_name = "meta-llama/Meta-Llama-3-8B" +quantization_config = TorchAoConfig(Int8WeightOnlyConfig()) +quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config) +tokenizer = AutoTokenizer.from_pretrained(model_name) +input_text = "What are we having for dinner?" 
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") + +hub_repo = "YOUR_HUB_REPO_ID"  # e.g. "your-username/llama3-8b-int8wo" +tokenizer.push_to_hub(hub_repo) +quantized_model.push_to_hub(hub_repo, safe_serialization=False) +``` + +Alternatively, you can use the TorchAO Quantization space for quantizing models with a simple UI. +See: https://huggingface.co/spaces/medmekk/TorchAO_Quantization diff --git a/docs/source/features/reasoning_outputs.md b/docs/source/features/reasoning_outputs.md index 0b170aadc3443..3a0be69f8e1c6 100644 --- a/docs/source/features/reasoning_outputs.md +++ b/docs/source/features/reasoning_outputs.md @@ -4,7 +4,7 @@ vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions. -Reasoning models return a additional `reasoning_content` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models. +Reasoning models return an additional `reasoning_content` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models. ## Supported Models @@ -14,6 +14,9 @@ vLLM currently supports the following reasoning models: |--------------|-------------|------------------|-------------| | [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ | | [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ | +| [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ | + +- IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. 
## Quickstart @@ -43,6 +46,7 @@ model = models.data[0].id # Round 1 messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] +# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` response = client.chat.completions.create(model=model, messages=messages) reasoning_content = response.choices[0].message.reasoning_content @@ -97,6 +101,7 @@ models = client.models.list() model = models.data[0].id messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] +# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` stream = client.chat.completions.create(model=model, messages=messages, stream=True) @@ -131,7 +136,14 @@ Remember to check whether the `reasoning_content` exists in the response before ## Structured output -The reasoning content is also available in the structured output. The structured output engine like `xgrammar` will use the reasoning content to generate structured output. +The reasoning content is also available in the structured output. The structured output engine like `xgrammar` will use the reasoning content to generate structured output. It is only supported in v0 engine now. + +```bash +VLLM_USE_V1=0 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ + --enable-reasoning --reasoning-parser deepseek_r1 +``` + +Please note that the `VLLM_USE_V1` environment variable must be set to `0` to use the v0 engine. ```python from openai import OpenAI diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md index 3e1f1d5be7523..f16e0d96522da 100644 --- a/docs/source/features/spec_decode.md +++ b/docs/source/features/spec_decode.md @@ -52,7 +52,7 @@ python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model ``` :::{warning} -Note: Please use `--speculative_config` to set all configurations related to speculative decoding. 
The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately will be deprecated in the next release. +Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately has been deprecated now. ::: Then use a client: diff --git a/docs/source/features/tool_calling.md b/docs/source/features/tool_calling.md index 2e1081bf8d14b..57888e122969d 100644 --- a/docs/source/features/tool_calling.md +++ b/docs/source/features/tool_calling.md @@ -1,6 +1,6 @@ # Tool Calling -vLLM currently supports named function calling, as well as the `auto` and `none` options for the `tool_choice` field in the chat completion API. The `tool_choice` option `required` is **not yet supported** but [on the roadmap](gh-issue:13002). +vLLM currently supports named function calling, as well as the `auto`, `required` (as of `vllm>=0.8.3`) and `none` options for the `tool_choice` field in the chat completion API. ## Quickstart @@ -91,6 +91,12 @@ For best results, we recommend ensuring that the expected output format / schema To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request. +## Required Function Calling + +vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The required guided decoding features (JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. 
However, support for alternative decoding backends is on the [roadmap](https://docs.vllm.ai/en/latest/getting_started/v1_user_guide.html#feature-model) for the V1 engine. + +When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter. + ## Automatic Function Calling To enable this feature, you should set the following flags: @@ -146,12 +152,13 @@ Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_cha Supported models: -* `meta-llama/Meta-Llama-3.1-8B-Instruct` -* `meta-llama/Meta-Llama-3.1-70B-Instruct` -* `meta-llama/Meta-Llama-3.1-405B-Instruct` -* `meta-llama/Meta-Llama-3.1-405B-Instruct-FP8` +All Llama 3.1 and 3.2 models should be supported. + +* `meta-llama/Llama-3.1-*` +* `meta-llama/Llama-3.2-*` + +The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below. -The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) in Llama-3.2 models, see the `pythonic` tool parser below. Other tool calling formats like the built in python tool calling or custom tool calling are not supported. Known issues: @@ -160,10 +167,14 @@ Known issues: 2. 
The model can generate parameters with a wrong format, such as generating an array serialized as string instead of an array. -The `tool_chat_template_llama3_json.jinja` file contains the "official" Llama chat template, but tweaked so that -it works better with vLLM. +VLLM provides two JSON based chat templates for Llama 3.1 and 3.2: -Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool_chat_template_llama3_json.jinja` +* `examples/tool_chat_template_llama3.1_json.jinja` - this is the "official" chat template for the Llama 3.1 +models, but tweaked so that it works better with vLLM. +* `examples/tool_chat_template_llama3.2_json.jinja` - this extends upon the Llama 3.1 chat template by adding support for +images. + +Recommended flags: `--tool-call-parser llama3_json --chat-template {see_above}` #### IBM Granite @@ -239,6 +250,8 @@ Example supported models: * `meta-llama/Llama-3.2-3B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) * `Team-ACE/ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) * `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) +* `meta-llama/Llama-4-Scout-17B-16E-Instruct`\* (use with `examples/tool_chat_template_llama4_pythonic.jinja`) +* `meta-llama/Llama-4-Maverick-17B-128E-Instruct`\* (use with `examples/tool_chat_template_llama4_pythonic.jinja`) Flags: `--tool-call-parser pythonic --chat-template {see_above}` diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index 1206d5fe75390..f77dbefb0a018 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -17,6 +17,7 @@ def fix_case(text: str) -> str: "cli": "CLI", "cpu": "CPU", "llm": "LLM", + "mae": "MAE", "tpu": "TPU", "aqlm": "AQLM", "gguf": "GGUF", @@ -24,6 +25,7 @@ def fix_case(text: str) -> str: "rlhf": "RLHF", "vllm": "vLLM", "openai": "OpenAI", + "lmcache": "LMCache", "multilora": "MultiLoRA", "mlpspeculator": 
"MLPSpeculator", r"fp\d+": lambda x: x.group(0).upper(), # e.g. fp16, fp32 diff --git a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md index e91ed6fbd7a88..78938de317c48 100644 --- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md +++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md @@ -13,11 +13,11 @@ There are no pre-built wheels or images for this device, so you must build vLLM - Intel Gaudi accelerator - Intel Gaudi software version 1.18.0 -Please follow the instructions provided in the [Gaudi Installation -Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) +Please follow the instructions provided in the +[Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) to set up the execution environment. To achieve the best performance, -please follow the methods outlined in the [Optimizing Training Platform -Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). +please follow the methods outlined in the +[Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). ## Configure a new environment @@ -32,15 +32,13 @@ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloade pip list | grep neural # verify that neural_compressor is installed ``` -Refer to [Intel Gaudi Software Stack -Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) +Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) for more details. ### Run Docker Image It is highly recommended to use the latest Docker image from Intel Gaudi -vault. 
Refer to the [Intel Gaudi -documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) +vault. Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) for more details. Use the following commands to run a Docker image: @@ -86,7 +84,7 @@ Currently, there are no pre-built Intel Gaudi images. ### Build image from source ```console -docker build -f Dockerfile.hpu -t vllm-hpu-env . +docker build -f docker/Dockerfile.hpu -t vllm-hpu-env . docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env ``` @@ -278,8 +276,9 @@ Lower value corresponds to less usable graph memory reserved for prefill stage, ::: User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented: -\- `max_bs` - graph capture queue will sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode -\- `min_tokens` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (`batch_size*sequence_length`), default strategy for prompt + +- `max_bs` - graph capture queue will sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. 
`(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode +- `min_tokens` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (`batch_size*sequence_length`), default strategy for prompt When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in `min_tokens` strategy. @@ -326,8 +325,7 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi - We recommend running inference on Gaudi 2 with `block_size` of 128 for BF16 data type. Using default values (16, 32) might lead to sub-optimal performance due to Matrix Multiplication Engine - under-utilization (see [Gaudi - Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). + under-utilization (see [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). - For max throughput on Llama 7B, we recommend running with batch size of 128 or 256 and max context length of 2048 with HPU Graphs enabled. If you encounter out-of-memory issues, see troubleshooting section. @@ -336,11 +334,11 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi **Diagnostic and profiling knobs:** -- `VLLM_PROFILER_ENABLED`: if `true`, high level profiler will be enabled. 
Resulting JSON traces can be viewed in [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). Disabled by default. -- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: if `true`, will log graph compilations per each vLLM engine step, only when there was any - highly recommended to use alongside `PT_HPU_METRICS_GC_DETAILS=1`. Disabled by default. -- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true`, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default. -- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true`, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default. -- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default. +- `VLLM_PROFILER_ENABLED`: If `true`, enable the high level profiler. Resulting JSON traces can be viewed in [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). `false` by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: If `true`, log graph compilations for each vLLM engine step when any occurs. Highly recommended to use with `PT_HPU_METRICS_GC_DETAILS=1`. `false` by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: If `true`, always log graph compilations for each vLLM engine step even if none occurred. `false` by default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: If `true`, log CPU fallbacks for each vLLM engine step when any occurs. `false` by default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, always log CPU fallbacks for each vLLM engine step even if none occurred. `false` by default. 
**Performance tuning knobs:** @@ -381,7 +379,7 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: -- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will be used, `1` is default +- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used; if `1`, PyTorch Lazy backend for Gaudi will be used. `1` is default. - `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs ## Troubleshooting: tweaking HPU graphs diff --git a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md index 4c668a8e68927..b4bfb696faa28 100644 --- a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md +++ b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md @@ -132,7 +132,7 @@ Currently, there are no pre-built Neuron images. See for instructions on building the Docker image. -Make sure to use in place of the default Dockerfile. +Make sure to use in place of the default Dockerfile. ## Extra information diff --git a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md index 6c7bbf6024992..beb803cf05978 100644 --- a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md +++ b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md @@ -169,10 +169,10 @@ See for instructions on using the o ### Build image from source -You can use to build a Docker image with TPU support. +You can use to build a Docker image with TPU support. ```console -docker build -f Dockerfile.tpu -t vllm-tpu . +docker build -f docker/Dockerfile.tpu -t vllm-tpu . 
``` Run the Docker image with the following command: diff --git a/docs/source/getting_started/installation/cpu.md b/docs/source/getting_started/installation/cpu.md index 1b2ffd6199945..2c0ec60d7100f 100644 --- a/docs/source/getting_started/installation/cpu.md +++ b/docs/source/getting_started/installation/cpu.md @@ -159,26 +159,45 @@ Currently, there are no pre-built CPU wheels. ### Pre-built images -Currently, there are no pre-build CPU images. +:::::{tab-set} +:sync-group: device + +::::{tab-item} Intel/AMD x86 +:sync: x86 + +:::{include} cpu/x86.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +::: + +:::: + +::::: ### Build image from source ```console -$ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g . -$ docker run -it \ - --rm \ - --network=host \ - --cpuset-cpus= \ - --cpuset-mems= \ - vllm-cpu-env +$ docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai . + +# Launching OpenAI server +$ docker run --rm \ + --privileged=true \ + --shm-size=4g \ + -p 8000:8000 \ + -e VLLM_CPU_KVCACHE_SPACE= \ + -e VLLM_CPU_OMP_THREADS_BIND= \ + vllm-cpu-env \ + --model=meta-llama/Llama-3.2-1B-Instruct \ + --dtype=bfloat16 \ + other vLLM OpenAI server arguments ``` ::::{tip} -For ARM or Apple silicon, use `Dockerfile.arm` +For ARM or Apple silicon, use `docker/Dockerfile.arm` :::: ::::{tip} -For IBM Z (s390x), use `Dockerfile.s390x` and in `docker run` use flag `--dtype float` +For IBM Z (s390x), use `docker/Dockerfile.s390x` and in `docker run` use flag `--dtype float` :::: ## Supported features @@ -253,12 +272,14 @@ $ python examples/offline_inference/basic/basic.py - Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. 
However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance. -- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.inc.md#non-uniform-memory-access-numa). For NUMA architecture, two optimizations are to recommended: Tensor Parallel or Data Parallel. +- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.md#non-uniform-memory-access-numa). For NUMA architecture, Tensor Parallel is an option for better performance. - - Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With [TP feature on CPU](gh-pr:6125) merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving: + - Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving: ```console VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp ``` - - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. 
Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.inc.md). + - For each thread id list in `VLLM_CPU_OMP_THREADS_BIND`, users should guarantee threads in the list belong to the same NUMA node. + + - Meanwhile, users should also take care of memory capacity of each NUMA node. The memory usage of each TP rank is the sum of `weight shard size` and `VLLM_CPU_KVCACHE_SPACE`; if it exceeds the capacity of a single NUMA node, the TP worker will be killed due to out-of-memory. diff --git a/docs/source/getting_started/installation/cpu/build.inc.md b/docs/source/getting_started/installation/cpu/build.inc.md index 39d9dfbd2b2e2..f385f3d5b1984 100644 --- a/docs/source/getting_started/installation/cpu/build.inc.md +++ b/docs/source/getting_started/installation/cpu/build.inc.md @@ -2,7 +2,7 @@ First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as ```console sudo apt-get update -y -sudo apt-get install -y gcc-12 g++-12 libnuma-dev +sudo apt-get install -y gcc-12 g++-12 libnuma-dev python3-dev sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 ``` @@ -26,3 +26,9 @@ Finally, build and install vLLM CPU backend: ```console VLLM_TARGET_DEVICE=cpu python setup.py install ``` + +If you want to develop vllm, install it in editable mode instead. 
+ +```console +VLLM_TARGET_DEVICE=cpu python setup.py develop +``` diff --git a/docs/source/getting_started/installation/cpu/x86.inc.md b/docs/source/getting_started/installation/cpu/x86.inc.md index b2f3bafb4e511..9ae2035db5433 100644 --- a/docs/source/getting_started/installation/cpu/x86.inc.md +++ b/docs/source/getting_started/installation/cpu/x86.inc.md @@ -34,6 +34,8 @@ There are no pre-built wheels or images for this device, so you must build vLLM ### Pre-built images +See [https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo) + ### Build image from source ## Extra information diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/source/getting_started/installation/gpu/cuda.inc.md index d3e375aec10cb..cd39d6376fe3c 100644 --- a/docs/source/getting_started/installation/gpu/cuda.inc.md +++ b/docs/source/getting_started/installation/gpu/cuda.inc.md @@ -46,7 +46,7 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe ##### Install the latest code using `pip` ```console -pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly +pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly ``` `--pre` is required for `pip` to consider pre-released versions. @@ -65,9 +65,11 @@ Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.p Another way to install the latest code is to use `uv`: ```console -uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly +uv pip install -U vllm --extra-index-url https://wheels.vllm.ai/nightly ``` +##### Install specific revisions using `uv` + If you want to access the wheels for previous commits (e.g. 
to bisect the behavior change, performance regression), you can specify the commit hash in the URL: ```console diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md index 4381cef5e96a3..21c8d7d01adeb 100644 --- a/docs/source/getting_started/installation/gpu/rocm.inc.md +++ b/docs/source/getting_started/installation/gpu/rocm.inc.md @@ -8,7 +8,7 @@ There are no pre-built wheels for this device, so you must either use the pre-bu ## Requirements -- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) +- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100/1101), Radeon RX 9000 series (gfx1200/1201) - ROCm 6.3 ## Set up using Python @@ -31,7 +31,7 @@ Currently, there are no pre-built ROCm wheels. ```console # Install PyTorch $ pip uninstall torch -y - $ pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/rocm6.3 + $ pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 ``` 1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton) @@ -123,7 +123,7 @@ Building the Docker image from source is the recommended way to use vLLM with RO #### (Optional) Build an image with ROCm software stack -Build a docker image from which setup ROCm software stack needed by the vLLM. +Build a docker image from which setup ROCm software stack needed by the vLLM. **This step is optional as this rocm_base image is usually prebuilt and store at [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev) under tag `rocm/vllm-dev:base` to speed up user experience.** If you choose to build this rocm_base image yourself, the steps are as follows. @@ -140,12 +140,12 @@ It is important that the user kicks off the docker build using buildkit. 
Either To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default: ```console -DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm_base -t rocm/vllm-dev:base . +DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm_base -t rocm/vllm-dev:base . ``` #### Build an image with vLLM -First, build a docker image from and launch a docker container from the image. +First, build a docker image from and launch a docker container from the image. It is important that the user kicks off the docker build using buildkit. Either the user put `DOCKER_BUILDKIT=1` as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: ```console @@ -156,10 +156,10 @@ It is important that the user kicks off the docker build using buildkit. Either } ``` - uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches. + uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches. It provides flexibility to customize the build of docker image using the following arguments: -- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using +- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. 
It is being built using - `USE_CYTHON`: An option to run cython compilation on a subset of python files upon docker build - `BUILD_RPD`: Include RocmProfileData profiling tool in the image - `ARG_PYTORCH_ROCM_ARCH`: Allows to override the gfx architecture values from the base docker image @@ -169,13 +169,13 @@ Their values can be passed in when running `docker build` with `--build-arg` opt To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default: ```console -DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . +DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm . ``` To build vllm on ROCm 6.3 for Radeon RX7900 series (gfx1100), you should pick the alternative base image: ```console -DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f Dockerfile.rocm -t vllm-rocm . +DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f docker/Dockerfile.rocm -t vllm-rocm . ``` To run the above docker image `vllm-rocm`, use the below command: diff --git a/docs/source/getting_started/installation/gpu/xpu.inc.md b/docs/source/getting_started/installation/gpu/xpu.inc.md index 84a9b387789c7..fbf5421eeec5b 100644 --- a/docs/source/getting_started/installation/gpu/xpu.inc.md +++ b/docs/source/getting_started/installation/gpu/xpu.inc.md @@ -23,6 +23,8 @@ Currently, there are no pre-built XPU wheels. - Second, install Python packages for vLLM XPU backend building: ```console +git clone https://github.com/vllm-project/vllm.git +cd vllm pip install --upgrade pip pip install -v -r requirements/xpu.txt ``` @@ -54,7 +56,7 @@ Currently, there are no pre-built XPU images. ### Build image from source ```console -$ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . +$ docker build -f docker/Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . 
$ docker run -it \ --rm \ --network=host \ diff --git a/docs/source/getting_started/installation/python_env_setup.inc.md b/docs/source/getting_started/installation/python_env_setup.inc.md index 6ea44c36db324..a03d35030fe8a 100644 --- a/docs/source/getting_started/installation/python_env_setup.inc.md +++ b/docs/source/getting_started/installation/python_env_setup.inc.md @@ -1,4 +1,4 @@ -You can create a new Python environment using `conda`: +You can create a new Python environment using [conda](https://docs.conda.io/projects/conda/en/stable/user-guide/getting-started.html): ```console # (Recommended) Create a new conda environment. diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index b5246c41883ea..25189b006c260 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -208,5 +208,5 @@ Currently, vLLM supports multiple backends for efficient Attention computation a If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`. ```{attention} -There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see [Dockerfile](https://github.com/vllm-project/vllm/blob/main/Dockerfile) for instructions on how to install it. +There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see for instructions on how to install it. 
``` diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md index fdfaf9f932698..a4744827f2268 100644 --- a/docs/source/getting_started/troubleshooting.md +++ b/docs/source/getting_started/troubleshooting.md @@ -24,7 +24,15 @@ To isolate the model downloading and loading issue, you can use the `--load-form ## Out of memory -If the model is too large to fit in a single GPU, you will get an out-of-memory (OOM) error. Consider [using tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. +If the model is too large to fit in a single GPU, you will get an out-of-memory (OOM) error. Consider adopting [these options](#reducing-memory-usage) to reduce the memory consumption. + +## Generation quality changed + +In v0.8.0, the source of default sampling parameters was changed in . Prior to v0.8.0, the default sampling parameters came from vLLM's set of neutral defaults. From v0.8.0 onwards, the default sampling parameters come from the `generation_config.json` provided by the model creator. + +In most cases, this should lead to higher quality responses, because the model creator is likely to know which sampling parameters are best for their model. However, in some cases the defaults provided by the model creator can lead to degraded performance. + +You can check if this is happening by trying the old defaults with `--generation-config vllm` for online and `generation_config="vllm"` for offline. 
If, after trying this, your generation quality improves, we would recommend continuing to use the vLLM defaults and petition the model creator to update their default `generation_config.json` so that it produces better quality generations. ## Enable more logging diff --git a/docs/source/getting_started/v1_user_guide.md b/docs/source/getting_started/v1_user_guide.md index b1c2807657ffa..a87484c3bb042 100644 --- a/docs/source/getting_started/v1_user_guide.md +++ b/docs/source/getting_started/v1_user_guide.md @@ -47,9 +47,9 @@ This living user guide outlines a few known **important changes and limitations* | **Logprobs Calculation** | 🟢 Functional | | **LoRA** | 🟢 Functional ([PR #13096](https://github.com/vllm-project/vllm/pull/13096))| | **Multimodal Models** | 🟢 Functional | +| **FP8 KV Cache** | 🟢 Functional on Hopper devices ([PR #15191](https://github.com/vllm-project/vllm/pull/15191))| | **Spec Decode** | 🚧 WIP ([PR #13933](https://github.com/vllm-project/vllm/pull/13933))| | **Prompt Logprobs with Prefix Caching** | 🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))| -| **FP8 KV Cache** | 🟡 Planned | | **Structured Output Alternative Backends** | 🟡 Planned | | **Embedding Models** | 🟡 Planned ([RFC #12249](https://github.com/vllm-project/vllm/issues/12249)) | | **Mamba Models** | 🟡 Planned | @@ -134,8 +134,6 @@ in progress. #### Features to Be Supported -- **FP8 KV Cache**: While vLLM V1 introduces new FP8 kernels for model weight quantization, support for an FP8 key–value cache is not yet available. Users must continue using FP16 (or other supported precisions) for the KV cache. - - **Structured Output Alternative Backends**: Structured output alternative backends (outlines, guidance) support is planned. V1 currently supports only the `xgrammar:no_fallback` mode, meaning that it will error out if the output schema is unsupported by xgrammar. 
Details about the structured outputs can be found @@ -158,10 +156,3 @@ vLLM V1 is currently optimized for decoder-only transformers. Models requiring cross-attention between separate encoder and decoder are not yet supported (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`). For a complete list of supported models, see the [list of supported models](https://docs.vllm.ai/en/latest/models/supported_models.html). - -## Frequently Asked Questions - -**I'm using vLLM V1 and I'm getting CUDA OOM errors. What should I do?** -The default `max_num_seqs` has been raised from `256` in V0 to `1024` in V1. If you encounter CUDA OOM only when using V1 engine, try setting a lower value of `max_num_seqs` or `gpu_memory_utilization`. - -On the other hand, if you get an error about insufficient memory for the cache blocks, you should increase `gpu_memory_utilization` as this indicates that your GPU has sufficient memory but you're not allocating enough to vLLM for KV cache blocks. diff --git a/docs/source/index.md b/docs/source/index.md index 1624d5cf5aae7..28dc0f67d7746 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -43,7 +43,7 @@ vLLM is flexible and easy to use with: - Tensor parallelism and pipeline parallelism support for distributed inference - Streaming outputs - OpenAI-compatible API server -- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators. +- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, IBM Power CPUs, TPU, and AWS Trainium and Inferentia Accelerators. 
- Prefix caching support - Multi-lora support @@ -77,9 +77,9 @@ getting_started/v1_user_guide :caption: Models :maxdepth: 1 +models/supported_models models/generative_models models/pooling_models -models/supported_models models/extensions/index ::: diff --git a/docs/source/models/extensions/fastsafetensor.md b/docs/source/models/extensions/fastsafetensor.md index 66cd710c97e9f..531d58690014e 100644 --- a/docs/source/models/extensions/fastsafetensor.md +++ b/docs/source/models/extensions/fastsafetensor.md @@ -1,5 +1,5 @@ Loading Model weights with fastsafetensors =================================================================== -Using fastsafetensor library enables loading model weights to GPU memory by leveraging GPU direct storage. See https://github.com/foundation-model-stack/fastsafetensors for more details. +Using fastsafetensors library enables loading model weights to GPU memory by leveraging GPU direct storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details. For enabling this feature, set the environment variable ``USE_FASTSAFETENSOR`` to ``true`` diff --git a/docs/source/models/extensions/tensorizer.md b/docs/source/models/extensions/tensorizer.md index 830c579d91bae..cd94c81e620a2 100644 --- a/docs/source/models/extensions/tensorizer.md +++ b/docs/source/models/extensions/tensorizer.md @@ -9,7 +9,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor For more information on CoreWeave's Tensorizer, please refer to [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see -the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference/tensorize_vllm_model.html). +the [vLLM example script](https://docs.vllm.ai/en/latest/getting_started/examples/tensorize_vllm_model.html). 
:::{note} Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index c94e940b8534c..63fc53b0e7c55 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -23,6 +23,8 @@ It is similar to [its counterpart in HF Transformers](https://huggingface.co/doc except that tokenization and detokenization are also performed automatically. ```python +from vllm import LLM + llm = LLM(model="facebook/opt-125m") outputs = llm.generate("Hello, my name is") @@ -36,6 +38,8 @@ You can optionally control the language generation by passing {class}`~vllm.Samp For example, you can use greedy sampling by setting `temperature=0`: ```python +from vllm import LLM, SamplingParams + llm = LLM(model="facebook/opt-125m") params = SamplingParams(temperature=0) outputs = llm.generate("Hello, my name is", params) @@ -83,6 +87,8 @@ Base models may perform poorly as they are not trained to respond to the chat co ::: ```python +from vllm import LLM + llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") conversation = [ { diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index f774f3d0fa0ed..5f1c2b5b4a3ba 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -68,6 +68,8 @@ The {class}`~vllm.LLM.encode` method is available to all pooling models in vLLM. It returns the extracted hidden states directly, which is useful for reward models. ```python +from vllm import LLM + llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", task="reward") (output,) = llm.encode("Hello, my name is") @@ -81,6 +83,8 @@ The {class}`~vllm.LLM.embed` method outputs an embedding vector for each prompt. It is primarily designed for embedding models. 
```python +from vllm import LLM + llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed") (output,) = llm.embed("Hello, my name is") @@ -96,6 +100,8 @@ The {class}`~vllm.LLM.classify` method outputs a probability vector for each pro It is primarily designed for classification models. ```python +from vllm import LLM + llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify") (output,) = llm.classify("Hello, my name is") @@ -116,6 +122,8 @@ To handle RAG at a higher level, you should use integration frameworks such as [ ::: ```python +from vllm import LLM + llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score") (output,) = llm.score("What is the capital of France?", "The capital of Brazil is Brasilia.") @@ -133,3 +141,77 @@ Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints tha - [Pooling API](#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models. - [Embeddings API](#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](#multimodal-inputs) for embedding models. - [Score API](#score-api) is similar to `LLM.score` for cross-encoder models. + +## Matryoshka Embeddings + +[Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows user to trade off between performance and cost. + +:::{warning} +Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings. + +For example, setting `dimensions` parameter while using the `BAAI/bge-m3` model will result in the following error. 
+ +```json +{"object":"error","message":"Model \"BAAI/bge-m3\" does not support matryoshka representation, changing output dimensions will lead to poor results.","type":"BadRequestError","param":null,"code":400} +``` + +::: + +### Manually enable Matryoshka Embeddings + +There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, we simply check the existence of the fields `is_matryoshka` or `matryoshka_dimensions` inside `config.json`. + +For models that support Matryoshka Embeddings but not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}` (offline) or `--hf_overrides '{"is_matryoshka": true}'` (online). + +Here is an example to serve a model with Matryoshka Embeddings enabled. + +```text +vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf_overrides '{"is_matryoshka":true}' +``` + +### Offline Inference + +You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter in {class}`~vllm.PoolingParams`. + +```python +from vllm import LLM, PoolingParams + +model = LLM(model="jinaai/jina-embeddings-v3", + task="embed", + trust_remote_code=True) +outputs = model.embed(["Follow the white rabbit."], + pooling_params=PoolingParams(dimensions=32)) +print(outputs[0].outputs) +``` + +A code example can be found here: + +### Online Inference + +Use the following command to start vllm server. + +```text +vllm serve jinaai/jina-embeddings-v3 --trust-remote-code +``` + +You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter. 
+ +```text +curl http://127.0.0.1:8000/v1/embeddings \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "input": "Follow the white rabbit.", + "model": "jinaai/jina-embeddings-v3", + "encoding_format": "float", + "dimensions": 1 + }' +``` + +Expected output: + +```json +{"id":"embd-0aab28c384d348c3b8f0eb783109dc5f","object":"list","created":1744195454,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-1.0]}],"usage":{"prompt_tokens":10,"total_tokens":10,"completion_tokens":0,"prompt_tokens_details":null}} +``` + +An OpenAI client example can be found here: diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 56ea8c5d8372b..0fdffbeefd031 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -1,66 +1,39 @@ (supported-models)= -# List of Supported Models +# Supported Models -vLLM supports generative and pooling models across various tasks. +vLLM supports [generative](generative-models) and [pooling](pooling-models) models across various tasks. If a model supports more than one task, you can set the task via the `--task` argument. For each task, we list the model architectures that have been implemented in vLLM. Alongside each architecture, we include some popular models that use it. -## Loading a Model +## Model Implementation -### HuggingFace Hub +### vLLM -By default, vLLM loads models from [HuggingFace (HF) Hub](https://huggingface.co/models). +If vLLM natively supports a model, its implementation can be found in . -To determine whether a given model is natively supported, you can check the `config.json` file inside the HF repository. -If the `"architectures"` field contains a model architecture listed below, then it should be natively supported. +These models are what we list in and . -Models do not _need_ to be natively supported to be used in vLLM. 
-The enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!). +(transformers-backend)= -:::{tip} -The easiest way to check if your model is really supported at runtime is to run the program below: +### Transformers + +vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models are supported, and vision language model support is planned! + +To check if the modeling backend is Transformers, you can simply do this: ```python from vllm import LLM - -# For generative models (task=generate) only -llm = LLM(model=..., task="generate") # Name or path of your model -output = llm.generate("Hello, my name is") -print(output) - -# For pooling models (task={embed,classify,reward,score}) only -llm = LLM(model=..., task="embed") # Name or path of your model -output = llm.encode("Hello, my name is") -print(output) -``` - -If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. -::: - -Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM. -Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support. - -(transformers-fallback)= - -### Transformers fallback - -vLLM can fallback to model implementations that are available in Transformers. This does not work for all models for now, but most decoder language models are supported, and vision language model support is planned! - -To check if the backend is Transformers, you can simply do this: - -```python -from vllm import LLM llm = LLM(model=..., task="generate") # Name or path of your model llm.apply_model(lambda model: print(type(model))) ``` -If it is `TransformersModel` then it means it's based on Transformers! 
+If it is `TransformersForCausalLM` then it means it's based on Transformers! :::{tip} -You can force the use of `TransformersModel` by setting `model_impl="transformers"` for or `--model-impl transformers` for the . +You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for or `--model-impl transformers` for the . ::: :::{note} @@ -69,27 +42,30 @@ vLLM may not fully optimise the Transformers implementation so you may see degra #### Supported features -The Transformers fallback explicitly supports the following features: +The Transformers modeling backend explicitly supports the following features: - (except GGUF) - -- (requires `transformers>=4.49.0`) +- -#### Remote code +#### Remote Code -Earlier we mentioned that the Transformers fallback enables you to run remote code models directly in vLLM. -If you are interested in this feature, this section is for you! +If your model is neither supported natively by vLLM or Transformers, you can still run it in vLLM! Simply set `trust_remote_code=True` and vLLM will run any model on the Model Hub that is compatible with Transformers. Provided that the model writer implements their model in a compatible way, this means that you can run new models before they are officially supported in Transformers or vLLM! -```python +:::{tip} +If you have not yet created your custom model, you can follow this guide on [customising models in Transformers](https://huggingface.co/docs/transformers/en/custom_models). +::: + +```python from vllm import LLM llm = LLM(model=..., task="generate", trust_remote_code=True) # Name or path of your model llm.apply_model(lambda model: print(model.__class__)) ``` -To make your model compatible with the Transformers fallback, it needs: +To make your model compatible with the Transformers backend, it needs: ```{code-block} python :caption: modeling_my_model.py @@ -119,9 +95,11 @@ Here is what happens in the background: 1. The config is loaded 2. 
`MyModel` Python class is loaded from the `auto_map`, and we check that the model `_supports_attention_backend`. -3. The `TransformersModel` backend is used. See , which leverage `self.config._attn_implementation = "vllm"`, thus the need to use `ALL_ATTENTION_FUNCTION`. +3. The `TransformersForCausalLM` backend is used. See , which leverage `self.config._attn_implementation = "vllm"`, thus the need to use `ALL_ATTENTION_FUNCTION`. -To make your model compatible with tensor parallel, it needs: +That's it! + +For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class: ```{code-block} python :caption: configuration_my_model.py @@ -130,20 +108,94 @@ from transformers import PretrainedConfig class MyConfig(PretrainedConfig): base_model_tp_plan = { - "layers.*.self_attn.q_proj": "colwise", - ... + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), } ``` +- `base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported). 
+- `base_model_pp_plan` is a `dict` that maps direct child layer names to `tuple`s of `list`s of `str`s: + * You only need to do this for layers which are not present on all pipeline stages + * vLLM assumes that there will be only one `nn.ModuleList`, which is distributed across the pipeline stages + * The `list` in the first element of the `tuple` contains the names of the input arguments + * The `list` in the last element of the `tuple` contains the names of the variables the layer outputs to in your modeling code + +## Loading a Model + +### Hugging Face Hub + +By default, vLLM loads models from [Hugging Face (HF) Hub](https://huggingface.co/models). To change the download path for models, you can set the `HF_HOME` environment variable; for more details, refer to [their official documentation](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhome). + +To determine whether a given model is natively supported, you can check the `config.json` file inside the HF repository. +If the `"architectures"` field contains a model architecture listed below, then it should be natively supported. + +Models do not _need_ to be natively supported to be used in vLLM. +The [Transformers backend](#transformers-backend) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!). + :::{tip} -`base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported). 
+The easiest way to check if your model is really supported at runtime is to run the program below: + +```python +from vllm import LLM + +# For generative models (task=generate) only +llm = LLM(model=..., task="generate") # Name or path of your model +output = llm.generate("Hello, my name is") +print(output) + +# For pooling models (task={embed,classify,reward,score}) only +llm = LLM(model=..., task="embed") # Name or path of your model +output = llm.encode("Hello, my name is") +print(output) +``` + +If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. ::: -That's it! +Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM. +Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support. + +#### Using a proxy + +Here are some tips for loading/downloading models from Hugging Face using a proxy: + +- Set the proxy globally for your session (or set it in the profile file): + +```shell +export http_proxy=http://your.proxy.server:port +export https_proxy=http://your.proxy.server:port +``` + +- Set the proxy for just the current command: + +```shell +https_proxy=http://your.proxy.server:port huggingface-cli download + +# or use vllm cmd directly +https_proxy=http://your.proxy.server:port vllm serve --disable-log-requests +``` + +- Set the proxy in Python interpreter: + +```python +import os + +os.environ['http_proxy'] = 'http://your.proxy.server:port' +os.environ['https_proxy'] = 'http://your.proxy.server:port' +``` ### ModelScope -To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFace Hub, set an environment variable: +To use models from [ModelScope](https://www.modelscope.cn) instead of Hugging Face Hub, set an environment variable: ```shell export VLLM_USE_MODELSCOPE=True @@ -165,6 +217,8 @@ output = llm.encode("Hello, my name is") 
print(output) ``` +(supported-text-models)= + ## List of Text-only Language Models ### Generative Models @@ -197,6 +251,11 @@ See [this page](#generative-models) for more information on how to use generativ * `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. * ✅︎ * ✅︎ +- * `BambaForCausalLM` + * Bamba + * `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` + * + * - * `BloomForCausalLM` * BLOOM, BLOOMZ, BLOOMChat * `bigscience/bloom`, `bigscience/bloomz`, etc. @@ -207,9 +266,9 @@ See [this page](#generative-models) for more information on how to use generativ * `facebook/bart-base`, `facebook/bart-large-cnn`, etc. * * -- * `ChatGLMModel` +- * `ChatGLMModel`, `ChatGLMForConditionalGeneration` * ChatGLM - * `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc. + * `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. * ✅︎ * ✅︎ - * `CohereForCausalLM`, `Cohere2ForCausalLM` @@ -224,7 +283,7 @@ See [this page](#generative-models) for more information on how to use generativ * ✅︎ - * `DeciLMForCausalLM` * DeciLM - * `Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc. + * `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. * * ✅︎ - * `DeepseekForCausalLM` @@ -277,6 +336,11 @@ See [this page](#generative-models) for more information on how to use generativ * `THUDM/glm-4-9b-chat-hf`, etc. * ✅︎ * ✅︎ +- * `Glm4ForCausalLM` + * GLM-4-0414 + * `THUDM/GLM-4-32B-0414`, etc. + * ✅︎ + * ✅︎ - * `GPT2LMHeadModel` * GPT-2 * `gpt2`, `gpt2-xl`, etc. @@ -437,6 +501,11 @@ See [this page](#generative-models) for more information on how to use generativ * `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. * * ✅︎ +- * `Plamo2ForCausalLM` + * PLaMo2 + * `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. + * + * - * `QWenLMHeadModel` * Qwen * `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. @@ -452,6 +521,16 @@ See [this page](#generative-models) for more information on how to use generativ * `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. 
* * ✅︎ +- * `Qwen3ForCausalLM` + * Qwen3 + * `Qwen/Qwen3-8B`, etc. + * ✅︎ + * ✅︎ +- * `Qwen3MoeForCausalLM` + * Qwen3MoE + * `Qwen/Qwen3-MoE-15B-A2B`, etc. + * ✅︎ + * ✅︎ - * `StableLmForCausalLM` * StableLM * `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. @@ -482,6 +561,11 @@ See [this page](#generative-models) for more information on how to use generativ * `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. * ✅︎ * ✅︎ +- * `MiniMaxText01ForCausalLM` + * MiniMax-Text + * `MiniMaxAI/MiniMax-Text-01`, etc. + * + * ✅︎ - * `Zamba2ForCausalLM` * Zamba2 * `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. @@ -545,7 +629,7 @@ you should explicitly specify the task type to ensure that the model is used in * - * `XLMRobertaModel` * XLM-RoBERTa-based - * `intfloat/multilingual-e5-large`, etc. + * `intfloat/multilingual-e5-large`, `jinaai/jina-reranker-v2-base-multilingual`, etc. * * ::: @@ -660,6 +744,11 @@ If your model is not in the above list, we will try to automatically convert the * `BAAI/bge-reranker-v2-m3`, etc. * * +- * `ModernBertForSequenceClassification` + * ModernBert-based + * `Alibaba-NLP/gte-reranker-modernbert-base`, etc. + * + * ::: (supported-mm-models)= @@ -684,7 +773,7 @@ On the other hand, modalities separated by `/` are mutually exclusive. See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model. :::{important} -To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference) +**To enable multiple multi-modal items per text prompt in vLLM V0**, you have to set `limit_mm_per_prompt` (offline inference) or `--limit-mm-per-prompt` (online serving). 
For example, to enable passing up to 4 images per text prompt: Offline inference: @@ -699,9 +788,11 @@ llm = LLM( Online serving: ```bash -vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 +vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt '{"image":4}' ``` +**This is no longer required if you are using vLLM V1.** + ::: :::{note} @@ -732,6 +823,13 @@ See [this page](#generative-models) for more information on how to use generativ * * ✅︎ * ✅︎ +- * `AyaVisionForConditionalGeneration` + * Aya Vision + * T + I+ + * `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. + * + * ✅︎ + * ✅︎ - * `Blip2ForConditionalGeneration` * BLIP-2 * T + IE @@ -796,9 +894,23 @@ See [this page](#generative-models) for more information on how to use generativ * * ✅︎ - * `InternVLChatModel` - * InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 + * InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 * T + IE+ - * `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. + * `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. + * + * ✅︎ + * ✅︎ +- * `KimiVLForConditionalGeneration` + * Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking + * T + I+ + * `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` + * + * + * ✅︎ +- * `Llama4ForConditionalGeneration` + * Llama 4 + * T + I+ + * `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. * * ✅︎ * ✅︎ @@ -836,14 +948,21 @@ See [this page](#generative-models) for more information on how to use generativ * `openbmb/MiniCPM-o-2_6`, etc. * ✅︎ * ✅︎ - * + * ✅︎ - * `MiniCPMV` * MiniCPM-V * T + IE+ + VE+ * `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. 
* ✅︎ * ✅︎ + * ✅︎ +- * `Mistral3ForConditionalGeneration` + * Mistral3 + * T + I+ + * `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. * + * ✅︎ + * ✅︎ - * `MllamaForConditionalGeneration` * Llama 3.2 * T + I+ @@ -853,7 +972,7 @@ See [this page](#generative-models) for more information on how to use generativ * - * `MolmoForCausalLM` * Molmo - * T + I + * T + I+ * `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. * ✅︎ * ✅︎ @@ -885,7 +1004,7 @@ See [this page](#generative-models) for more information on how to use generativ * `microsoft/Phi-4-multimodal-instruct`, etc. * ✅︎ * - * + * ✅︎ - * `PixtralForConditionalGeneration` * Pixtral * T + I+ @@ -921,6 +1040,27 @@ See [this page](#generative-models) for more information on how to use generativ * ✅︎ * ✅︎ * ✅︎ +- * `Qwen2_5OmniThinkerForConditionalGeneration` + * Qwen2.5-Omni + * T + IE+ + VE+ + A+ + * `Qwen/Qwen2.5-Omni-7B` + * + * ✅︎ + * ✅︎\* +- * `SkyworkR1VChatModel` + * Skywork-R1V-38B + * T + I + * `Skywork/Skywork-R1V-38B` + * + * ✅︎ + * ✅︎ +- * `SmolVLMForConditionalGeneration` + * SmolVLM2 + * T + I + * `SmolVLM2-2.2B-Instruct` + * + * ✅︎ + * ✅︎ - * `UltravoxModel` * Ultravox * T + AE+ @@ -937,9 +1077,6 @@ See [this page](#generative-models) for more information on how to use generativ + Multiple items can be inputted per text prompt for this modality. :::{important} -To use Gemma3 series models, you have to install Hugging Face Transformers library from source via -`pip install git+https://github.com/huggingface/transformers`. - Pan-and-scan image pre-processing is currently supported on V0 (but not V1). You can enable it by passing `--mm-processor-kwargs '{"do_pan_and_scan": True}'`. ::: @@ -979,6 +1116,14 @@ For more details, please see: Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1. 
::: +:::{note} +To use Qwen2.5-Omni, you have to install Hugging Face Transformers library from source via +`pip install git+https://github.com/huggingface/transformers.git`. + +Read audio from video pre-processing is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1. +`--mm-processor-kwargs '{"use_audio_in_video": True}'`. +::: + ### Pooling Models See [this page](pooling-models) for more information on how to use pooling models. @@ -1059,7 +1204,7 @@ At vLLM, we are committed to facilitating the integration and support of third-p 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. :::{tip} - When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. 
+ When comparing the output of `model.generate` from Hugging Face Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. ::: 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. @@ -1076,5 +1221,5 @@ We have the following levels of testing for models: 1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to [models tests](https://github.com/vllm-project/vllm/blob/main/tests/models) for the models that have passed this test. 2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test. -3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](gh-dir:tests) and [examples](gh-dir:main/examples) for the models that have passed this test. +3. **Runtime Functionality**: We check if the model can be loaded and run without errors. 
This is the least stringent test. Please refer to [functionality tests](gh-dir:tests) and [examples](gh-dir:examples) for the models that have passed this test. 4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category. diff --git a/docs/source/performance/optimization.md b/docs/source/performance/optimization.md index 5b0f8421a51eb..ccbe8a367061f 100644 --- a/docs/source/performance/optimization.md +++ b/docs/source/performance/optimization.md @@ -31,6 +31,8 @@ vLLM supports an experimental feature chunked prefill. Chunked prefill allows to You can enable the feature by specifying `--enable-chunked-prefill` in the command line or setting `enable_chunked_prefill=True` in the LLM constructor. ```python +from vllm import LLM + llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True) # Set max_num_batched_tokens to tune performance. # NOTE: 2048 is the default max_num_batched_tokens for chunked prefill. 
diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md index 2e2016c95e4fc..d9a093e8d145d 100644 --- a/docs/source/serving/multimodal_inputs.md +++ b/docs/source/serving/multimodal_inputs.md @@ -21,6 +21,8 @@ To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType` You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples: ```python +from vllm import LLM + llm = LLM(model="llava-hf/llava-1.5-7b-hf") # Refer to the HuggingFace repo for the correct format to use @@ -65,6 +67,8 @@ Full example: To substitute multiple images inside the same text prompt, you can pass in a list of images instead: ```python +from vllm import LLM + llm = LLM( model="microsoft/Phi-3.5-vision-instruct", trust_remote_code=True, # Required to load Phi-3.5-vision @@ -96,6 +100,8 @@ Full example: . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. +::: + #### Quantization Quantized models take less memory at the cost of lower precision. @@ -92,11 +106,46 @@ You can further reduce memory usage by limiting the context length of the model and the maximum batch size (`max_num_seqs` option). ```python +from vllm import LLM + llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2) ``` +#### Reduce CUDA Graphs + +By default, we optimize model inference using CUDA graphs which take up extra memory in the GPU. + +:::{important} +CUDA graph capture takes up more memory in V1 than in V0. 
+::: + +You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage: + +```python +from vllm import LLM +from vllm.config import CompilationConfig, CompilationLevel + +llm = LLM( + model="meta-llama/Llama-3.1-8B-Instruct", + compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + # By default, it goes up to max_num_seqs + cudagraph_capture_sizes=[1, 2, 4, 8, 16], + ), +) +``` + +You can disable graph capturing completely via the `enforce_eager` flag: + +```python +from vllm import LLM + +llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", + enforce_eager=True) +``` + #### Adjust cache size If you run out of CPU RAM, try the following options: @@ -104,6 +153,62 @@ If you run out of CPU RAM, try the following options: - (Multi-modal models only) you can set the size of multi-modal input cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB). - (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB). +#### Multi-modal input limits + +You can allow a smaller number of multi-modal items per prompt to reduce the memory footprint of the model: + +```python +from vllm import LLM + +# Accept up to 3 images and 1 video per prompt +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + limit_mm_per_prompt={"image": 3, "video": 1}) +``` + +You can go a step further and disable unused modalities completely by setting its limit to zero. +For example, if your application only accepts image input, there is no need to allocate any memory for videos. + +```python +from vllm import LLM + +# Accept any number of images but no videos +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + limit_mm_per_prompt={"video": 0}) +``` + +You can even run a multi-modal model for text-only inference: + +```python +from vllm import LLM + +# Don't accept images. Just text. 
+llm = LLM(model="google/gemma-3-27b-it", + limit_mm_per_prompt={"image": 0}) +``` + +#### Multi-modal processor arguments + +For certain models, you can adjust the multi-modal processor arguments to +reduce the size of the processed multi-modal inputs, which in turn saves memory. + +Here are some examples: + +```python +from vllm import LLM + +# Available for Qwen2-VL series models +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + mm_processor_kwargs={ + "max_pixels": 768 * 768, # Default is 1280 * 28 * 28 + }) + +# Available for InternVL series models +llm = LLM(model="OpenGVLab/InternVL2-2B", + mm_processor_kwargs={ + "max_dynamic_patch": 4, # Default is 12 + }) +``` + ### Performance optimization and tuning You can potentially improve the performance of vLLM by finetuning various options. diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 1cebff7e1f6e2..34382c87a484b 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -2,15 +2,15 @@ # OpenAI-Compatible Server -vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more! +vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more! This functionality lets you serve models and interact with them using an HTTP client. -You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](#deployment-docker): +In your terminal, you can [install](../getting_started/installation.md) vLLM, then start the server with the [`vllm serve`](#vllm-serve) command. (You can also use our [Docker](#deployment-docker) image.) 
```bash vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 ``` -To call the server, you can use the [official OpenAI Python client](https://github.com/openai/openai-python), or any other HTTP client. +To call the server, in your preferred text editor, create a script that uses an HTTP client. Include any messages that you want to send to the model. Then run that script. Below is an example script using the [official OpenAI Python client](https://github.com/openai/openai-python). ```python from openai import OpenAI @@ -33,11 +33,13 @@ print(completion.choices[0].message) vLLM supports some parameters that are not supported by OpenAI, `top_k` for example. You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, i.e. `extra_body={"top_k": 50}` for `top_k`. ::: + :::{important} By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator. To disable this behavior, please pass `--generation-config vllm` when launching the server. ::: + ## Supported APIs We currently support the following OpenAI APIs: @@ -172,6 +174,12 @@ print(completion._request_id) The `vllm serve` command is used to launch the OpenAI-compatible server. +:::{tip} +The vast majority of command-line arguments are based on those for offline inference. + +See [here](configuration-options) for some common options. 
+::: + :::{argparse} :module: vllm.entrypoints.openai.cli_args :func: create_parser_for_docs @@ -188,6 +196,7 @@ For example: ```yaml # config.yaml +model: meta-llama/Llama-3.1-8B-Instruct host: "127.0.0.1" port: 6379 uvicorn-log-level: "info" @@ -196,12 +205,13 @@ uvicorn-log-level: "info" To use the above config file: ```bash -vllm serve SOME_MODEL --config config.yaml +vllm serve --config config.yaml ``` :::{note} In case an argument is supplied simultaneously using command line and the config file, the value from the command line will take precedence. The order of priorities is `command line > config file values > defaults`. +e.g. `vllm serve SOME_MODEL --config config.yaml`, SOME_MODEL takes precedence over `model` in config file. ::: ## API Reference @@ -392,9 +402,26 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai To use the Transcriptions API, please install with extra audio dependencies using `pip install vllm[audio]`. ::: +Code example: -Code example: +#### Extra Parameters + +The following [sampling parameters](#sampling-params) are supported. + +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-transcription-sampling-params +:end-before: end-transcription-sampling-params +::: + +The following extra parameters are supported: + +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-transcription-extra-params +:end-before: end-transcription-extra-params +::: (tokenizer-api)= diff --git a/docs/source/serving/usage_stats.md b/docs/source/serving/usage_stats.md index cfc3cb2576873..750cba7ed9ce2 100644 --- a/docs/source/serving/usage_stats.md +++ b/docs/source/serving/usage_stats.md @@ -1,6 +1,8 @@ # Usage Stats Collection -vLLM collects anonymous usage data by default to help the engineering team better understand which hardware and model configurations are widely used. 
This data allows them to prioritize their efforts on the most common workloads. The collected data is transparent, does not contain any sensitive information, and will be publicly released for the community's benefit. +vLLM collects anonymous usage data by default to help the engineering team better understand which hardware and model configurations are widely used. This data allows them to prioritize their efforts on the most common workloads. The collected data is transparent, does not contain any sensitive information. + +A subset of the data, after cleaning and aggregation, will be publicly released for the community's benefit. For example, you can see the 2024 usage report [here](https://2024.vllm.ai). ## What data is collected? diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index 840892ea07010..e3c75d5cb6a96 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -47,7 +47,7 @@ def run_minicpmo(question: str, audio_count: int) -> ModelRequestData: model=model_name, trust_remote_code=True, max_model_len=4096, - max_num_seqs=5, + max_num_seqs=2, limit_mm_per_prompt={"audio": audio_count}, ) @@ -89,7 +89,7 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData: engine_args = EngineArgs( model=model_path, trust_remote_code=True, - max_model_len=4096, + max_model_len=12800, max_num_seqs=2, enable_lora=True, max_lora_rank=320, @@ -130,6 +130,36 @@ def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData: ) +# Qwen2.5-Omni +def run_qwen2_5_omni(question: str, audio_count: int): + model_name = "Qwen/Qwen2.5-Omni-7B" + + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=5, + limit_mm_per_prompt={"audio": audio_count}, + ) + + audio_in_prompt = "".join([ + "<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count) + ]) + + default_system = ( + "You are Qwen, a virtual human 
developed by the Qwen Team, Alibaba " + "Group, capable of perceiving auditory and visual inputs, as well as " + "generating text and speech.") + + prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n" + f"{audio_in_prompt}{question}<|im_end|>\n" + "<|im_start|>assistant\n") + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + ) + + # Ultravox 0.5-1B def run_ultravox(question: str, audio_count: int) -> ModelRequestData: model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b" @@ -182,59 +212,13 @@ model_example_map = { "minicpmo": run_minicpmo, "phi4_mm": run_phi4mm, "qwen2_audio": run_qwen2_audio, + "qwen2_5_omni": run_qwen2_5_omni, "ultravox": run_ultravox, "whisper": run_whisper, } -def main(args): - model = args.model_type - if model not in model_example_map: - raise ValueError(f"Model type {model} is not supported.") - - audio_count = args.num_audios - req_data = model_example_map[model](question_per_audio_count[audio_count], - audio_count) - - engine_args = asdict(req_data.engine_args) | {"seed": args.seed} - llm = LLM(**engine_args) - - # To maintain code compatibility in this script, we add LoRA here. - # You can also add LoRA using: - # llm.generate(prompts, lora_request=lora_request,...) - if req_data.lora_requests: - for lora_request in req_data.lora_requests: - llm.llm_engine.add_lora(lora_request=lora_request) - - # We set temperature to 0.2 so that outputs can be different - # even when all prompts are identical when running batch inference. 
- sampling_params = SamplingParams(temperature=0.2, - max_tokens=64, - stop_token_ids=req_data.stop_token_ids) - - mm_data = {} - if audio_count > 0: - mm_data = { - "audio": [ - asset.audio_and_sample_rate - for asset in audio_assets[:audio_count] - ] - } - - assert args.num_prompts > 0 - inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data} - if args.num_prompts > 1: - # Batch inference - inputs = [inputs] * args.num_prompts - - outputs = llm.generate(inputs, sampling_params=sampling_params) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -if __name__ == "__main__": +def parse_args(): parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' 'audio language models') @@ -258,5 +242,61 @@ if __name__ == "__main__": default=None, help="Set the seed when initializing `vllm.LLM`.") - args = parser.parse_args() + return parser.parse_args() + + +def main(args): + model = args.model_type + if model not in model_example_map: + raise ValueError(f"Model type {model} is not supported.") + + audio_count = args.num_audios + req_data = model_example_map[model](question_per_audio_count[audio_count], + audio_count) + + # Disable other modalities to save memory + default_limits = {"image": 0, "video": 0, "audio": 0} + req_data.engine_args.limit_mm_per_prompt = default_limits | dict( + req_data.engine_args.limit_mm_per_prompt or {}) + + engine_args = asdict(req_data.engine_args) | {"seed": args.seed} + llm = LLM(**engine_args) + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. 
+ sampling_params = SamplingParams(temperature=0.2, + max_tokens=64, + stop_token_ids=req_data.stop_token_ids) + + mm_data = {} + if audio_count > 0: + mm_data = { + "audio": [ + asset.audio_and_sample_rate + for asset in audio_assets[:audio_count] + ] + } + + assert args.num_prompts > 0 + inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data} + if args.num_prompts > 1: + # Batch inference + inputs = [inputs] * args.num_prompts + # Add LoRA request if applicable + lora_request = (req_data.lora_requests * + args.num_prompts if req_data.lora_requests else None) + + outputs = llm.generate( + inputs, + sampling_params=sampling_params, + lora_request=lora_request, + ) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +if __name__ == "__main__": + args = parse_args() main(args) diff --git a/examples/offline_inference/basic/basic.py b/examples/offline_inference/basic/basic.py index a6e96c0bb4339..ae5ae7cb48346 100644 --- a/examples/offline_inference/basic/basic.py +++ b/examples/offline_inference/basic/basic.py @@ -12,13 +12,23 @@ prompts = [ # Create a sampling params object. sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -# Create an LLM. -llm = LLM(model="facebook/opt-125m") -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") \ No newline at end of file + +def main(): + # Create an LLM. + llm = LLM(model="facebook/opt-125m") + # Generate texts from the prompts. + # The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. 
+ print("\nGenerated Outputs:\n" + "-" * 60) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}") + print(f"Output: {generated_text!r}") + print("-" * 60) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/basic/chat.py b/examples/offline_inference/basic/chat.py index b2523e533a40a..6857c6e9e31df 100644 --- a/examples/offline_inference/basic/chat.py +++ b/examples/offline_inference/basic/chat.py @@ -4,6 +4,24 @@ from vllm import LLM, EngineArgs from vllm.utils import FlexibleArgumentParser +def create_parser(): + parser = FlexibleArgumentParser() + # Add engine args + engine_group = parser.add_argument_group("Engine arguments") + EngineArgs.add_cli_args(engine_group) + engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct") + # Add sampling params + sampling_group = parser.add_argument_group("Sampling parameters") + sampling_group.add_argument("--max-tokens", type=int) + sampling_group.add_argument("--temperature", type=float) + sampling_group.add_argument("--top-p", type=float) + sampling_group.add_argument("--top-k", type=int) + # Add example params + parser.add_argument("--chat-template-path", type=str) + + return parser + + def main(args: dict): # Pop arguments not used by LLM max_tokens = args.pop("max_tokens") @@ -27,12 +45,13 @@ def main(args: dict): sampling_params.top_k = top_k def print_outputs(outputs): + print("\nGenerated Outputs:\n" + "-" * 80) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}") + print(f"Prompt: {prompt!r}\n") print(f"Generated text: {generated_text!r}") - print("-" * 80) + print("-" * 80) print("=" * 80) @@ -81,18 +100,6 @@ def main(args: dict): if __name__ == "__main__": - parser = FlexibleArgumentParser() - # Add engine args - engine_group = parser.add_argument_group("Engine arguments") - EngineArgs.add_cli_args(engine_group) - 
engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct") - # Add sampling params - sampling_group = parser.add_argument_group("Sampling parameters") - sampling_group.add_argument("--max-tokens", type=int) - sampling_group.add_argument("--temperature", type=float) - sampling_group.add_argument("--top-p", type=float) - sampling_group.add_argument("--top-k", type=int) - # Add example params - parser.add_argument("--chat-template-path", type=str) + parser = create_parser() args: dict = vars(parser.parse_args()) main(args) diff --git a/examples/offline_inference/basic/classify.py b/examples/offline_inference/basic/classify.py index 4ef949b4784de..5b6dcb41eee1c 100644 --- a/examples/offline_inference/basic/classify.py +++ b/examples/offline_inference/basic/classify.py @@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs from vllm.utils import FlexibleArgumentParser +def parse_args(): + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + # Set example specific arguments + parser.set_defaults(model="jason9693/Qwen2.5-1.5B-apeach", + task="classify", + enforce_eager=True) + return parser.parse_args() + + def main(args: Namespace): # Sample prompts. prompts = [ @@ -23,20 +33,16 @@ def main(args: Namespace): outputs = model.classify(prompts) # Print the outputs. 
+ print("\nGenerated Outputs:\n" + "-" * 60) for prompt, output in zip(prompts, outputs): probs = output.outputs.probs probs_trimmed = ((str(probs[:16])[:-1] + ", ...]") if len(probs) > 16 else probs) - print(f"Prompt: {prompt!r} | " + print(f"Prompt: {prompt!r} \n" f"Class Probabilities: {probs_trimmed} (size={len(probs)})") + print("-" * 60) if __name__ == "__main__": - parser = FlexibleArgumentParser() - parser = EngineArgs.add_cli_args(parser) - # Set example specific arguments - parser.set_defaults(model="jason9693/Qwen2.5-1.5B-apeach", - task="classify", - enforce_eager=True) - args = parser.parse_args() + args = parse_args() main(args) diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py index f1655b6dbe111..cb5f923ffb697 100644 --- a/examples/offline_inference/basic/embed.py +++ b/examples/offline_inference/basic/embed.py @@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs from vllm.utils import FlexibleArgumentParser +def parse_args(): + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + # Set example specific arguments + parser.set_defaults(model="intfloat/e5-mistral-7b-instruct", + task="embed", + enforce_eager=True) + return parser.parse_args() + + def main(args: Namespace): # Sample prompts. prompts = [ @@ -23,20 +33,16 @@ def main(args: Namespace): outputs = model.embed(prompts) # Print the outputs. 
+ print("\nGenerated Outputs:\n" + "-" * 60) for prompt, output in zip(prompts, outputs): embeds = output.outputs.embedding embeds_trimmed = ((str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds) - print(f"Prompt: {prompt!r} | " + print(f"Prompt: {prompt!r} \n" f"Embeddings: {embeds_trimmed} (size={len(embeds)})") + print("-" * 60) if __name__ == "__main__": - parser = FlexibleArgumentParser() - parser = EngineArgs.add_cli_args(parser) - # Set example specific arguments - parser.set_defaults(model="intfloat/e5-mistral-7b-instruct", - task="embed", - enforce_eager=True) - args = parser.parse_args() + args = parse_args() main(args) diff --git a/examples/offline_inference/basic/generate.py b/examples/offline_inference/basic/generate.py index 93f4f2a36fac6..54b52b22a45a9 100644 --- a/examples/offline_inference/basic/generate.py +++ b/examples/offline_inference/basic/generate.py @@ -4,6 +4,22 @@ from vllm import LLM, EngineArgs from vllm.utils import FlexibleArgumentParser +def create_parser(): + parser = FlexibleArgumentParser() + # Add engine args + engine_group = parser.add_argument_group("Engine arguments") + EngineArgs.add_cli_args(engine_group) + engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct") + # Add sampling params + sampling_group = parser.add_argument_group("Sampling parameters") + sampling_group.add_argument("--max-tokens", type=int) + sampling_group.add_argument("--temperature", type=float) + sampling_group.add_argument("--top-p", type=float) + sampling_group.add_argument("--top-k", type=int) + + return parser + + def main(args: dict): # Pop arguments not used by LLM max_tokens = args.pop("max_tokens") @@ -35,23 +51,15 @@ def main(args: dict): ] outputs = llm.generate(prompts, sampling_params) # Print the outputs. 
+ print("-" * 50) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) if __name__ == "__main__": - parser = FlexibleArgumentParser() - # Add engine args - engine_group = parser.add_argument_group("Engine arguments") - EngineArgs.add_cli_args(engine_group) - engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct") - # Add sampling params - sampling_group = parser.add_argument_group("Sampling parameters") - sampling_group.add_argument("--max-tokens", type=int) - sampling_group.add_argument("--temperature", type=float) - sampling_group.add_argument("--top-p", type=float) - sampling_group.add_argument("--top-k", type=int) + parser = create_parser() args: dict = vars(parser.parse_args()) main(args) diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py index 2d21f1f0e3971..d2bda8b3180c3 100644 --- a/examples/offline_inference/basic/score.py +++ b/examples/offline_inference/basic/score.py @@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs from vllm.utils import FlexibleArgumentParser +def parse_args(): + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + # Set example specific arguments + parser.set_defaults(model="BAAI/bge-reranker-v2-m3", + task="score", + enforce_eager=True) + return parser.parse_args() + + def main(args: Namespace): # Sample prompts. text_1 = "What is the capital of France?" @@ -22,17 +32,13 @@ def main(args: Namespace): outputs = model.score(text_1, texts_2) # Print the outputs. 
+ print("\nGenerated Outputs:\n" + "-" * 60) for text_2, output in zip(texts_2, outputs): score = output.outputs.score - print(f"Pair: {[text_1, text_2]!r} | Score: {score}") + print(f"Pair: {[text_1, text_2]!r} \nScore: {score}") + print("-" * 60) if __name__ == "__main__": - parser = FlexibleArgumentParser() - parser = EngineArgs.add_cli_args(parser) - # Set example specific arguments - parser.set_defaults(model="BAAI/bge-reranker-v2-m3", - task="score", - enforce_eager=True) - args = parser.parse_args() + args = parse_args() main(args) diff --git a/examples/offline_inference/batch_llm_inference.py b/examples/offline_inference/batch_llm_inference.py new file mode 100644 index 0000000000000..6548857b6d111 --- /dev/null +++ b/examples/offline_inference/batch_llm_inference.py @@ -0,0 +1,90 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +This example shows how to use Ray Data for data parallel batch inference. + +Ray Data is a data processing framework that can handle large datasets +and integrates tightly with vLLM for data-parallel inference. + +As of Ray 2.44, Ray Data has a native integration with +vLLM (under ray.data.llm). + +Ray Data provides functionality for: +* Reading and writing to cloud storage (S3, GCS, etc.) +* Automatic sharding and load-balancing across a cluster +* Optimized configuration of vLLM using continuous batching +* Compatible with tensor/pipeline parallel inference as well. + +Learn more about Ray Data's LLM integration: +https://docs.ray.io/en/latest/data/working-with-llms.html +""" +import ray +from packaging.version import Version +from ray.data.llm import build_llm_processor, vLLMEngineProcessorConfig + +assert Version(ray.__version__) >= Version( + "2.44.1"), "Ray version must be at least 2.44.1" + +# Uncomment to reduce clutter in stdout +# ray.init(log_to_driver=False) +# ray.data.DataContext.get_current().enable_progress_bars = False + +# Read one text file from S3. 
Ray Data supports reading multiple files +# from cloud storage (such as JSONL, Parquet, CSV, binary format). +ds = ray.data.read_text("s3://anonymous@air-example-data/prompts.txt") +print(ds.schema()) + +size = ds.count() +print(f"Size of dataset: {size} prompts") + +# Configure vLLM engine. +config = vLLMEngineProcessorConfig( + model_source="unsloth/Llama-3.1-8B-Instruct", + engine_kwargs={ + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4096, + "max_model_len": 16384, + }, + concurrency=1, # set the number of parallel vLLM replicas + batch_size=64, +) + +# Create a Processor object, which will be used to +# do batch inference on the dataset +vllm_processor = build_llm_processor( + config, + preprocess=lambda row: dict( + messages=[{ + "role": "system", + "content": "You are a bot that responds with haikus." + }, { + "role": "user", + "content": row["text"] + }], + sampling_params=dict( + temperature=0.3, + max_tokens=250, + )), + postprocess=lambda row: dict( + answer=row["generated_text"], + **row # This will return all the original columns in the dataset. + ), +) + +ds = vllm_processor(ds) + +# Peek first 10 results. +# NOTE: This is for local testing and debugging. For production use case, +# one should write full result out as shown below. +outputs = ds.take(limit=10) + +for output in outputs: + prompt = output["prompt"] + generated_text = output["generated_text"] + print(f"Prompt: {prompt!r}") + print(f"Generated text: {generated_text!r}") + +# Write inference output data out as Parquet files to S3. +# Multiple files would be written to the output destination, +# and each task would write one or more files separately. 
+# +# ds.write_parquet("s3://") diff --git a/examples/offline_inference/cpu_offload_lmcache.py b/examples/offline_inference/cpu_offload_lmcache.py index 8211629b24ecc..37aea281032fd 100644 --- a/examples/offline_inference/cpu_offload_lmcache.py +++ b/examples/offline_inference/cpu_offload_lmcache.py @@ -3,9 +3,12 @@ This file demonstrates the example usage of cpu offloading with LMCache. -Note that `pip install lmcache` is needed to run this example. -Learn more about LMCache in https://github.com/LMCache/LMCache. +Note that `lmcache` is needed to run this example. +Requirements: Linux, Python: 3.10 or higher, CUDA: 12.1 +Learn more about LMCache environment setup, please refer to: +https://docs.lmcache.ai/getting_started/installation.html """ +import contextlib import os import time @@ -15,51 +18,83 @@ from lmcache.integration.vllm.utils import ENGINE_NAME from vllm import LLM, SamplingParams from vllm.config import KVTransferConfig -# LMCache-related environment variables -# Use experimental features in LMCache -os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True" -# LMCache is set to use 256 tokens per chunk -os.environ["LMCACHE_CHUNK_SIZE"] = "256" -# Enable local CPU backend in LMCache -os.environ["LMCACHE_LOCAL_CPU"] = "True" -# Set local CPU memory limit to 5.0 GB -os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0" -# This example script runs two requests with a shared prefix. -shared_prompt = "Hello, how are you?" 
* 1000 -first_prompt = [ - shared_prompt + "Hello, my name is", -] -second_prompt = [ - shared_prompt + "Tell me a very long story", -] +def setup_environment_variables(): + # LMCache-related environment variables + # Use experimental features in LMCache + os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True" + # LMCache is set to use 256 tokens per chunk + os.environ["LMCACHE_CHUNK_SIZE"] = "256" + # Enable local CPU backend in LMCache + os.environ["LMCACHE_LOCAL_CPU"] = "True" + # Set local CPU memory limit to 5.0 GB + os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0" -sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) -ktc = KVTransferConfig.from_cli( - '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}') -# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB -# memory. Reduce the value if your GPU has less memory. -# Note that LMCache is not compatible with chunked prefill for now. -llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2", - kv_transfer_config=ktc, - max_model_len=8000, - enable_chunked_prefill=False, - gpu_memory_utilization=0.8) +@contextlib.contextmanager +def build_llm_with_lmcache(): + ktc = KVTransferConfig.from_cli( + '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}') + # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB + # memory. Reduce the value if your GPU has less memory. + # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392). 
+ llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2", + kv_transfer_config=ktc, + max_model_len=8000, + enable_chunked_prefill=True, + gpu_memory_utilization=0.8) -outputs = llm.generate(first_prompt, sampling_params) -for output in outputs: - generated_text = output.outputs[0].text - print(f"Generated text: {generated_text!r}") -print("First request done.") + try: + yield llm + finally: + # Clean up lmcache backend + LMCacheEngineBuilder.destroy(ENGINE_NAME) -time.sleep(1) -outputs = llm.generate(second_prompt, sampling_params) -for output in outputs: - generated_text = output.outputs[0].text - print(f"Generated text: {generated_text!r}") -print("Second request done.") +def print_output( + llm: LLM, + prompt: list[str], + sampling_params: SamplingParams, + req_str: str, +): + start = time.time() + outputs = llm.generate(prompt, sampling_params) + print("-" * 50) + for output in outputs: + generated_text = output.outputs[0].text + print(f"Generated text: {generated_text!r}") + print(f"Generation took {time.time() - start:.2f} seconds, " + f"{req_str} request done.") + print("-" * 50) -# Clean up lmcache backend -LMCacheEngineBuilder.destroy(ENGINE_NAME) + +def main(): + setup_environment_variables() + + with build_llm_with_lmcache() as llm: + + # This example script runs two requests with a shared prefix. + # Define the shared prompt and specific prompts + shared_prompt = "Hello, how are you?" 
* 1000 + first_prompt = [ + shared_prompt + "Hello, my name is", + ] + second_prompt = [ + shared_prompt + "Tell me a very long story", + ] + + sampling_params = SamplingParams(temperature=0, + top_p=0.95, + max_tokens=10) + + # Print the first output + print_output(llm, first_prompt, sampling_params, "first") + + time.sleep(1) + + # print the second output + print_output(llm, second_prompt, sampling_params, "second") + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index b73770ce382cf..965915beaf58f 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -1,26 +1,83 @@ # SPDX-License-Identifier: Apache-2.0 -# usage: -# VLLM_USE_V1=1 python examples/offline_inference/data_parallel.py -# we need to have a launcher to create multiple data parallel -# ranks. And each rank will create a vLLM instance to process its own prompts. +""" +Usage: +Single node: + python examples/offline_inference/data_parallel.py \ + --model="ibm-research/PowerMoE-3b" \ + --dp-size=2 \ + --tp-size=2 + +Multi-node: + Node 0 (assume the node has ip of 10.99.48.128): + python examples/offline_inference/data_parallel.py \ + --model="ibm-research/PowerMoE-3b" \ + --dp-size=2 \ + --tp-size=2 \ + --node-size=2 \ + --node-rank=0 \ + --master-addr=10.99.48.128 \ + --master-port=13345 + Node 1: + python examples/offline_inference/data_parallel.py \ + --model="ibm-research/PowerMoE-3b" \ + --dp-size=2 \ + --tp-size=2 \ + --node-size=2 \ + --node-rank=1 \ + --master-addr=10.99.48.128 \ + --master-port=13345 +""" import os +from time import sleep from vllm import LLM, SamplingParams from vllm.utils import get_open_port -GPUs_per_dp_rank = 2 -DP_size = 2 + +def parse_args(): + import argparse + parser = argparse.ArgumentParser(description="Data Parallel Inference") + parser.add_argument("--model", + type=str, + default="ibm-research/PowerMoE-3b", + help="Model 
name or path") + parser.add_argument("--dp-size", + type=int, + default=2, + help="Data parallel size") + parser.add_argument("--tp-size", + type=int, + default=2, + help="Tensor parallel size") + parser.add_argument("--node-size", + type=int, + default=1, + help="Total number of nodes") + parser.add_argument("--node-rank", + type=int, + default=0, + help="Rank of the current node") + parser.add_argument("--master-addr", + type=str, + default="", + help="Master node IP address") + parser.add_argument("--master-port", + type=int, + default=0, + help="Master node port") + return parser.parse_args() -def main(dp_size, dp_rank, dp_master_ip, dp_master_port, GPUs_per_dp_rank): - os.environ["VLLM_DP_RANK"] = str(dp_rank) +def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, + dp_master_port, GPUs_per_dp_rank): + os.environ["VLLM_DP_RANK"] = str(global_dp_rank) + os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank) os.environ["VLLM_DP_SIZE"] = str(dp_size) os.environ["VLLM_DP_MASTER_IP"] = dp_master_ip os.environ["VLLM_DP_MASTER_PORT"] = str(dp_master_port) - # set devices for each dp_rank - os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( - str(i) for i in range(dp_rank * GPUs_per_dp_rank, (dp_rank + 1) * - GPUs_per_dp_rank)) + + # CUDA_VISIBLE_DEVICES for each DP rank is set automatically inside the + # engine processes. # Sample prompts. prompts = [ @@ -28,20 +85,20 @@ def main(dp_size, dp_rank, dp_master_ip, dp_master_port, GPUs_per_dp_rank): "The president of the United States is", "The capital of France is", "The future of AI is", - ] + ] * 100 # with DP, each rank should process different prompts. # usually all the DP ranks process a full dataset, # and each rank processes a different part of the dataset. 
promts_per_rank = len(prompts) // dp_size - start = dp_rank * promts_per_rank + start = global_dp_rank * promts_per_rank end = start + promts_per_rank prompts = prompts[start:end] if len(prompts) == 0: # if any rank has no prompts to process, # we need to set a placeholder prompt prompts = ["Placeholder"] - print(f"DP rank {dp_rank} needs to process {len(prompts)} prompts") + print(f"DP rank {global_dp_rank} needs to process {len(prompts)} prompts") # Create a sampling params object. # since we are doing data parallel, every rank can have different @@ -49,37 +106,67 @@ def main(dp_size, dp_rank, dp_master_ip, dp_master_port, GPUs_per_dp_rank): # ranks for demonstration. sampling_params = SamplingParams(temperature=0.8, top_p=0.95, - max_tokens=16 * (dp_rank + 1)) + max_tokens=[16, 20][global_dp_rank % 2]) # Create an LLM. - llm = LLM(model="ibm-research/PowerMoE-3b", + llm = LLM(model=model, tensor_parallel_size=GPUs_per_dp_rank, enforce_eager=True, enable_expert_parallel=True) outputs = llm.generate(prompts, sampling_params) # Print the outputs. - for output in outputs: + for i, output in enumerate(outputs): + if i >= 5: + # print only 5 outputs + break prompt = output.prompt generated_text = output.outputs[0].text - print(f"DP rank {dp_rank}, Prompt: {prompt!r}, " + print(f"DP rank {global_dp_rank}, Prompt: {prompt!r}, " f"Generated text: {generated_text!r}") + # Give engines time to pause their processing loops before exiting. 
+ sleep(1) + if __name__ == "__main__": + + args = parse_args() + + dp_size = args.dp_size + tp_size = args.tp_size + node_size = args.node_size + node_rank = args.node_rank + + if node_size == 1: + dp_master_ip = "127.0.0.1" + dp_master_port = get_open_port() + else: + dp_master_ip = args.master_addr + dp_master_port = args.master_port + + assert dp_size % node_size == 0, "dp_size should be divisible by node_size" + dp_per_node = dp_size // node_size + from multiprocessing import Process - dp_master_ip = "127.0.0.1" - dp_master_port = get_open_port() + procs = [] - for i in range(DP_size): + for local_dp_rank, global_dp_rank in enumerate( + range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node)): proc = Process(target=main, - args=(DP_size, i, dp_master_ip, dp_master_port, - GPUs_per_dp_rank)) + args=(args.model, dp_size, local_dp_rank, + global_dp_rank, dp_master_ip, dp_master_port, + tp_size)) proc.start() procs.append(proc) exit_code = 0 for proc in procs: - proc.join() - if proc.exitcode: + proc.join(timeout=300) + if proc.exitcode is None: + print(f"Killing process {proc.pid} that " + f"didn't stop within 5 minutes.") + proc.kill() + exit_code = 1 + elif proc.exitcode: exit_code = proc.exitcode exit(exit_code) diff --git a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py new file mode 100644 index 0000000000000..66efbc0c9deec --- /dev/null +++ b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig + +# Read prompts from output.txt +prompts = [] +try: + with open("output.txt") as f: + for line in f: + prompts.append(line.strip()) + print(f"Loaded {len(prompts)} prompts from output.txt") +except FileNotFoundError: + print("Error: output.txt file not found") + exit(-1) + +sampling_params = 
SamplingParams(temperature=0, top_p=0.95, max_tokens=10) + +llm = LLM( + model="meta-llama/Llama-3.2-1B-Instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + max_num_batched_tokens=64, + max_num_seqs=16, + kv_transfer_config=KVTransferConfig.from_cli( + '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both",' + '"kv_connector_extra_config": {"shared_storage_path": "local_storage"}}' + )) #, max_model_len=2048, max_num_batched_tokens=2048) + +# 1ST generation (prefill instance) +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py new file mode 100644 index 0000000000000..f7cbf6557d54f --- /dev/null +++ b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig + +context = "Hi " * 1000 +context2 = "Hey " * 500 +prompts = [ + context + "Hello, my name is", + context + "The capital of France is", + context2 + "Your name is", + context2 + "The capital of China is", +] + +sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) + +llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + kv_transfer_config=KVTransferConfig.from_cli( + '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", ' + '"kv_connector_extra_config": ' + '{"shared_storage_path": "local_storage"}}') + ) #, max_model_len=2048, max_num_batched_tokens=2048) + +# 1ST generation (prefill instance) +outputs = llm.generate( + prompts, + sampling_params, +) + +new_prompts = [] +for output in outputs: + prompt = output.prompt + generated_text = 
output.outputs[0].text + new_prompts.append(prompt + generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +# Write new_prompts to output.txt +with open("output.txt", "w") as f: + for prompt in new_prompts: + f.write(prompt + "\n") +print(f"Saved {len(new_prompts)} prompts to output.txt") diff --git a/examples/offline_inference/disaggregated-prefill-v1/run.sh b/examples/offline_inference/disaggregated-prefill-v1/run.sh new file mode 100644 index 0000000000000..0ebf45a1586a0 --- /dev/null +++ b/examples/offline_inference/disaggregated-prefill-v1/run.sh @@ -0,0 +1,5 @@ +rm -rf local_storage/ +rm output.txt + +VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 prefill_example.py +VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py diff --git a/examples/offline_inference/disaggregated_prefill.py b/examples/offline_inference/disaggregated_prefill.py index 36ee24bf7f18b..d60985146c5c9 100644 --- a/examples/offline_inference/disaggregated_prefill.py +++ b/examples/offline_inference/disaggregated_prefill.py @@ -95,7 +95,7 @@ def run_decode(prefill_done): print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") -if __name__ == "__main__": +def main(): prefill_done = Event() prefill_process = Process(target=run_prefill, args=(prefill_done, )) decode_process = Process(target=run_decode, args=(prefill_done, )) @@ -109,3 +109,7 @@ if __name__ == "__main__": # Terminate the prefill node when decode is finished decode_process.join() prefill_process.terminate() + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/disaggregated_prefill_lmcache.py b/examples/offline_inference/disaggregated_prefill_lmcache.py index 5c84bbfc92c53..7da6fb7aaa230 100644 --- a/examples/offline_inference/disaggregated_prefill_lmcache.py +++ b/examples/offline_inference/disaggregated_prefill_lmcache.py @@ -38,6 +38,10 @@ os.environ["LMCACHE_REMOTE_URL"] = f"lm://localhost:{port}" # `naive` 
indicates using raw bytes of the tensor without any compression os.environ["LMCACHE_REMOTE_SERDE"] = "naive" +prompts = [ + "Hello, how are you?" * 1000, +] + def run_prefill(prefill_done, prompts): # We use GPU 0 for prefill node. @@ -106,12 +110,7 @@ def run_lmcache_server(port): return server_proc -if __name__ == "__main__": - - prompts = [ - "Hello, how are you?" * 1000, - ] - +def main(): prefill_done = Event() prefill_process = Process(target=run_prefill, args=(prefill_done, prompts)) decode_process = Process(target=run_decode, args=(prefill_done, prompts)) @@ -128,3 +127,7 @@ if __name__ == "__main__": prefill_process.terminate() lmcache_server_process.terminate() lmcache_server_process.wait() + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/distributed.py b/examples/offline_inference/distributed.py deleted file mode 100644 index e890c6dad8bd1..0000000000000 --- a/examples/offline_inference/distributed.py +++ /dev/null @@ -1,109 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -""" -This example shows how to use Ray Data for running offline batch inference -distributively on a multi-nodes cluster. - -Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html -""" - -from typing import Any - -import numpy as np -import ray -from packaging.version import Version -from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy - -from vllm import LLM, SamplingParams - -assert Version(ray.__version__) >= Version( - "2.22.0"), "Ray version must be at least 2.22.0" - -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - -# Set tensor parallelism per instance. -tensor_parallel_size = 1 - -# Set number of instances. Each instance will use tensor_parallel_size GPUs. -num_instances = 1 - - -# Create a class to do batch inference. -class LLMPredictor: - - def __init__(self): - # Create an LLM. 
- self.llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", - tensor_parallel_size=tensor_parallel_size) - - def __call__(self, batch: dict[str, np.ndarray]) -> dict[str, list]: - # Generate texts from the prompts. - # The output is a list of RequestOutput objects that contain the prompt, - # generated text, and other information. - outputs = self.llm.generate(batch["text"], sampling_params) - prompt: list[str] = [] - generated_text: list[str] = [] - for output in outputs: - prompt.append(output.prompt) - generated_text.append(' '.join([o.text for o in output.outputs])) - return { - "prompt": prompt, - "generated_text": generated_text, - } - - -# Read one text file from S3. Ray Data supports reading multiple files -# from cloud storage (such as JSONL, Parquet, CSV, binary format). -ds = ray.data.read_text("s3://anonymous@air-example-data/prompts.txt") - - -# For tensor_parallel_size > 1, we need to create placement groups for vLLM -# to use. Every actor has to have its own placement group. -def scheduling_strategy_fn(): - # One bundle per tensor parallel worker - pg = ray.util.placement_group( - [{ - "GPU": 1, - "CPU": 1 - }] * tensor_parallel_size, - strategy="STRICT_PACK", - ) - return dict(scheduling_strategy=PlacementGroupSchedulingStrategy( - pg, placement_group_capture_child_tasks=True)) - - -resources_kwarg: dict[str, Any] = {} -if tensor_parallel_size == 1: - # For tensor_parallel_size == 1, we simply set num_gpus=1. - resources_kwarg["num_gpus"] = 1 -else: - # Otherwise, we have to set num_gpus=0 and provide - # a function that will create a placement group for - # each instance. - resources_kwarg["num_gpus"] = 0 - resources_kwarg["ray_remote_args_fn"] = scheduling_strategy_fn - -# Apply batch inference for all input data. -ds = ds.map_batches( - LLMPredictor, - # Set the concurrency to the number of LLM instances. - concurrency=num_instances, - # Specify the batch size for inference. - batch_size=32, - **resources_kwarg, -) - -# Peek first 10 results. 
-# NOTE: This is for local testing and debugging. For production use case, -# one should write full result out as shown below. -outputs = ds.take(limit=10) -for output in outputs: - prompt = output["prompt"] - generated_text = output["generated_text"] - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -# Write inference output data out as Parquet files to S3. -# Multiple files would be written to the output destination, -# and each task would write one or more files separately. -# -# ds.write_parquet("s3://") diff --git a/examples/offline_inference/eagle.py b/examples/offline_inference/eagle.py index baa91b2d0364d..c7b4368c9b132 100644 --- a/examples/offline_inference/eagle.py +++ b/examples/offline_inference/eagle.py @@ -7,87 +7,108 @@ from transformers import AutoTokenizer from vllm import LLM, SamplingParams -parser = argparse.ArgumentParser() -parser.add_argument( - "--dataset", - type=str, - default="./examples/data/gsm8k.jsonl", - help="downloaded from the eagle repo " \ - "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/" -) -parser.add_argument("--max_num_seqs", type=int, default=8) -parser.add_argument("--num_prompts", type=int, default=80) -parser.add_argument("--num_spec_tokens", type=int, default=2) -parser.add_argument("--tp", type=int, default=1) -parser.add_argument("--draft_tp", type=int, default=1) -parser.add_argument("--enforce_eager", action='store_true') -parser.add_argument("--enable_chunked_prefill", action='store_true') -parser.add_argument("--max_num_batched_tokens", type=int, default=2048) -parser.add_argument("--temp", type=float, default=0) +def load_prompts(dataset_path, num_prompts): + if os.path.exists(dataset_path): + prompts = [] + try: + with open(dataset_path) as f: + for line in f: + data = json.loads(line) + prompts.append(data["turns"][0]) + except Exception as e: + print(f"Error reading dataset: {e}") + return [] + else: + prompts = [ + "The future of AI is", "The president of the United States is" + 
] -args = parser.parse_args() + return prompts[:num_prompts] -print(args) -model_dir = "meta-llama/Meta-Llama-3-8B-Instruct" -eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm" +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--dataset", + type=str, + default="./examples/data/gsm8k.jsonl", + help="downloaded from the eagle repo " \ + "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/" + ) + parser.add_argument("--max_num_seqs", type=int, default=8) + parser.add_argument("--num_prompts", type=int, default=80) + parser.add_argument("--num_spec_tokens", type=int, default=2) + parser.add_argument("--tp", type=int, default=1) + parser.add_argument("--draft_tp", type=int, default=1) + parser.add_argument("--enforce_eager", action='store_true') + parser.add_argument("--enable_chunked_prefill", action='store_true') + parser.add_argument("--max_num_batched_tokens", type=int, default=2048) + parser.add_argument("--temp", type=float, default=0) + return parser.parse_args() -max_model_len = 2048 -tokenizer = AutoTokenizer.from_pretrained(model_dir) +def main(): -if os.path.exists(args.dataset): - prompts = [] - num_prompts = args.num_prompts - with open(args.dataset) as f: - for line in f: - data = json.loads(line) - prompts.append(data["turns"][0]) -else: - prompts = ["The future of AI is", "The president of the United States is"] + args = parse_args() -prompts = prompts[:args.num_prompts] -num_prompts = len(prompts) + model_dir = "meta-llama/Meta-Llama-3-8B-Instruct" + eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm" -prompt_ids = [ - tokenizer.apply_chat_template([{ - "role": "user", - "content": prompt - }], - add_generation_prompt=True) - for prompt in prompts -] + max_model_len = 2048 -llm = LLM( - model=model_dir, - trust_remote_code=True, - tensor_parallel_size=args.tp, - enable_chunked_prefill=args.enable_chunked_prefill, - max_num_batched_tokens=args.max_num_batched_tokens, - enforce_eager=args.enforce_eager, - 
max_model_len=max_model_len, - max_num_seqs=args.max_num_seqs, - gpu_memory_utilization=0.8, - speculative_model=eagle_dir, - num_speculative_tokens=args.num_spec_tokens, - speculative_draft_tensor_parallel_size=args.draft_tp, - speculative_max_model_len=max_model_len, - disable_log_stats=False, -) + tokenizer = AutoTokenizer.from_pretrained(model_dir) -sampling_params = SamplingParams(temperature=args.temp, max_tokens=256) + prompts = load_prompts(args.dataset, args.num_prompts) -outputs = llm.generate(prompt_token_ids=prompt_ids, - sampling_params=sampling_params) + prompt_ids = [ + tokenizer.apply_chat_template([{ + "role": "user", + "content": prompt + }], + add_generation_prompt=True) + for prompt in prompts + ] -# calculate the average number of accepted tokens per forward pass, +1 is -# to account for the token from the target model that's always going to be -# accepted -acceptance_counts = [0] * (args.num_spec_tokens + 1) -for output in outputs: - for step, count in enumerate(output.metrics.spec_token_acceptance_counts): - acceptance_counts[step] += count + llm = LLM( + model=model_dir, + trust_remote_code=True, + tensor_parallel_size=args.tp, + enable_chunked_prefill=args.enable_chunked_prefill, + max_num_batched_tokens=args.max_num_batched_tokens, + enforce_eager=args.enforce_eager, + max_model_len=max_model_len, + max_num_seqs=args.max_num_seqs, + gpu_memory_utilization=0.8, + speculative_config={ + "method": "eagle", + "model": eagle_dir, + "num_speculative_tokens": args.num_spec_tokens, + "draft_tensor_parallel_size": args.draft_tp, + "max_model_len": max_model_len, + }, + disable_log_stats=False, + ) -print(f"mean acceptance length: \ - {sum(acceptance_counts) / acceptance_counts[0]:.2f}") + sampling_params = SamplingParams(temperature=args.temp, max_tokens=256) + + outputs = llm.generate(prompt_token_ids=prompt_ids, + sampling_params=sampling_params) + + # calculate the average number of accepted tokens per forward pass, +1 is + # to account for the 
token from the target model that's always going to be + # accepted + acceptance_counts = [0] * (args.num_spec_tokens + 1) + for output in outputs: + for step, count in enumerate( + output.metrics.spec_token_acceptance_counts): + acceptance_counts[step] += count + + print("-" * 50) + print(f"mean acceptance length: \ + {sum(acceptance_counts) / acceptance_counts[0]:.2f}") + print("-" * 50) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/embed_jina_embeddings_v3.py b/examples/offline_inference/embed_jina_embeddings_v3.py new file mode 100644 index 0000000000000..b347ddbf3197a --- /dev/null +++ b/examples/offline_inference/embed_jina_embeddings_v3.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: Apache-2.0 + +from argparse import Namespace + +from vllm import LLM, EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def parse_args(): + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + # Set example specific arguments + parser.set_defaults(model="jinaai/jina-embeddings-v3", + task="embed", + trust_remote_code=True) + return parser.parse_args() + + +def main(args: Namespace): + # Sample prompts. + prompts = [ + "Follow the white rabbit.", # English + "Sigue al conejo blanco.", # Spanish + "Suis le lapin blanc.", # French + "跟着白兔走。", # Chinese + "اتبع الأرنب الأبيض.", # Arabic + "Folge dem weißen Kaninchen.", # German + ] + + # Create an LLM. + # You should pass task="embed" for embedding models + model = LLM(**vars(args)) + + # Generate embedding. The output is a list of EmbeddingRequestOutputs. + # Only text matching task is supported for now. See #16120 + outputs = model.embed(prompts) + + # Print the outputs. + print("\nGenerated Outputs:") + print("Only text matching task is supported for now. 
See #16120") + print("-" * 60) + for prompt, output in zip(prompts, outputs): + embeds = output.outputs.embedding + embeds_trimmed = ((str(embeds[:16])[:-1] + + ", ...]") if len(embeds) > 16 else embeds) + print(f"Prompt: {prompt!r} \n" + f"Embeddings for text matching: {embeds_trimmed} " + f"(size={len(embeds)})") + print("-" * 60) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/offline_inference/embed_matryoshka_fy.py b/examples/offline_inference/embed_matryoshka_fy.py new file mode 100644 index 0000000000000..7a6cb02556d9a --- /dev/null +++ b/examples/offline_inference/embed_matryoshka_fy.py @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: Apache-2.0 + +from argparse import Namespace + +from vllm import LLM, EngineArgs, PoolingParams +from vllm.utils import FlexibleArgumentParser + + +def parse_args(): + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + # Set example specific arguments + parser.set_defaults(model="jinaai/jina-embeddings-v3", + task="embed", + trust_remote_code=True) + return parser.parse_args() + + +def main(args: Namespace): + # Sample prompts. + prompts = [ + "Follow the white rabbit.", # English + "Sigue al conejo blanco.", # Spanish + "Suis le lapin blanc.", # French + "跟着白兔走。", # Chinese + "اتبع الأرنب الأبيض.", # Arabic + "Folge dem weißen Kaninchen.", # German + ] + + # Create an LLM. + # You should pass task="embed" for embedding models + model = LLM(**vars(args)) + + # Generate embedding. The output is a list of EmbeddingRequestOutputs. + outputs = model.embed(prompts, pooling_params=PoolingParams(dimensions=32)) + + # Print the outputs. 
+ print("\nGenerated Outputs:") + print("-" * 60) + for prompt, output in zip(prompts, outputs): + embeds = output.outputs.embedding + embeds_trimmed = ((str(embeds[:16])[:-1] + + ", ...]") if len(embeds) > 16 else embeds) + print(f"Prompt: {prompt!r} \n" + f"Embeddings: {embeds_trimmed} " + f"(size={len(embeds)})") + print("-" * 60) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/offline_inference/encoder_decoder.py b/examples/offline_inference/encoder_decoder.py index 8765d1812cc53..c4916e00f473c 100644 --- a/examples/offline_inference/encoder_decoder.py +++ b/examples/offline_inference/encoder_decoder.py @@ -8,93 +8,112 @@ from vllm import LLM, SamplingParams from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, TokensPrompt, zip_enc_dec_prompts) -dtype = "float" -# Create a BART encoder/decoder model instance -llm = LLM( - model="facebook/bart-large-cnn", - dtype=dtype, -) +def create_prompts(tokenizer): + # Test prompts + # + # This section shows all of the valid ways to prompt an + # encoder/decoder model. + # + # - Helpers for building prompts + text_prompt_raw = "Hello, my name is" + text_prompt = TextPrompt(prompt="The president of the United States is") + tokens_prompt = TokensPrompt(prompt_token_ids=tokenizer.encode( + prompt="The capital of France is")) + # - Pass a single prompt to encoder/decoder model + # (implicitly encoder input prompt); + # decoder input prompt is assumed to be None -# Get BART tokenizer -tokenizer = llm.llm_engine.get_tokenizer_group() + single_text_prompt_raw = text_prompt_raw # Pass a string directly + single_text_prompt = text_prompt # Pass a TextPrompt + single_tokens_prompt = tokens_prompt # Pass a TokensPrompt -# Test prompts -# -# This section shows all of the valid ways to prompt an -# encoder/decoder model. 
-# -# - Helpers for building prompts -text_prompt_raw = "Hello, my name is" -text_prompt = TextPrompt(prompt="The president of the United States is") -tokens_prompt = TokensPrompt(prompt_token_ids=tokenizer.encode( - prompt="The capital of France is")) -# - Pass a single prompt to encoder/decoder model -# (implicitly encoder input prompt); -# decoder input prompt is assumed to be None + # ruff: noqa: E501 + # - Pass explicit encoder and decoder input prompts within one data structure. + # Encoder and decoder prompts can both independently be text or tokens, with + # no requirement that they be the same prompt type. Some example prompt-type + # combinations are shown below, note that these are not exhaustive. -single_text_prompt_raw = text_prompt_raw # Pass a string directly -single_text_prompt = text_prompt # Pass a TextPrompt -single_tokens_prompt = tokens_prompt # Pass a TokensPrompt + enc_dec_prompt1 = ExplicitEncoderDecoderPrompt( + # Pass encoder prompt string directly, & + # pass decoder prompt tokens + encoder_prompt=single_text_prompt_raw, + decoder_prompt=single_tokens_prompt, + ) + enc_dec_prompt2 = ExplicitEncoderDecoderPrompt( + # Pass TextPrompt to encoder, and + # pass decoder prompt string directly + encoder_prompt=single_text_prompt, + decoder_prompt=single_text_prompt_raw, + ) + enc_dec_prompt3 = ExplicitEncoderDecoderPrompt( + # Pass encoder prompt tokens directly, and + # pass TextPrompt to decoder + encoder_prompt=single_tokens_prompt, + decoder_prompt=single_text_prompt, + ) -# - Pass explicit encoder and decoder input prompts within one data structure. -# Encoder and decoder prompts can both independently be text or tokens, with -# no requirement that they be the same prompt type. Some example prompt-type -# combinations are shown below, note that these are not exhaustive. 
+ # - Finally, here's a useful helper function for zipping encoder and + # decoder prompts together into a list of ExplicitEncoderDecoderPrompt + # instances + zipped_prompt_list = zip_enc_dec_prompts( + ['An encoder prompt', 'Another encoder prompt'], + ['A decoder prompt', 'Another decoder prompt']) -enc_dec_prompt1 = ExplicitEncoderDecoderPrompt( - # Pass encoder prompt string directly, & - # pass decoder prompt tokens - encoder_prompt=single_text_prompt_raw, - decoder_prompt=single_tokens_prompt, -) -enc_dec_prompt2 = ExplicitEncoderDecoderPrompt( - # Pass TextPrompt to encoder, and - # pass decoder prompt string directly - encoder_prompt=single_text_prompt, - decoder_prompt=single_text_prompt_raw, -) -enc_dec_prompt3 = ExplicitEncoderDecoderPrompt( - # Pass encoder prompt tokens directly, and - # pass TextPrompt to decoder - encoder_prompt=single_tokens_prompt, - decoder_prompt=single_text_prompt, -) + # - Let's put all of the above example prompts together into one list + # which we will pass to the encoder/decoder LLM. + return [ + single_text_prompt_raw, single_text_prompt, single_tokens_prompt, + enc_dec_prompt1, enc_dec_prompt2, enc_dec_prompt3 + ] + zipped_prompt_list -# - Finally, here's a useful helper function for zipping encoder and -# decoder prompts together into a list of ExplicitEncoderDecoderPrompt -# instances -zipped_prompt_list = zip_enc_dec_prompts( - ['An encoder prompt', 'Another encoder prompt'], - ['A decoder prompt', 'Another decoder prompt']) - -# - Let's put all of the above example prompts together into one list -# which we will pass to the encoder/decoder LLM. -prompts = [ - single_text_prompt_raw, single_text_prompt, single_tokens_prompt, - enc_dec_prompt1, enc_dec_prompt2, enc_dec_prompt3 -] + zipped_prompt_list - -print(prompts) # Create a sampling params object. 
-sampling_params = SamplingParams( - temperature=0, - top_p=1.0, - min_tokens=0, - max_tokens=20, -) +def create_sampling_params(): + return SamplingParams( + temperature=0, + top_p=1.0, + min_tokens=0, + max_tokens=20, + ) -# Generate output tokens from the prompts. The output is a list of -# RequestOutput objects that contain the prompt, generated -# text, and other information. -outputs = llm.generate(prompts, sampling_params) # Print the outputs. -for output in outputs: - prompt = output.prompt - encoder_prompt = output.encoder_prompt - generated_text = output.outputs[0].text - print(f"Encoder prompt: {encoder_prompt!r}, " - f"Decoder prompt: {prompt!r}, " - f"Generated text: {generated_text!r}") +def print_outputs(outputs): + print("-" * 50) + for i, output in enumerate(outputs): + prompt = output.prompt + encoder_prompt = output.encoder_prompt + generated_text = output.outputs[0].text + print(f"Output {i+1}:") + print(f"Encoder prompt: {encoder_prompt!r}\n" + f"Decoder prompt: {prompt!r}\n" + f"Generated text: {generated_text!r}") + print("-" * 50) + + +def main(): + dtype = "float" + + # Create a BART encoder/decoder model instance + llm = LLM( + model="facebook/bart-large-cnn", + dtype=dtype, + ) + + # Get BART tokenizer + tokenizer = llm.llm_engine.get_tokenizer_group() + + prompts = create_prompts(tokenizer) + sampling_params = create_sampling_params() + + # Generate output tokens from the prompts. The output is a list of + # RequestOutput objects that contain the prompt, generated + # text, and other information. 
+ outputs = llm.generate(prompts, sampling_params) + + print_outputs(outputs) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py index 6d0c3ac1ee09a..2883c37ca2360 100644 --- a/examples/offline_inference/encoder_decoder_multimodal.py +++ b/examples/offline_inference/encoder_decoder_multimodal.py @@ -22,7 +22,7 @@ class ModelRequestData(NamedTuple): def run_florence2(): engine_args = EngineArgs( model="microsoft/Florence-2-large", - tokenizer="facebook/bart-large", + tokenizer="Isotr0py/Florence-2-tokenizer", max_num_seqs=8, trust_remote_code=True, limit_mm_per_prompt={"image": 1}, @@ -56,7 +56,7 @@ def run_florence2(): def run_mllama(): engine_args = EngineArgs( model="meta-llama/Llama-3.2-11B-Vision-Instruct", - max_model_len=4096, + max_model_len=8192, max_num_seqs=2, limit_mm_per_prompt={"image": 1}, dtype="half", @@ -126,6 +126,23 @@ model_example_map = { } +def parse_args(): + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models for text generation') + parser.add_argument('--model-type', + '-m', + type=str, + default="mllama", + choices=model_example_map.keys(), + help='Huggingface "model_type".') + parser.add_argument("--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.") + return parser.parse_args() + + def main(args): model = args.model_type if model not in model_example_map: @@ -133,6 +150,11 @@ def main(args): req_data = model_example_map[model]() + # Disable other modalities to save memory + default_limits = {"image": 0, "video": 0, "audio": 0} + req_data.engine_args.limit_mm_per_prompt = default_limits | dict( + req_data.engine_args.limit_mm_per_prompt or {}) + engine_args = asdict(req_data.engine_args) | {"seed": args.seed} llm = LLM(**engine_args) @@ -143,6 +165,7 @@ def main(args): temperature=0, top_p=1.0, max_tokens=64, + 
skip_special_tokens=False, ) start = time.time() @@ -166,19 +189,5 @@ def main(args): if __name__ == "__main__": - parser = FlexibleArgumentParser( - description='Demo on using vLLM for offline inference with ' - 'vision language models for text generation') - parser.add_argument('--model-type', - '-m', - type=str, - default="mllama", - choices=model_example_map.keys(), - help='Huggingface "model_type".') - parser.add_argument("--seed", - type=int, - default=None, - help="Set the seed when initializing `vllm.LLM`.") - - args = parser.parse_args() + args = parse_args() main(args) diff --git a/examples/offline_inference/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py index e94f47b72b2e9..d84cd9ee9f52b 100644 --- a/examples/offline_inference/llm_engine_example.py +++ b/examples/offline_inference/llm_engine_example.py @@ -1,5 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 - +""" +This file demonstrates using the `LLMEngine` +for processing prompts with various sampling parameters. 
+""" import argparse from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams @@ -26,6 +29,7 @@ def process_requests(engine: LLMEngine, """Continuously process a list of prompts and handle the outputs.""" request_id = 0 + print('-' * 50) while test_prompts or engine.has_unfinished_requests(): if test_prompts: prompt, sampling_params = test_prompts.pop(0) @@ -37,6 +41,7 @@ def process_requests(engine: LLMEngine, for request_output in request_outputs: if request_output.finished: print(request_output) + print('-' * 50) def initialize_engine(args: argparse.Namespace) -> LLMEngine: @@ -45,6 +50,13 @@ def initialize_engine(args: argparse.Namespace) -> LLMEngine: return LLMEngine.from_engine_args(engine_args) +def parse_args(): + parser = FlexibleArgumentParser( + description='Demo on using the LLMEngine class directly') + parser = EngineArgs.add_cli_args(parser) + return parser.parse_args() + + def main(args: argparse.Namespace): """Main function that sets up and runs the prompt processing.""" engine = initialize_engine(args) @@ -53,8 +65,5 @@ def main(args: argparse.Namespace): if __name__ == '__main__': - parser = FlexibleArgumentParser( - description='Demo on using the LLMEngine class directly') - parser = EngineArgs.add_cli_args(parser) - args = parser.parse_args() + args = parse_args() main(args) diff --git a/examples/offline_inference/load_sharded_state.py b/examples/offline_inference/load_sharded_state.py new file mode 100644 index 0000000000000..7e90d5d25e293 --- /dev/null +++ b/examples/offline_inference/load_sharded_state.py @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Validates the loading of a model saved with the sharded_state format. +This script demonstrates how to load a model that was previously saved +using save_sharded_state.py and validates it by running inference. 
+Example usage: +(First need to save a sharded_state model) + +python save_sharded_state.py \ + --model /path/to/load \ + --quantization deepspeedfp \ + --tensor-parallel-size 8 \ + --output /path/to/save/sharded/model + +python load_sharded_state.py \ + --model /path/to/saved/sharded/model \ + --load-format sharded_state \ + --quantization deepspeedfp \ + --tensor-parallel-size 8 \ + --prompt "Hello, my name is" \ + --max-tokens 50 +""" + +import dataclasses + +from vllm import LLM, EngineArgs, SamplingParams +from vllm.utils import FlexibleArgumentParser + + +def parse_args(): + parser = FlexibleArgumentParser() + # Add engine arguments + EngineArgs.add_cli_args(parser) + + # Override default load_format for clarity + parser.set_defaults(load_format="sharded_state") + + # Add validation arguments + parser.add_argument("--prompt", + type=str, + default="Hello, world!", + help="Prompt for validation") + parser.add_argument("--max-tokens", + type=int, + default=100, + help="Maximum number of tokens to generate") + parser.add_argument("--temperature", + type=float, + default=0.7, + help="Sampling temperature") + parser.add_argument("--top-p", + type=float, + default=1.0, + help="Top-p sampling parameter") + + return parser.parse_args() + + +def main(): + args = parse_args() + engine_args = EngineArgs.from_cli_args(args) + + print(f"Loading model from {engine_args.model} " + f"using format {engine_args.load_format}") + print(f"Tensor parallel size: {engine_args.tensor_parallel_size}") + + # Load the model using engine args + llm = LLM(**dataclasses.asdict(engine_args)) + + # Prepare sampling parameters + sampling_params = SamplingParams( + temperature=args.temperature, + top_p=args.top_p, + max_tokens=args.max_tokens, + ) + + print("\nRunning inference:") + print(f"Prompt: {args.prompt}") + + # Generate completion + outputs = llm.generate(args.prompt, sampling_params) + + # Display generated text + print("\nGenerated outputs:") + for output in outputs: + 
generated_text = output.outputs[0].text + print("-" * 50) + print(f"Full output: {args.prompt}{generated_text}") + print("-" * 50) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py index 43be2aa80773f..37c3181dc5faf 100644 --- a/examples/offline_inference/mistral-small.py +++ b/examples/offline_inference/mistral-small.py @@ -13,9 +13,14 @@ from vllm.sampling_params import SamplingParams # - Server: # # ```bash +# # Mistral format # vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \ # --tokenizer-mode mistral --config-format mistral --load-format mistral \ -# --limit-mm-per-prompt 'image=4' --max-model-len 16384 +# --limit-mm-per-prompt '{"image":4}' --max-model-len 16384 +# +# # HF format +# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \ +# --limit-mm-per-prompt '{"image":4}' --max-model-len 16384 # ``` # # - Client: @@ -44,19 +49,23 @@ from vllm.sampling_params import SamplingParams # python demo.py simple # python demo.py advanced +# Lower max_model_len and/or max_num_seqs on low-VRAM GPUs. +# These scripts have been tested on 2x L40 GPUs + def run_simple_demo(args: argparse.Namespace): model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" sampling_params = SamplingParams(max_tokens=8192) - # Lower max_model_len and/or max_num_seqs on low-VRAM GPUs. 
llm = LLM( model=model_name, - tokenizer_mode="mistral", - config_format="mistral", - load_format="mistral", + tokenizer_mode="mistral" if args.format == "mistral" else "auto", + config_format="mistral" if args.format == "mistral" else "auto", + load_format="mistral" if args.format == "mistral" else "auto", + limit_mm_per_prompt={"image": 1}, max_model_len=4096, max_num_seqs=2, + tensor_parallel_size=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, ) @@ -82,23 +91,25 @@ def run_simple_demo(args: argparse.Namespace): }, ] outputs = llm.chat(messages, sampling_params=sampling_params) - + print("-" * 50) print(outputs[0].outputs[0].text) + print("-" * 50) def run_advanced_demo(args: argparse.Namespace): model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" - max_img_per_msg = 5 + max_img_per_msg = 3 max_tokens_per_img = 4096 sampling_params = SamplingParams(max_tokens=8192, temperature=0.7) llm = LLM( model=model_name, - tokenizer_mode="mistral", - config_format="mistral", - load_format="mistral", + tokenizer_mode="mistral" if args.format == "mistral" else "auto", + config_format="mistral" if args.format == "mistral" else "auto", + load_format="mistral" if args.format == "mistral" else "auto", limit_mm_per_prompt={"image": max_img_per_msg}, max_model_len=max_img_per_msg * max_tokens_per_img, + tensor_parallel_size=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, ) @@ -153,10 +164,12 @@ def run_advanced_demo(args: argparse.Namespace): ] outputs = llm.chat(messages=messages, sampling_params=sampling_params) + print("-" * 50) print(outputs[0].outputs[0].text) + print("-" * 50) -def main(): +def parse_args(): parser = argparse.ArgumentParser( description="Run a demo in simple or advanced mode.") @@ -166,12 +179,20 @@ def main(): help="Specify the demo mode: 'simple' or 'advanced'", ) + parser.add_argument('--format', + choices=["mistral", "hf"], + default="mistral", + help='Specify the format of the model to load.') + 
parser.add_argument( '--disable-mm-preprocessor-cache', action='store_true', help='If True, disables caching of multi-modal preprocessor/mapper.') + return parser.parse_args() - args = parser.parse_args() + +def main(): + args = parse_args() if args.mode == "simple": print("Running simple demo...") diff --git a/examples/offline_inference/mlpspeculator.py b/examples/offline_inference/mlpspeculator.py index 380c53fab2201..53c58a76d9dc1 100644 --- a/examples/offline_inference/mlpspeculator.py +++ b/examples/offline_inference/mlpspeculator.py @@ -1,4 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 +""" +This file demonstrates the usage of text generation with an LLM model, +comparing the performance with and without speculative decoding. + +Note that still not support `v1`: +VLLM_USE_V1=0 python examples/offline_inference/mlpspeculator.py +""" import gc import time @@ -7,7 +14,7 @@ from vllm import LLM, SamplingParams def time_generation(llm: LLM, prompts: list[str], - sampling_params: SamplingParams): + sampling_params: SamplingParams, title: str): # Generate texts from the prompts. The output is a list of RequestOutput # objects that contain the prompt, generated text, and other information. # Warmup first @@ -16,15 +23,18 @@ def time_generation(llm: LLM, prompts: list[str], start = time.time() outputs = llm.generate(prompts, sampling_params) end = time.time() - print((end - start) / sum([len(o.outputs[0].token_ids) for o in outputs])) + print("-" * 50) + print(title) + print("time: ", + (end - start) / sum(len(o.outputs[0].token_ids) for o in outputs)) # Print the outputs. for output in outputs: generated_text = output.outputs[0].text print(f"text: {generated_text!r}") + print("-" * 50) -if __name__ == "__main__": - +def main(): template = ( "Below is an instruction that describes a task. 
Write a response " "that appropriately completes the request.\n\n### Instruction:\n{}" @@ -41,8 +51,7 @@ if __name__ == "__main__": # Create an LLM without spec decoding llm = LLM(model="meta-llama/Llama-2-13b-chat-hf") - print("Without speculation") - time_generation(llm, prompts, sampling_params) + time_generation(llm, prompts, sampling_params, "Without speculation") del llm gc.collect() @@ -55,5 +64,8 @@ if __name__ == "__main__": }, ) - print("With speculation") - time_generation(llm, prompts, sampling_params) + time_generation(llm, prompts, sampling_params, "With speculation") + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py index 4b0d115e6609c..de409740292a8 100644 --- a/examples/offline_inference/multilora_inference.py +++ b/examples/offline_inference/multilora_inference.py @@ -61,6 +61,7 @@ def process_requests(engine: LLMEngine, """Continuously process a list of prompts and handle the outputs.""" request_id = 0 + print("-" * 50) while test_prompts or engine.has_unfinished_requests(): if test_prompts: prompt, sampling_params, lora_request = test_prompts.pop(0) @@ -75,6 +76,7 @@ def process_requests(engine: LLMEngine, for request_output in request_outputs: if request_output.finished: print(request_output) + print("-" * 50) def initialize_engine() -> LLMEngine: diff --git a/examples/offline_inference/neuron.py b/examples/offline_inference/neuron.py index 517d1bfce95d8..5906c7b2c6b30 100644 --- a/examples/offline_inference/neuron.py +++ b/examples/offline_inference/neuron.py @@ -12,27 +12,36 @@ prompts = [ # Create a sampling params object. sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -# Create an LLM. -llm = LLM( - model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", - max_num_seqs=8, - # The max_model_len and block_size arguments are required to be same as - # max sequence length when targeting neuron device. 
- # Currently, this is a known limitation in continuous batching support - # in transformers-neuronx. - # TODO(liangfu): Support paged-attention in transformers-neuronx. - max_model_len=1024, - block_size=1024, - # The device can be automatically detected when AWS Neuron SDK is installed. - # The device argument can be either unspecified for automated detection, - # or explicitly assigned. - device="neuron", - tensor_parallel_size=2) -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +def main(): + # Create an LLM. + llm = LLM( + model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + max_num_seqs=8, + # The max_model_len and block_size arguments are required to be same as + # max sequence length when targeting neuron device. + # Currently, this is a known limitation in continuous batching support + # in transformers-neuronx. + # TODO(liangfu): Support paged-attention in transformers-neuronx. + max_model_len=1024, + block_size=1024, + # ruff: noqa: E501 + # The device can be automatically detected when AWS Neuron SDK is installed. + # The device argument can be either unspecified for automated detection, + # or explicitly assigned. + device="neuron", + tensor_parallel_size=2) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. 
+ print("-" * 50) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py index c899a01a0bb93..af21274a3a5b8 100644 --- a/examples/offline_inference/neuron_int8_quantization.py +++ b/examples/offline_inference/neuron_int8_quantization.py @@ -22,31 +22,40 @@ prompts = [ # Create a sampling params object. sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -# Create an LLM. -llm = LLM( - model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", - max_num_seqs=8, - # The max_model_len and block_size arguments are required to be same as - # max sequence length when targeting neuron device. - # Currently, this is a known limitation in continuous batching support - # in transformers-neuronx. - # TODO(liangfu): Support paged-attention in transformers-neuronx. - max_model_len=2048, - block_size=2048, - # The device can be automatically detected when AWS Neuron SDK is installed. - # The device argument can be either unspecified for automated detection, - # or explicitly assigned. - device="neuron", - quantization="neuron_quant", - override_neuron_config={ - "cast_logits_dtype": "bfloat16", - }, - tensor_parallel_size=2) -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +def main(): + # Create an LLM. 
+ llm = LLM( + model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + max_num_seqs=8, + # The max_model_len and block_size arguments are required to be same as + # max sequence length when targeting neuron device. + # Currently, this is a known limitation in continuous batching support + # in transformers-neuronx. + # TODO(liangfu): Support paged-attention in transformers-neuronx. + max_model_len=2048, + block_size=2048, + # ruff: noqa: E501 + # The device can be automatically detected when AWS Neuron SDK is installed. + # The device argument can be either unspecified for automated detection, + # or explicitly assigned. + device="neuron", + quantization="neuron_quant", + override_neuron_config={ + "cast_logits_dtype": "bfloat16", + }, + tensor_parallel_size=2) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. + print("-" * 50) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/prefix_caching.py b/examples/offline_inference/prefix_caching.py index 4c326c417b4db..f0bec387d3a9b 100644 --- a/examples/offline_inference/prefix_caching.py +++ b/examples/offline_inference/prefix_caching.py @@ -31,55 +31,62 @@ generating_prompts = [prefix + prompt for prompt in prompts] # Create a sampling params object. sampling_params = SamplingParams(temperature=0.0) -# Create an LLM without prefix caching as a baseline. -regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4) -print("Results without `enable_prefix_caching`") +def main(): + # Create an LLM without prefix caching as a baseline. + regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4) -# Generate texts from the prompts. 
The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = regular_llm.generate(generating_prompts, sampling_params) + print("Results without `enable_prefix_caching`") -regular_generated_texts = [] -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - regular_generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + # ruff: noqa: E501 + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = regular_llm.generate(generating_prompts, sampling_params) -print("-" * 80) + regular_generated_texts = [] + # Print the outputs. + print("-" * 50) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + regular_generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) -# Destroy the LLM object and free up the GPU memory. -del regular_llm -cleanup_dist_env_and_memory() + # Destroy the LLM object and free up the GPU memory. + del regular_llm + cleanup_dist_env_and_memory() -# Create an LLM with prefix caching enabled. -prefix_cached_llm = LLM(model="facebook/opt-125m", - enable_prefix_caching=True, - gpu_memory_utilization=0.4) + # Create an LLM with prefix caching enabled. + prefix_cached_llm = LLM(model="facebook/opt-125m", + enable_prefix_caching=True, + gpu_memory_utilization=0.4) -# Warmup so that the shared prompt's KV cache is computed. -prefix_cached_llm.generate(generating_prompts[0], sampling_params) + # Warmup so that the shared prompt's KV cache is computed. + prefix_cached_llm.generate(generating_prompts[0], sampling_params) -# Generate with prefix caching. -outputs = prefix_cached_llm.generate(generating_prompts, sampling_params) + # Generate with prefix caching. 
+ outputs = prefix_cached_llm.generate(generating_prompts, sampling_params) -print("Results with `enable_prefix_caching`") + print("Results with `enable_prefix_caching`") -cached_generated_texts = [] -# Print the outputs. You should see the same outputs as before. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - cached_generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + cached_generated_texts = [] + # Print the outputs. You should see the same outputs as before. + print("-" * 50) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + cached_generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) -print("-" * 80) + # Compare the results and display the speedup + generated_same = all([ + regular_generated_texts[i] == cached_generated_texts[i] + for i in range(len(prompts)) + ]) + print(f"Generated answers are the same: {generated_same}") -# Compare the results and display the speedup -generated_same = all([ - regular_generated_texts[i] == cached_generated_texts[i] - for i in range(len(prompts)) -]) -print(f"Generated answers are the same: {generated_same}") + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py index 3ae507cac5ce1..f97a1f32e6210 100644 --- a/examples/offline_inference/prithvi_geospatial_mae.py +++ b/examples/offline_inference/prithvi_geospatial_mae.py @@ -417,6 +417,38 @@ def run_model(input_data, return pred_imgs +def parse_args(): + parser = argparse.ArgumentParser("MAE run inference", add_help=False) + + parser.add_argument( + "--data_file", + type=str, + default="./India_900498_S2Hand.tif", + help="Path to the file.", + ) + parser.add_argument( + "--output_dir", + type=str, + default="output", + help="Path to the directory where to save 
outputs.", + ) + parser.add_argument( + "--input_indices", + default=[1, 2, 3, 8, 11, 12], + type=int, + nargs="+", + help= + "0-based indices of the six Prithvi channels to be selected from the " + "input. By default selects [1,2,3,8,11,12] for S2L1C data.", + ) + parser.add_argument( + "--rgb_outputs", + action="store_true", + help="If present, output files will only contain RGB channels. " + "Otherwise, all bands will be saved.", + ) + + def main( data_file: str, output_dir: str, @@ -496,35 +528,7 @@ def main( if __name__ == "__main__": - parser = argparse.ArgumentParser("MAE run inference", add_help=False) - parser.add_argument( - "--data_file", - type=str, - default="./India_900498_S2Hand.tif", - help="Path to the file.", - ) - parser.add_argument( - "--output_dir", - type=str, - default="output", - help="Path to the directory where to save outputs.", - ) - parser.add_argument( - "--input_indices", - default=[1, 2, 3, 8, 11, 12], - type=int, - nargs="+", - help= - "0-based indices of the six Prithvi channels to be selected from the " - "input. By default selects [1,2,3,8,11,12] for S2L1C data.", - ) - parser.add_argument( - "--rgb_outputs", - action="store_true", - help="If present, output files will only contain RGB channels. 
" - "Otherwise, all bands will be saved.", - ) - args = parser.parse_args() + args = parse_args() main(**vars(args)) diff --git a/examples/offline_inference/profiling.py b/examples/offline_inference/profiling.py index ffa76b4e4f2ce..9c818d0757345 100644 --- a/examples/offline_inference/profiling.py +++ b/examples/offline_inference/profiling.py @@ -234,9 +234,8 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], sampling_params.max_tokens = next(output_len_generator) assert isinstance(sampling_params.max_tokens, int) - prompt_token_ids = torch.randint( - llm.llm_engine.model_config.get_vocab_size(), - size=(prompt_len, )).tolist() + prompt_token_ids = torch.randint(llm.get_tokenizer().vocab_size, + size=(prompt_len, )).tolist() llm.llm_engine.add_request( request_id=f"seq{i}", @@ -360,7 +359,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], f" in folder {context.save_chrome_traces_folder}") -if __name__ == "__main__": +def parse_args(): parser = FlexibleArgumentParser(description=""" Profile a model @@ -450,7 +449,10 @@ Profile a model EngineArgs.add_cli_args(parser) - args = parser.parse_args() + return parser.parse_args() + + +def main(args): context = ProfileContext( engine_args=EngineArgs.from_cli_args(args), **{ @@ -459,3 +461,8 @@ Profile a model if k in inspect.signature(ProfileContext).parameters }) run_profile(context, csv_output=args.csv, json_output=args.json) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/offline_inference/qwen2_5_omni/README.md b/examples/offline_inference/qwen2_5_omni/README.md new file mode 100644 index 0000000000000..c30541a598cee --- /dev/null +++ b/examples/offline_inference/qwen2_5_omni/README.md @@ -0,0 +1,32 @@ +# Qwen2.5-Omni Offline Inference Examples + +This folder provides several example scripts on how to inference Qwen2.5-Omni offline. 
+ +## Thinker Only + +```bash +# Audio + image + video +python examples/offline_inference/qwen2_5_omni/only_thinker.py -q mixed_modalities + +# Read vision and audio inputs from a single video file +# NOTE: V1 engine does not support interleaved modalities yet. +VLLM_USE_V1=0 python examples/offline_inference/qwen2_5_omni/only_thinker.py -q use_audio_in_video + +# Multiple audios +VLLM_USE_V1=0 python examples/offline_inference/qwen2_5_omni/only_thinker.py -q multi_audios +``` + +This script will run the thinker part of Qwen2.5-Omni, and generate text response. + +You can also test Qwen2.5-Omni on a single modality: + +```bash +# Process audio inputs +python examples/offline_inference/audio_language.py --model-type qwen2_5_omni + +# Process image inputs +python examples/offline_inference/vision_language.py --modality image --model-type qwen2_5_omni + +# Process video inputs +python examples/offline_inference/vision_language.py --modality video --model-type qwen2_5_omni +``` diff --git a/examples/offline_inference/qwen2_5_omni/only_thinker.py b/examples/offline_inference/qwen2_5_omni/only_thinker.py new file mode 100644 index 0000000000000..c75a990120e07 --- /dev/null +++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py @@ -0,0 +1,160 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on Qwen2.5-Omni (thinker only). +""" + +from typing import NamedTuple + +import vllm.envs as envs +from vllm import LLM, SamplingParams +from vllm.assets.audio import AudioAsset +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset +from vllm.utils import FlexibleArgumentParser + + +class QueryResult(NamedTuple): + inputs: dict + limit_mm_per_prompt: dict[str, int] + + +# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on +# lower-end GPUs. +# Unless specified, these settings have been tested to work on a single L4. 
+ +default_system = ( + "You are Qwen, a virtual human developed by the Qwen Team, Alibaba " + "Group, capable of perceiving auditory and visual inputs, as well as " + "generating text and speech.") + + +def get_mixed_modalities_query() -> QueryResult: + question = ("What is recited in the audio? " + "What is the content of this image? Why is this video funny?") + prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>" + "<|vision_bos|><|IMAGE|><|vision_eos|>" + "<|vision_bos|><|VIDEO|><|vision_eos|>" + f"{question}<|im_end|>\n" + f"<|im_start|>assistant\n") + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": { + "audio": + AudioAsset("mary_had_lamb").audio_and_sample_rate, + "image": + ImageAsset("cherry_blossom").pil_image.convert("RGB"), + "video": + VideoAsset(name="sample_demo_1.mp4", + num_frames=16).np_ndarrays, + }, + }, + limit_mm_per_prompt={ + "audio": 1, + "image": 1, + "video": 1 + }, + ) + + +def get_use_audio_in_video_query() -> QueryResult: + question = ("Describe the content of the video, " + "then convert what the baby say into text.") + prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>" + f"{question}<|im_end|>\n" + f"<|im_start|>assistant\n") + asset = VideoAsset(name="sample_demo_1.mp4", num_frames=16) + audio = asset.get_audio(sampling_rate=16000) + assert not envs.VLLM_USE_V1, ("V1 does not support use_audio_in_video. " + "Please launch this example with " + "`VLLM_USE_V1=0`.") + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": { + "video": asset.np_ndarrays, + "audio": audio, + }, + "mm_processor_kwargs": { + "use_audio_in_video": True, + }, + }, + limit_mm_per_prompt={ + "audio": 1, + "video": 1 + }, + ) + + +def get_multi_audios_query() -> QueryResult: + question = "Are these two audio clips the same?" 
+ prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>" + "<|audio_bos|><|AUDIO|><|audio_eos|>" + f"{question}<|im_end|>\n" + f"<|im_start|>assistant\n") + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": { + "audio": [ + AudioAsset("winning_call").audio_and_sample_rate, + AudioAsset("mary_had_lamb").audio_and_sample_rate, + ], + }, + }, + limit_mm_per_prompt={ + "audio": 2, + }, + ) + + +query_map = { + "mixed_modalities": get_mixed_modalities_query, + "use_audio_in_video": get_use_audio_in_video_query, + "multi_audios": get_multi_audios_query, +} + + +def main(args): + model_name = "Qwen/Qwen2.5-Omni-7B" + query_result = query_map[args.query_type]() + + llm = LLM(model=model_name, + max_model_len=5632, + max_num_seqs=5, + limit_mm_per_prompt=query_result.limit_mm_per_prompt, + seed=args.seed) + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. 
+ sampling_params = SamplingParams(temperature=0.2, max_tokens=64) + + outputs = llm.generate(query_result.inputs, + sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'audio language models') + parser.add_argument('--query-type', + '-q', + type=str, + default="mixed_modalities", + choices=query_map.keys(), + help='Query type.') + parser.add_argument("--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.") + + args = parser.parse_args() + main(args) diff --git a/examples/offline_inference/reproduciblity.py b/examples/offline_inference/reproduciblity.py index d0197bf6d5ba0..b2be117d1a0a9 100644 --- a/examples/offline_inference/reproduciblity.py +++ b/examples/offline_inference/reproduciblity.py @@ -19,8 +19,6 @@ SEED = 42 # because it is almost impossible to make the scheduling deterministic in the # online serving setting. 
-llm = LLM(model="facebook/opt-125m", seed=SEED) - prompts = [ "Hello, my name is", "The president of the United States is", @@ -29,8 +27,17 @@ prompts = [ ] sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -outputs = llm.generate(prompts, sampling_params) -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +def main(): + llm = LLM(model="facebook/opt-125m", seed=SEED) + outputs = llm.generate(prompts, sampling_params) + print("-" * 50) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py index b0418c092ca3c..e0ed0ac49754b 100644 --- a/examples/offline_inference/rlhf.py +++ b/examples/offline_inference/rlhf.py @@ -85,11 +85,13 @@ sampling_params = SamplingParams(temperature=0) outputs = ray.get(llm.generate.remote(prompts, sampling_params)) +print("-" * 50) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, " + print(f"Prompt: {prompt!r}\n" f"Generated text: {generated_text!r}") + print("-" * 50) # set up the communication between the training process # and the inference engine. @@ -120,8 +122,10 @@ assert all(ray.get(llm.collective_rpc.remote("check_weights_changed"))) # use the updated model to generate texts, they will be nonsense # because the weights are all zeros. 
outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params)) +print("-" * 50) for output in outputs_updated: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, " + print(f"Prompt: {prompt!r}\n" f"Generated text: {generated_text!r}") + print("-" * 50) diff --git a/examples/offline_inference/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py index 863276432cb9c..338380cc96841 100644 --- a/examples/offline_inference/save_sharded_state.py +++ b/examples/offline_inference/save_sharded_state.py @@ -29,20 +29,23 @@ from pathlib import Path from vllm import LLM, EngineArgs from vllm.utils import FlexibleArgumentParser -parser = FlexibleArgumentParser() -EngineArgs.add_cli_args(parser) -parser.add_argument("--output", - "-o", - required=True, - type=str, - help="path to output checkpoint") -parser.add_argument("--file-pattern", - type=str, - help="string pattern of saved filenames") -parser.add_argument("--max-file-size", - type=str, - default=5 * 1024**3, - help="max size (in bytes) of each safetensors file") + +def parse_args(): + parser = FlexibleArgumentParser() + EngineArgs.add_cli_args(parser) + parser.add_argument("--output", + "-o", + required=True, + type=str, + help="path to output checkpoint") + parser.add_argument("--file-pattern", + type=str, + help="string pattern of saved filenames") + parser.add_argument("--max-file-size", + type=str, + default=5 * 1024**3, + help="max size (in bytes) of each safetensors file") + return parser.parse_args() def main(args): @@ -57,10 +60,25 @@ def main(args): # Prepare output directory Path(args.output).mkdir(exist_ok=True) # Dump worker states to output directory - model_executor = llm.llm_engine.model_executor - model_executor.save_sharded_state(path=args.output, - pattern=args.file_pattern, - max_size=args.max_file_size) + + # Check which engine version is being used + is_v1_engine = hasattr(llm.llm_engine, "engine_core") + + if is_v1_engine: + # 
For V1 engine, we need to use engine_core.save_sharded_state + print("Using V1 engine save path") + llm.llm_engine.engine_core.save_sharded_state( + path=args.output, + pattern=args.file_pattern, + max_size=args.max_file_size) + else: + # For V0 engine + print("Using V0 engine save path") + model_executor = llm.llm_engine.model_executor + model_executor.save_sharded_state(path=args.output, + pattern=args.file_pattern, + max_size=args.max_file_size) + # Copy metadata files to output directory for file in os.listdir(model_path): if os.path.splitext(file)[1] not in (".bin", ".pt", ".safetensors"): @@ -72,5 +90,5 @@ def main(args): if __name__ == "__main__": - args = parser.parse_args() + args = parse_args() main(args) diff --git a/examples/offline_inference/simple_profiling.py b/examples/offline_inference/simple_profiling.py index b45954b3bd54a..d583110c8e69b 100644 --- a/examples/offline_inference/simple_profiling.py +++ b/examples/offline_inference/simple_profiling.py @@ -18,8 +18,8 @@ prompts = [ # Create a sampling params object. sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -if __name__ == "__main__": +def main(): # Create an LLM. llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1) @@ -32,11 +32,17 @@ if __name__ == "__main__": llm.stop_profile() # Print the outputs. + print("-" * 50) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) # Add a buffer to wait for profiler in the background process # (in case MP is on) to finish writing profiling output. 
time.sleep(10) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/structured_outputs.py b/examples/offline_inference/structured_outputs.py index 38ffd7fb9903d..363b500e0adf8 100644 --- a/examples/offline_inference/structured_outputs.py +++ b/examples/offline_inference/structured_outputs.py @@ -1,4 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 +""" +This file demonstrates the example usage of guided decoding +to generate structured outputs using vLLM. It shows how to apply +different guided decoding techniques such as Choice, Regex, JSON schema, +and Grammar to produce structured and formatted results +based on specific prompts. +""" from enum import Enum @@ -7,26 +14,21 @@ from pydantic import BaseModel from vllm import LLM, SamplingParams from vllm.sampling_params import GuidedDecodingParams -llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100) - # Guided decoding by Choice (list of possible options) -guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"]) -sampling_params = SamplingParams(guided_decoding=guided_decoding_params) -outputs = llm.generate( - prompts="Classify this sentiment: vLLM is wonderful!", - sampling_params=sampling_params, -) -print(outputs[0].outputs[0].text) +guided_decoding_params_choice = GuidedDecodingParams( + choice=["Positive", "Negative"]) +sampling_params_choice = SamplingParams( + guided_decoding=guided_decoding_params_choice) +prompt_choice = "Classify this sentiment: vLLM is wonderful!" # Guided decoding by Regex -guided_decoding_params = GuidedDecodingParams(regex="\w+@\w+\.com\n") -sampling_params = SamplingParams(guided_decoding=guided_decoding_params, - stop=["\n"]) -prompt = ("Generate an email address for Alan Turing, who works in Enigma." - "End in .com and new line. 
Example result:" - "alan.turing@enigma.com\n") -outputs = llm.generate(prompts=prompt, sampling_params=sampling_params) -print(outputs[0].outputs[0].text) +guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n") +sampling_params_regex = SamplingParams( + guided_decoding=guided_decoding_params_regex, stop=["\n"]) +prompt_regex = ( + "Generate an email address for Alan Turing, who works in Enigma." + "End in .com and new line. Example result:" + "alan.turing@enigma.com\n") # Guided decoding by JSON using Pydantic schema @@ -44,37 +46,54 @@ class CarDescription(BaseModel): json_schema = CarDescription.model_json_schema() - -guided_decoding_params = GuidedDecodingParams(json=json_schema) -sampling_params = SamplingParams(guided_decoding=guided_decoding_params) -prompt = ("Generate a JSON with the brand, model and car_type of" - "the most iconic car from the 90's") -outputs = llm.generate( - prompts=prompt, - sampling_params=sampling_params, -) -print(outputs[0].outputs[0].text) +guided_decoding_params_json = GuidedDecodingParams(json=json_schema) +sampling_params_json = SamplingParams( + guided_decoding=guided_decoding_params_json) +prompt_json = ("Generate a JSON with the brand, model and car_type of" + "the most iconic car from the 90's") # Guided decoding by Grammar simplified_sql_grammar = """ - ?start: select_statement - - ?select_statement: "SELECT " column_list " FROM " table_name - - ?column_list: column_name ("," column_name)* - - ?table_name: identifier - - ?column_name: identifier - - ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ +root ::= select_statement +select_statement ::= "SELECT " column " from " table " where " condition +column ::= "col_1 " | "col_2 " +table ::= "table_1 " | "table_2 " +condition ::= column "= " number +number ::= "1 " | "2 " """ -guided_decoding_params = GuidedDecodingParams(grammar=simplified_sql_grammar) -sampling_params = SamplingParams(guided_decoding=guided_decoding_params) -prompt = ("Generate an SQL query to 
show the 'username' and 'email'" - "from the 'users' table.") -outputs = llm.generate( - prompts=prompt, - sampling_params=sampling_params, -) -print(outputs[0].outputs[0].text) +guided_decoding_params_grammar = GuidedDecodingParams( + grammar=simplified_sql_grammar) +sampling_params_grammar = SamplingParams( + guided_decoding=guided_decoding_params_grammar) +prompt_grammar = ("Generate an SQL query to show the 'username' and 'email'" + "from the 'users' table.") + + +def format_output(title: str, output: str): + print(f"{'-' * 50}\n{title}: {output}\n{'-' * 50}") + + +def generate_output(prompt: str, sampling_params: SamplingParams, llm: LLM): + outputs = llm.generate(prompts=prompt, sampling_params=sampling_params) + return outputs[0].outputs[0].text + + +def main(): + llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100) + + choice_output = generate_output(prompt_choice, sampling_params_choice, llm) + format_output("Guided decoding by Choice", choice_output) + + regex_output = generate_output(prompt_regex, sampling_params_regex, llm) + format_output("Guided decoding by Regex", regex_output) + + json_output = generate_output(prompt_json, sampling_params_json, llm) + format_output("Guided decoding by JSON", json_output) + + grammar_output = generate_output(prompt_grammar, sampling_params_grammar, + llm) + format_output("Guided decoding by Grammar", grammar_output) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/torchrun_example.py b/examples/offline_inference/torchrun_example.py index 35df6011550f2..c6d9e6b47e21f 100644 --- a/examples/offline_inference/torchrun_example.py +++ b/examples/offline_inference/torchrun_example.py @@ -23,20 +23,26 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Use `distributed_executor_backend="external_launcher"` so that # this llm engine/instance only creates one worker. 
+# it is important to set an explicit seed to make sure that +# all ranks have the same random seed, so that sampling can be +# deterministic across ranks. llm = LLM( model="facebook/opt-125m", tensor_parallel_size=2, distributed_executor_backend="external_launcher", + seed=0, ) outputs = llm.generate(prompts, sampling_params) # all ranks will have the same outputs +print("-" * 50) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, " + print(f"Prompt: {prompt!r}\n" f"Generated text: {generated_text!r}") + print("-" * 50) """ Further tips: diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py index 4a8f17ba1d0d7..dea717c36082f 100644 --- a/examples/offline_inference/tpu.py +++ b/examples/offline_inference/tpu.py @@ -14,19 +14,24 @@ answers = [ ] N = 1 # Currently, top-p sampling is disabled. `top_p` should be 1.0. -sampling_params = SamplingParams(temperature=0.7, - top_p=1.0, - n=N, - max_tokens=16) +sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16) -# Set `enforce_eager=True` to avoid ahead-of-time compilation. -# In real workloads, `enforace_eager` should be `False`. -llm = LLM(model="Qwen/Qwen2-1.5B-Instruct", - max_num_batched_tokens=64, - max_num_seqs=4) -outputs = llm.generate(prompts, sampling_params) -for output, answer in zip(outputs, answers): - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - assert generated_text.startswith(answer) + +def main(): + # Set `enforce_eager=True` to avoid ahead-of-time compilation. + # In real workloads, `enforace_eager` should be `False`. 
+ llm = LLM(model="Qwen/Qwen2-1.5B-Instruct", + max_num_batched_tokens=64, + max_num_seqs=4) + outputs = llm.generate(prompts, sampling_params) + print("-" * 50) + for output, answer in zip(outputs, answers): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + assert generated_text.startswith(answer) + print("-" * 50) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 0adbe574370d3..d02ac17cfdd68 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -8,6 +8,7 @@ on HuggingFace model repository. """ import os import random +from contextlib import contextmanager from dataclasses import asdict from typing import NamedTuple, Optional @@ -44,7 +45,7 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData: max_model_len=4096, max_num_seqs=2, dtype="bfloat16", - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) prompts = [(f"<|im_start|>user\n<|img|>{question}" @@ -60,6 +61,28 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData: ) +# Aya Vision +def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + model_name = "CohereForAI/aya-vision-8b" + + engine_args = EngineArgs( + model=model_name, + max_model_len=2048, + max_num_seqs=2, + mm_processor_kwargs={"crop_to_patches": True}, + limit_mm_per_prompt={"image": 1}, + ) + prompts = [ + f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" + for question in questions + ] + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # BLIP-2 def run_blip2(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -68,8 +91,8 @@ def 
run_blip2(questions: list[str], modality: str) -> ModelRequestData: # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa prompts = [f"Question: {question} Answer:" for question in questions] engine_args = EngineArgs( - model="Salesforce/blip2-opt-2.7b", - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + model="Salesforce/blip2-opt-6.7b", + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -87,7 +110,7 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData: model="facebook/chameleon-7b", max_model_len=4096, max_num_seqs=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -106,8 +129,8 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData: model=model_name, max_model_len=4096, max_num_seqs=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}, + limit_mm_per_prompt={"image": 1}, ) prompts = [ @@ -127,11 +150,12 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData: engine_args = EngineArgs( model="microsoft/Florence-2-large", - tokenizer="facebook/bart-large", - max_num_seqs=8, + tokenizer="Isotr0py/Florence-2-tokenizer", + max_model_len=4096, + max_num_seqs=2, trust_remote_code=True, dtype="bfloat16", - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) prompts = ["" for _ in questions] @@ -151,7 +175,7 @@ def run_fuyu(questions: list[str], modality: str) -> ModelRequestData: model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -170,7 +194,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData: max_model_len=2048, max_num_seqs=2, 
mm_processor_kwargs={"do_pan_and_scan": True}, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) prompts = [("user\n" @@ -195,7 +219,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData: trust_remote_code=True, enforce_eager=True, hf_overrides={"architectures": ["GLM4VForCausalLM"]}, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) prompts = [ @@ -222,7 +246,7 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData: model=model_name, trust_remote_code=True, max_model_len=8192, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) tokenizer = AutoTokenizer.from_pretrained(model_name, @@ -263,7 +287,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: "longest_edge": 3 * 364 }, }, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) prompts = [( f"<|begin_of_text|>User:{question}\nAssistant:" @@ -275,6 +299,34 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: ) +# SmolVLM2-2.2B-Instruct +def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" + + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + enforce_eager=True, + mm_processor_kwargs={ + "max_image_size": { + "longest_edge": 384 + }, + }, + limit_mm_per_prompt={"image": 1}, + ) + prompts = [ + (f"<|im_start|>User:{question}\nAssistant:") + for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # InternVL def run_internvl(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -285,7 +337,7 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData: 
model=model_name, trust_remote_code=True, max_model_len=4096, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) tokenizer = AutoTokenizer.from_pretrained(model_name, @@ -312,6 +364,29 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData: ) +# Kimi-VL +def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + prompts = [ + "<|im_user|>user<|im_middle|><|media_start|>image<|media_content|>" + f"<|media_pad|><|media_end|>{question}<|im_end|>" + "<|im_assistant|>assistant<|im_middle|>" for question in questions + ] + + engine_args = EngineArgs( + model="moonshotai/Kimi-VL-A3B-Instruct", + trust_remote_code=True, + max_model_len=4096, + limit_mm_per_prompt={"image": 1}, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # LLaVA-1.5 def run_llava(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -323,7 +398,7 @@ def run_llava(questions: list[str], modality: str) -> ModelRequestData: engine_args = EngineArgs( model="llava-hf/llava-1.5-7b-hf", max_model_len=4096, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -340,7 +415,7 @@ def run_llava_next(questions: list[str], modality: str) -> ModelRequestData: engine_args = EngineArgs( model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -362,7 +437,7 @@ def run_llava_next_video(questions: list[str], model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192, max_num_seqs=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -390,7 +465,7 @@ def run_llava_onevision(questions: list[str], engine_args = EngineArgs( 
model="llava-hf/llava-onevision-qwen2-7b-ov-hf", max_model_len=16384, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -413,7 +488,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData: model="TIGER-Lab/Mantis-8B-siglip-llama3", max_model_len=4096, hf_overrides={"architectures": ["MantisForConditionalGeneration"]}, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) stop_token_ids = [128009] @@ -454,7 +529,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name): max_model_len=4096, max_num_seqs=2, trust_remote_code=True, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) # NOTE The stop_token_ids are different for various versions of MiniCPM-V # 2.0 @@ -497,6 +572,29 @@ def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData: return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6") +# Mistral-3 HF-format +def run_mistral3(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" + + # NOTE: Need L40 (or equivalent) to avoid OOM + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + tensor_parallel_size=2, + limit_mm_per_prompt={"image": 1}, + ) + + prompts = [f"[INST]{question}\n[IMG][/INST]" for question in questions] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # LLama 3.2 def run_mllama(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -510,9 +608,9 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData: # The configuration below has been confirmed to launch on a single L40 GPU. 
engine_args = EngineArgs( model=model_name, - max_model_len=4096, - max_num_seqs=16, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + max_model_len=8192, + max_num_seqs=2, + limit_mm_per_prompt={"image": 1}, ) tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -536,6 +634,42 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData: ) +def run_llama4(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" + + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=4, + tensor_parallel_size=8, + gpu_memory_utilization=0.4, + limit_mm_per_prompt={"image": 1}, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_name) + messages = [[{ + "role": + "user", + "content": [{ + "type": "image" + }, { + "type": "text", + "text": f"{question}" + }] + }] for question in questions] + prompts = tokenizer.apply_chat_template(messages, + add_generation_prompt=True, + tokenize=False) + stop_token_ids = None + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) + + # Molmo def run_molmo(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -546,7 +680,7 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData: model=model_name, trust_remote_code=True, dtype="bfloat16", - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) prompts = [ @@ -572,7 +706,7 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData: trust_remote_code=True, max_model_len=4096, tensor_parallel_size=4, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) tokenizer = AutoTokenizer.from_pretrained(model_name, @@ -599,7 +733,8 @@ def run_paligemma(questions: list[str], modality: str) -> ModelRequestData: prompts = ["caption 
en" for _ in questions] engine_args = EngineArgs( model="google/paligemma-3b-mix-224", - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) + limit_mm_per_prompt={"image": 1}, + ) return ModelRequestData( engine_args=engine_args, @@ -615,7 +750,8 @@ def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData: prompts = ["caption en" for _ in questions] engine_args = EngineArgs( model="google/paligemma2-3b-ft-docci-448", - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) + limit_mm_per_prompt={"image": 1}, + ) return ModelRequestData( engine_args=engine_args, @@ -651,7 +787,7 @@ def run_phi3v(questions: list[str], modality: str) -> ModelRequestData: max_num_seqs=2, # Note - mm_processor_kwargs can also be passed to generate/chat calls mm_processor_kwargs={"num_crops": 16}, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -678,10 +814,14 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData: engine_args = EngineArgs( model=model_path, trust_remote_code=True, - max_model_len=4096, + max_model_len=5120, max_num_seqs=2, + max_num_batched_tokens=12800, enable_lora=True, max_lora_rank=320, + # Note - mm_processor_kwargs can also be passed to generate/chat calls + mm_processor_kwargs={"dynamic_hd": 16}, + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -700,9 +840,9 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData: # NOTE: Need L40 (or equivalent) to avoid OOM engine_args = EngineArgs( model=model_name, - max_model_len=8192, + max_model_len=6144, max_num_seqs=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) prompts = [f"[INST]{question}\n[IMG][/INST]" for question in questions] @@ -723,7 +863,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData: max_model_len=1024, max_num_seqs=2, 
hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) prompts = [f"{question}Picture 1: \n" for question in questions] @@ -748,7 +888,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData: "min_pixels": 28 * 28, "max_pixels": 1280 * 28 * 28, }, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) if modality == "image": @@ -783,7 +923,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData: "max_pixels": 1280 * 28 * 28, "fps": 1, }, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + limit_mm_per_prompt={"image": 1}, ) if modality == "image": @@ -804,8 +944,80 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData: ) +# Qwen2.5-Omni +def run_qwen2_5_omni(questions: list[str], modality: str): + model_name = "Qwen/Qwen2.5-Omni-7B" + + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=5, + mm_processor_kwargs={ + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + "fps": [1], + }, + limit_mm_per_prompt={"image": 1}, + ) + + if modality == "image": + placeholder = "<|IMAGE|>" + elif modality == "video": + placeholder = "<|VIDEO|>" + + default_system = ( + "You are Qwen, a virtual human developed by the Qwen Team, Alibaba " + "Group, capable of perceiving auditory and visual inputs, as well as " + "generating text and speech.") + + prompts = [(f"<|im_start|>system\n{default_system}<|im_end|>\n" + f"<|im_start|>user\n<|vision_bos|>{placeholder}<|vision_eos|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n") for question in questions] + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + +# SkyworkR1V +def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = 
"Skywork/Skywork-R1V-38B" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + limit_mm_per_prompt={"image": 1}, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + messages = [[{ + 'role': 'user', + 'content': f"\n{question}" + }] for question in questions] + prompts = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + # Stop tokens for SkyworkR1V + # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py + stop_tokens = ["<|end▁of▁sentence|>", "<|endoftext|>"] + stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) + + model_example_map = { "aria": run_aria, + "aya_vision": run_aya_vision, "blip-2": run_blip2, "chameleon": run_chameleon, "deepseek_vl_v2": run_deepseek_vl2, @@ -816,6 +1028,7 @@ model_example_map = { "h2ovl_chat": run_h2ovl, "idefics3": run_idefics3, "internvl_chat": run_internvl, + "kimi_vl": run_kimi_vl, "llava": run_llava, "llava-next": run_llava_next, "llava-next-video": run_llava_next_video, @@ -823,7 +1036,9 @@ model_example_map = { "mantis": run_mantis, "minicpmo": run_minicpmo, "minicpmv": run_minicpmv, + "mistral3": run_mistral3, "mllama": run_mllama, + "llama4": run_llama4, "molmo": run_molmo, "NVLM_D": run_nvlm_d, "paligemma": run_paligemma, @@ -834,6 +1049,9 @@ model_example_map = { "qwen_vl": run_qwen_vl, "qwen2_vl": run_qwen2_vl, "qwen2_5_vl": run_qwen2_5_vl, + "qwen2_5_omni": run_qwen2_5_omni, + "skywork_chat": run_skyworkr1v, + "smolvlm": run_smolvlm, } @@ -905,80 +1123,21 @@ def apply_image_repeat(image_repeat_prob, num_prompts, data, return inputs -def main(args): - model = args.model_type - if model not in model_example_map: - raise ValueError(f"Model type {model} is not supported.") - - modality = args.modality - mm_input = get_multi_modal_input(args) - 
data = mm_input["data"] - questions = mm_input["questions"] - - req_data = model_example_map[model](questions, modality) - - engine_args = asdict(req_data.engine_args) | {"seed": args.seed} - llm = LLM(**engine_args) - - # To maintain code compatibility in this script, we add LoRA here. - # You can also add LoRA using: - # llm.generate(prompts, lora_request=lora_request,...) - if req_data.lora_requests: - for lora_request in req_data.lora_requests: - llm.llm_engine.add_lora(lora_request=lora_request) - - # Don't want to check the flag multiple times, so just hijack `prompts`. - prompts = req_data.prompts if args.use_different_prompt_per_request else [ - req_data.prompts[0] - ] - - # We set temperature to 0.2 so that outputs can be different - # even when all prompts are identical when running batch inference. - sampling_params = SamplingParams(temperature=0.2, - max_tokens=64, - stop_token_ids=req_data.stop_token_ids) - - assert args.num_prompts > 0 - if args.num_prompts == 1: - # Single inference - inputs = { - "prompt": prompts[0], - "multi_modal_data": { - modality: data - }, - } - else: - # Batch inference - if args.image_repeat_prob is not None: - # Repeat images with specified probability of "image_repeat_prob" - inputs = apply_image_repeat(args.image_repeat_prob, - args.num_prompts, data, prompts, - modality) - else: - # Use the same image for all prompts - inputs = [{ - "prompt": prompts[i % len(prompts)], - "multi_modal_data": { - modality: data - }, - } for i in range(args.num_prompts)] - - if args.time_generate: +@contextmanager +def time_counter(enable: bool): + if enable: import time start_time = time.time() - outputs = llm.generate(inputs, sampling_params=sampling_params) + yield elapsed_time = time.time() - start_time + print("-" * 50) print("-- generate time = {}".format(elapsed_time)) - + print("-" * 50) else: - outputs = llm.generate(inputs, sampling_params=sampling_params) - - for o in outputs: - generated_text = o.outputs[0].text - 
print(generated_text) + yield -if __name__ == "__main__": +def parse_args(): parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' 'vision language models for text generation') @@ -1028,6 +1187,86 @@ if __name__ == "__main__": action='store_true', help='If True, then use different prompt (with the same multi-modal ' 'data) for each request.') + return parser.parse_args() - args = parser.parse_args() + +def main(args): + model = args.model_type + if model not in model_example_map: + raise ValueError(f"Model type {model} is not supported.") + + modality = args.modality + mm_input = get_multi_modal_input(args) + data = mm_input["data"] + questions = mm_input["questions"] + + req_data = model_example_map[model](questions, modality) + + # Disable other modalities to save memory + default_limits = {"image": 0, "video": 0, "audio": 0} + req_data.engine_args.limit_mm_per_prompt = default_limits | dict( + req_data.engine_args.limit_mm_per_prompt or {}) + + engine_args = asdict(req_data.engine_args) | { + "seed": args.seed, + "disable_mm_preprocessor_cache": args.disable_mm_preprocessor_cache, + } + llm = LLM(**engine_args) + + # Don't want to check the flag multiple times, so just hijack `prompts`. + prompts = req_data.prompts if args.use_different_prompt_per_request else [ + req_data.prompts[0] + ] + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. 
+ sampling_params = SamplingParams(temperature=0.2, + max_tokens=64, + stop_token_ids=req_data.stop_token_ids) + + assert args.num_prompts > 0 + if args.num_prompts == 1: + # Single inference + inputs = { + "prompt": prompts[0], + "multi_modal_data": { + modality: data + }, + } + else: + # Batch inference + if args.image_repeat_prob is not None: + # Repeat images with specified probability of "image_repeat_prob" + inputs = apply_image_repeat(args.image_repeat_prob, + args.num_prompts, data, prompts, + modality) + else: + # Use the same image for all prompts + inputs = [{ + "prompt": prompts[i % len(prompts)], + "multi_modal_data": { + modality: data + }, + } for i in range(args.num_prompts)] + + # Add LoRA request if applicable + lora_request = (req_data.lora_requests * + args.num_prompts if req_data.lora_requests else None) + + with time_counter(args.time_generate): + outputs = llm.generate( + inputs, + sampling_params=sampling_params, + lora_request=lora_request, + ) + + print("-" * 50) + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + print("-" * 50) + + +if __name__ == "__main__": + args = parse_args() main(args) diff --git a/examples/offline_inference/vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py index a0b2b44b4e829..2637949551a1a 100644 --- a/examples/offline_inference/vision_language_embedding.py +++ b/examples/offline_inference/vision_language_embedding.py @@ -63,6 +63,7 @@ def run_e5_v(query: Query) -> ModelRequestData: model="royokong/e5-v", task="embed", max_model_len=4096, + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -93,6 +94,7 @@ def run_vlm2vec(query: Query) -> ModelRequestData: task="embed", trust_remote_code=True, mm_processor_kwargs={"num_crops": 4}, + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -131,6 +133,11 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]): query = get_query(modality) req_data = 
model_example_map[model](query) + # Disable other modalities to save memory + default_limits = {"image": 0, "video": 0, "audio": 0} + req_data.engine_args.limit_mm_per_prompt = default_limits | dict( + req_data.engine_args.limit_mm_per_prompt or {}) + engine_args = asdict(req_data.engine_args) | {"seed": seed} llm = LLM(**engine_args) @@ -143,12 +150,10 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]): "multi_modal_data": mm_data, }) + print("-" * 50) for output in outputs: print(output.outputs.embedding) - - -def main(args: Namespace): - run_encode(args.model_name, args.modality, args.seed) + print("-" * 50) model_example_map = { @@ -156,7 +161,8 @@ model_example_map = { "vlm2vec": run_vlm2vec, } -if __name__ == "__main__": + +def parse_args(): parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' 'vision language models for multimodal embedding') @@ -175,6 +181,13 @@ if __name__ == "__main__": type=int, default=None, help="Set the seed when initializing `vllm.LLM`.") + return parser.parse_args() - args = parser.parse_args() + +def main(args: Namespace): + run_encode(args.model_name, args.modality, args.seed) + + +if __name__ == "__main__": + args = parse_args() main(args) diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 98a739169d702..f165ea9efa10f 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -22,6 +22,16 @@ QUESTION = "What is the content of each image?" 
IMAGE_URLS = [ "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg", "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg", + "https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG", + "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg", + "https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg", + "https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg", + "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg", + "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg", + "https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg", + "https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg", + "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg", + "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg", ] @@ -61,6 +71,41 @@ def load_aria(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "CohereForAI/aya-vision-8b" + + engine_args = EngineArgs( + model=model_name, + max_num_seqs=2, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [{ + "role": + "user", + "content": [ + *placeholders, 
+ { + "type": "text", + "text": question + }, + ], + }] + + processor = AutoProcessor.from_pretrained(model_name) + + prompt = processor.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "deepseek-ai/deepseek-vl2-tiny" @@ -182,6 +227,33 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" + + # The configuration below has been confirmed to launch on a single L40 GPU. + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=16, + enforce_eager=True, + limit_mm_per_prompt={"image": len(image_urls)}, + mm_processor_kwargs={ + "max_image_size": { + "longest_edge": 384 + }, + }, + ) + + placeholders = "\n".join(f"Image-{i}: \n" + for i, _ in enumerate(image_urls, start=1)) + prompt = f"<|im_start|>User:{placeholders}\n{question}\nAssistant:" # noqa: E501 + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "OpenGVLab/InternVL2-2B" @@ -218,19 +290,115 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" + + engine_args = EngineArgs( + model=model_name, + max_model_len=131072, + tensor_parallel_size=8, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [{ + "role": + "user", + "content": [ + *placeholders, + { + 
"type": "text", + "text": question + }, + ], + }] + + processor = AutoProcessor.from_pretrained(model_name) + + prompt = processor.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + +def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "moonshotai/Kimi-VL-A3B-Instruct" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + max_num_seqs=4, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [{ + "role": + "user", + "content": [ + *placeholders, + { + "type": "text", + "text": question + }, + ], + }] + + processor = AutoProcessor.from_pretrained(model_name, + trust_remote_code=True) + + prompt = processor.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + +def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" + + # Adjust this as necessary to fit in GPU + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + tensor_parallel_size=2, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = "[IMG]" * len(image_urls) + prompt = f"[INST]{question}\n{placeholders}[/INST]" + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" # The configuration below has been confirmed to launch on a single L40 GPU. 
engine_args = EngineArgs( model=model_name, - max_model_len=4096, - max_num_seqs=16, + max_model_len=8192, + max_num_seqs=2, limit_mm_per_prompt={"image": len(image_urls)}, ) - placeholders = "<|image|>" * len(image_urls) - prompt = f"{placeholders}<|begin_of_text|>{question}" + img_prompt = "Given the first image <|image|> and the second image<|image|>" + prompt = f"<|begin_of_text|>{img_prompt}, {question}?" return ModelRequestData( engine_args=engine_args, prompt=prompt, @@ -335,11 +503,13 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData: engine_args = EngineArgs( model=model_path, trust_remote_code=True, - max_model_len=10000, + max_model_len=4096, max_num_seqs=2, limit_mm_per_prompt={"image": len(image_urls)}, enable_lora=True, max_lora_rank=320, + # Note - mm_processor_kwargs can also be passed to generate/chat calls + mm_processor_kwargs={"dynamic_hd": 4}, ) placeholders = "".join(f"<|image_{i}|>" @@ -504,11 +674,15 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: model_example_map = { "aria": load_aria, + "aya_vision": load_aya_vision, "deepseek_vl_v2": load_deepseek_vl2, "gemma3": load_gemma3, "h2ovl_chat": load_h2ovl, "idefics3": load_idefics3, "internvl_chat": load_internvl, + "kimi_vl": load_kimi_vl, + "llama4": load_llama4, + "mistral3": load_mistral3, "mllama": load_mllama, "NVLM_D": load_nvlm_d, "phi3_v": load_phi3v, @@ -517,6 +691,7 @@ model_example_map = { "qwen_vl_chat": load_qwen_vl_chat, "qwen2_vl": load_qwen2_vl, "qwen2_5_vl": load_qwen2_5_vl, + "smolvlm": load_smolvlm, } @@ -527,15 +702,8 @@ def run_generate(model, question: str, image_urls: list[str], engine_args = asdict(req_data.engine_args) | {"seed": args.seed} llm = LLM(**engine_args) - # To maintain code compatibility in this script, we add LoRA here. - # You can also add LoRA using: - # llm.generate(prompts, lora_request=lora_request,...) 
- if req_data.lora_requests: - for lora_request in req_data.lora_requests: - llm.llm_engine.add_lora(lora_request=lora_request) - sampling_params = SamplingParams(temperature=0.0, - max_tokens=128, + max_tokens=256, stop_token_ids=req_data.stop_token_ids) outputs = llm.generate( @@ -545,29 +713,31 @@ def run_generate(model, question: str, image_urls: list[str], "image": req_data.image_data }, }, - sampling_params=sampling_params) + sampling_params=sampling_params, + lora_request=req_data.lora_requests, + ) + print("-" * 50) for o in outputs: generated_text = o.outputs[0].text print(generated_text) + print("-" * 50) def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]): req_data = model_example_map[model](question, image_urls) + # Disable other modalities to save memory + default_limits = {"image": 0, "video": 0, "audio": 0} + req_data.engine_args.limit_mm_per_prompt = default_limits | dict( + req_data.engine_args.limit_mm_per_prompt or {}) + engine_args = asdict(req_data.engine_args) | {"seed": seed} llm = LLM(**engine_args) - # To maintain code compatibility in this script, we add LoRA here. - # You can also add LoRA using: - # llm.generate(prompts, lora_request=lora_request,...) 
- if req_data.lora_requests: - for lora_request in req_data.lora_requests: - llm.llm_engine.add_lora(lora_request=lora_request) - sampling_params = SamplingParams(temperature=0.0, - max_tokens=128, + max_tokens=256, stop_token_ids=req_data.stop_token_ids) outputs = llm.chat( [{ @@ -588,27 +758,17 @@ def run_chat(model: str, question: str, image_urls: list[str], }], sampling_params=sampling_params, chat_template=req_data.chat_template, + lora_request=req_data.lora_requests, ) + print("-" * 50) for o in outputs: generated_text = o.outputs[0].text print(generated_text) + print("-" * 50) -def main(args: Namespace): - model = args.model_type - method = args.method - seed = args.seed - - if method == "generate": - run_generate(model, QUESTION, IMAGE_URLS, seed) - elif method == "chat": - run_chat(model, QUESTION, IMAGE_URLS, seed) - else: - raise ValueError(f"Invalid method: {method}") - - -if __name__ == "__main__": +def parse_args(): parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' 'vision language models that support multi-image input for text ' @@ -628,6 +788,30 @@ if __name__ == "__main__": type=int, default=None, help="Set the seed when initializing `vllm.LLM`.") + parser.add_argument( + "--num-images", + "-n", + choices=list(range(1, 13)), # 12 is the max number of images + default=2, + help="Number of images to use for the demo.") + return parser.parse_args() - args = parser.parse_args() + +def main(args: Namespace): + model = args.model_type + method = args.method + seed = args.seed + + image_urls = IMAGE_URLS[:args.num_images] + + if method == "generate": + run_generate(model, QUESTION, image_urls, seed) + elif method == "chat": + run_chat(model, QUESTION, image_urls, seed) + else: + raise ValueError(f"Invalid method: {method}") + + +if __name__ == "__main__": + args = parse_args() main(args) diff --git a/examples/online_serving/api_client.py b/examples/online_serving/api_client.py index e2944896d1610..36079ff11d07e 
100644 --- a/examples/online_serving/api_client.py +++ b/examples/online_serving/api_client.py @@ -1,5 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Example Python client for `vllm.entrypoints.api_server` +Start the demo server: + python -m vllm.entrypoints.api_server --model + NOTE: The API server is used only for demonstration and simple performance benchmarks. It is not intended for production use. For production use, we recommend `vllm serve` and the OpenAI client API. @@ -7,6 +10,7 @@ For production use, we recommend `vllm serve` and the OpenAI client API. import argparse import json +from argparse import Namespace from collections.abc import Iterable import requests @@ -27,7 +31,6 @@ def post_http_request(prompt: str, pload = { "prompt": prompt, "n": n, - "use_beam_search": True, "temperature": 0.0, "max_tokens": 16, "stream": stream, @@ -55,14 +58,17 @@ def get_response(response: requests.Response) -> list[str]: return output -if __name__ == "__main__": +def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) - parser.add_argument("--n", type=int, default=4) + parser.add_argument("--n", type=int, default=1) parser.add_argument("--prompt", type=str, default="San Francisco is a") parser.add_argument("--stream", action="store_true") - args = parser.parse_args() + return parser.parse_args() + + +def main(args: Namespace): prompt = args.prompt api_url = f"http://{args.host}:{args.port}/generate" n = args.n @@ -83,3 +89,8 @@ if __name__ == "__main__": output = get_response(response) for i, line in enumerate(output): print(f"Beam candidate {i}: {line!r}", flush=True) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/online_serving/cohere_rerank_client.py b/examples/online_serving/cohere_rerank_client.py index fc434ada1d156..c2d4ef08ddbbe 100644 --- a/examples/online_serving/cohere_rerank_client.py +++ 
b/examples/online_serving/cohere_rerank_client.py @@ -2,32 +2,46 @@ """ Example of using the OpenAI entrypoint's rerank API which is compatible with the Cohere SDK: https://github.com/cohere-ai/cohere-python +Note that `pip install cohere` is needed to run this example. run: vllm serve BAAI/bge-reranker-base """ +from typing import Union + import cohere +from cohere import Client, ClientV2 -# cohere v1 client -co = cohere.Client(base_url="http://localhost:8000", api_key="sk-fake-key") -rerank_v1_result = co.rerank( - model="BAAI/bge-reranker-base", - query="What is the capital of France?", - documents=[ - "The capital of France is Paris", "Reranking is fun!", - "vLLM is an open-source framework for fast AI serving" - ]) +model = "BAAI/bge-reranker-base" -print(rerank_v1_result) +query = "What is the capital of France?" -# or the v2 -co2 = cohere.ClientV2("sk-fake-key", base_url="http://localhost:8000") +documents = [ + "The capital of France is Paris", "Reranking is fun!", + "vLLM is an open-source framework for fast AI serving" +] -v2_rerank_result = co2.rerank( - model="BAAI/bge-reranker-base", - query="What is the capital of France?", - documents=[ - "The capital of France is Paris", "Reranking is fun!", - "vLLM is an open-source framework for fast AI serving" - ]) -print(v2_rerank_result) +def cohere_rerank(client: Union[Client, ClientV2], model: str, query: str, + documents: list[str]) -> dict: + return client.rerank(model=model, query=query, documents=documents) + + +def main(): + # cohere v1 client + cohere_v1 = cohere.Client(base_url="http://localhost:8000", + api_key="sk-fake-key") + rerank_v1_result = cohere_rerank(cohere_v1, model, query, documents) + print("-" * 50) + print("rerank_v1_result:\n", rerank_v1_result) + print("-" * 50) + + # or the v2 + cohere_v2 = cohere.ClientV2("sk-fake-key", + base_url="http://localhost:8000") + rerank_v2_result = cohere_rerank(cohere_v2, model, query, documents) + print("rerank_v2_result:\n", rerank_v2_result) + 
print("-" * 50) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/disagg_examples/disagg_proxy_demo.py b/examples/online_serving/disagg_examples/disagg_proxy_demo.py new file mode 100644 index 0000000000000..a701636f357a8 --- /dev/null +++ b/examples/online_serving/disagg_examples/disagg_proxy_demo.py @@ -0,0 +1,450 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +This file provides a disaggregated prefilling proxy demo to demonstrate an +example usage of XpYd disaggregated prefilling. +We can launch multiple vllm instances (2 for prefill and 2 for decode), and +launch this proxy demo through: + python3 examples/online_serving/disagg_examples/disagg_proxy_demo.py \ + --model $model_name \ + --prefill localhost:8100 localhost:8101 \ + --decode localhost:8200 localhost:8201 \ + --port 8000 + +Note: This demo will be removed once the PDController implemented in PR 15343 +(https://github.com/vllm-project/vllm/pull/15343) supports XpYd. +""" +import argparse +import ipaddress +import itertools +import json +import logging +import os +import sys +from abc import ABC, abstractmethod +from typing import Callable, Optional + +import aiohttp +import requests +import uvicorn +from fastapi import (APIRouter, Depends, FastAPI, Header, HTTPException, + Request, status) +from fastapi.responses import JSONResponse, StreamingResponse + +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) +logger = logging.getLogger() +logging.basicConfig(level=logging.INFO) + + +class SchedulingPolicy(ABC): + + @abstractmethod + def schedule(self, cycler: itertools.cycle): + raise NotImplementedError("Scheduling Proxy is not set.") + + +class Proxy: + + def __init__( + self, + prefill_instances: list[str], + decode_instances: list[str], + model: str, + scheduling_policy: SchedulingPolicy, + custom_create_completion: Optional[Callable[[Request], + StreamingResponse]] = None, + custom_create_chat_completion: Optional[Callable[ + [Request], StreamingResponse]] = None, 
+ ): + self.prefill_instances = prefill_instances + self.decode_instances = decode_instances + self.prefill_cycler = itertools.cycle(prefill_instances) + self.decode_cycler = itertools.cycle(decode_instances) + self.model = model + self.scheduling_policy = scheduling_policy + self.custom_create_completion = custom_create_completion + self.custom_create_chat_completion = custom_create_chat_completion + self.router = APIRouter() + self.setup_routes() + + def setup_routes(self): + self.router.post( + "/v1/completions", + dependencies=[ + Depends(self.validate_json_request) + ])(self.custom_create_completion if self. + custom_create_completion else self.create_completion) + self.router.post( + "/v1/chat/completions", + dependencies=[ + Depends(self.validate_json_request) + ])(self.custom_create_chat_completion if self. + custom_create_chat_completion else self.create_chat_completion) + self.router.get("/status", + response_class=JSONResponse)(self.get_status) + self.router.post("/instances/add", + dependencies=[Depends(self.api_key_authenticate) + ])(self.add_instance_endpoint) + + async def validate_json_request(self, raw_request: Request): + content_type = raw_request.headers.get("content-type", "").lower() + if content_type != "application/json": + raise HTTPException( + status_code=415, + detail= + "Unsupported Media Type: Only 'application/json' is allowed", + ) + + def api_key_authenticate(self, x_api_key: str = Header(...)): + expected_api_key = os.environ.get("ADMIN_API_KEY") + if not expected_api_key: + logger.error("ADMIN_API_KEY is not set in the environment.") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Server configuration error.", + ) + if x_api_key != expected_api_key: + logger.warning("Unauthorized access attempt with API Key: %s", + x_api_key) + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="Forbidden: Invalid API Key.", + ) + + async def validate_instance(self, instance: str) -> bool: 
+ url = f"http://{instance}/v1/models" + try: + async with aiohttp.ClientSession( + timeout=AIOHTTP_TIMEOUT) as client: + logger.info("Verifying %s ...", instance) + async with client.get(url) as response: + if response.status == 200: + data = await response.json() + if "data" in data and len(data["data"]) > 0: + model_cur = data["data"][0].get("id", "") + if model_cur == self.model: + logger.info("Instance: %s could be added.", + instance) + return True + else: + logger.warning("Mismatch model %s : %s != %s", + instance, model_cur, self.model) + return False + else: + return False + else: + return False + except aiohttp.ClientError as e: + logger.error(str(e)) + return False + except Exception as e: + logger.error(str(e)) + return False + + async def add_instance_endpoint(self, request: Request): + try: + data = await request.json() + logger.warning(str(data)) + instance_type = data.get("type") + instance = data.get("instance") + if instance_type not in ["prefill", "decode"]: + raise HTTPException(status_code=400, + detail="Invalid instance type.") + if not instance or ":" not in instance: + raise HTTPException(status_code=400, + detail="Invalid instance format.") + host, port_str = instance.split(":") + try: + if host != "localhost": + ipaddress.ip_address(host) + port = int(port_str) + if not (0 < port < 65536): + raise HTTPException(status_code=400, + detail="Invalid port number.") + except Exception as e: + raise HTTPException(status_code=400, + detail="Invalid instance address.") from e + + is_valid = await self.validate_instance(instance) + if not is_valid: + raise HTTPException(status_code=400, + detail="Instance validation failed.") + + if instance_type == "prefill": + if instance not in self.prefill_instances: + self.prefill_instances.append(instance) + self.prefill_cycler = itertools.cycle( + self.prefill_instances) + else: + raise HTTPException(status_code=400, + detail="Instance already exists.") + else: + if instance not in self.decode_instances: + 
self.decode_instances.append(instance) + self.decode_cycler = itertools.cycle(self.decode_instances) + else: + raise HTTPException(status_code=400, + detail="Instance already exists.") + + return JSONResponse(content={ + "message": + f"Added {instance} to {instance_type}_instances." + }) + except HTTPException as http_exc: + raise http_exc + except Exception as e: + logger.error("Error in add_instance_endpoint: %s", str(e)) + raise HTTPException(status_code=500, detail=str(e)) from e + + async def forward_request(self, url, data, use_chunked=True): + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" + } + try: + async with session.post(url=url, json=data, + headers=headers) as response: + if 200 <= response.status < 300 or 400 <= response.status < 500: # noqa: E501 + if use_chunked: + async for chunk_bytes in response.content.iter_chunked( # noqa: E501 + 1024): + yield chunk_bytes + else: + content = await response.read() + yield content + else: + error_content = await response.text() + try: + error_content = json.loads(error_content) + except json.JSONDecodeError: + error_content = error_content + logger.error("Request failed with status %s: %s", + response.status, error_content) + raise HTTPException( + status_code=response.status, + detail= + f"Request failed with status {response.status}: " + f"{error_content}", + ) + except aiohttp.ClientError as e: + logger.error("ClientError occurred: %s", str(e)) + raise HTTPException( + status_code=502, + detail= + "Bad Gateway: Error communicating with upstream server.", + ) from e + except Exception as e: + logger.error("Unexpected error: %s", str(e)) + raise HTTPException(status_code=500, detail=str(e)) from e + + def schedule(self, cycler: itertools.cycle) -> str: + return self.scheduling_policy.schedule(cycler) + + async def get_status(self): + status = { + "prefill_node_count": len(self.prefill_instances), + 
"decode_node_count": len(self.decode_instances), + "prefill_nodes": self.prefill_instances, + "decode_nodes": self.decode_instances, + } + return status + + async def create_completion(self, raw_request: Request): + try: + request = await raw_request.json() + + kv_prepare_request = request.copy() + kv_prepare_request["max_tokens"] = 1 + + prefill_instance = self.schedule(self.prefill_cycler) + try: + async for _ in self.forward_request( + f"http://{prefill_instance}/v1/completions", + kv_prepare_request): + continue + except HTTPException as http_exc: + self.remove_instance_endpoint("prefill", prefill_instance) + raise http_exc + + # Perform kv recv and decoding stage + decode_instance = self.schedule(self.decode_cycler) + + try: + generator = self.forward_request( + f"http://{decode_instance}/v1/completions", request) + except HTTPException as http_exc: + self.remove_instance_endpoint("decode", decode_instance) + raise http_exc + response = StreamingResponse(generator) + return response + except Exception: + import sys + + exc_info = sys.exc_info() + print("Error occurred in disagg proxy server") + print(exc_info) + + async def create_chat_completion(self, raw_request: Request): + try: + request = await raw_request.json() + + # add params to request + kv_prepare_request = request.copy() + kv_prepare_request["max_tokens"] = 1 + + # prefill stage + prefill_instance = self.schedule(self.prefill_cycler) + try: + async for _ in self.forward_request( + f"http://{prefill_instance}/v1/chat/completions", + kv_prepare_request): + continue + except HTTPException as http_exc: + self.remove_instance_endpoint("prefill", prefill_instance) + raise http_exc + # Perform kv recv and decoding stage + decode_instance = self.schedule(self.decode_cycler) + + try: + generator = self.forward_request( + "http://" + decode_instance + "/v1/chat/completions", + request) + except HTTPException as http_exc: + self.remove_instance_endpoint("decode", decode_instance) + raise http_exc + response = 
StreamingResponse(content=generator)
+            return response
+        except Exception:
+            exc_info = sys.exc_info()
+            error_messages = [str(e) for e in exc_info if e]
+            print("Error occurred in disagg proxy server")
+            print(error_messages)
+            return StreamingResponse(content=iter(error_messages),
+                                     media_type="text/event-stream")
+
+    def remove_instance_endpoint(self, instance_type, instance):
+        if (instance_type == "decode" and instance in self.decode_instances):
+            self.decode_instances.remove(instance)
+            self.decode_cycler = itertools.cycle(self.decode_instances)
+        if (instance_type == "prefill"
+                and instance in self.prefill_instances):
+            self.prefill_instances.remove(instance)
+            self.prefill_cycler = itertools.cycle(self.prefill_instances)
+
+
+class RoundRobinSchedulingPolicy(SchedulingPolicy):
+
+    def __init__(self):
+        super().__init__()
+
+    def schedule(self, cycler: itertools.cycle) -> str:
+        return next(cycler)
+
+
+class ProxyServer:
+
+    def __init__(
+        self,
+        args: argparse.Namespace,
+        scheduling_policy: Optional[SchedulingPolicy] = None,
+        create_completion: Optional[Callable[[Request],
+                                             StreamingResponse]] = None,
+        create_chat_completion: Optional[Callable[[Request],
+                                                  StreamingResponse]] = None,
+    ):
+        self.validate_parsed_serve_args(args)
+        self.port = args.port
+        self.proxy_instance = Proxy(
+            prefill_instances=[] if args.prefill is None else args.prefill,
+            decode_instances=[] if args.decode is None else args.decode,
+            model=args.model,
+            scheduling_policy=(scheduling_policy if scheduling_policy
+                               is not None else RoundRobinSchedulingPolicy()),
+            custom_create_completion=create_completion,
+            custom_create_chat_completion=create_chat_completion,
+        )
+
+    def validate_parsed_serve_args(self, args: argparse.Namespace):
+        if not args.prefill:
+            raise ValueError("Please specify at least one prefill node.")
+        if not args.decode:
+            raise ValueError("Please specify at least one decode node.")
+        self.validate_instances(args.prefill)
+        self.validate_instances(args.decode)
+ self.verify_model_config(args.prefill, args.model) + self.verify_model_config(args.decode, args.model) + + def validate_instances(self, instances: list): + for instance in instances: + if len(instance.split(":")) != 2: + raise ValueError(f"Invalid instance format: {instance}") + host, port = instance.split(":") + try: + if host != "localhost": + ipaddress.ip_address(host) + port = int(port) + if not (0 < port < 65536): + raise ValueError( + f"Invalid port number in instance: {instance}") + except Exception as e: + raise ValueError( + f"Invalid instance {instance}: {str(e)}") from e + + def verify_model_config(self, instances: list, model: str) -> None: + model_suffix = model.split("/")[-1] + for instance in instances: + try: + response = requests.get(f"http://{instance}/v1/models") + if response.status_code == 200: + model_cur = response.json()["data"][0]["id"] + model_cur_suffix = model_cur.split("/")[-1] + if model_cur_suffix != model_suffix: + raise ValueError( + f"{instance} serves a different model: " + f"{model_cur} != {model}") + else: + raise ValueError(f"Cannot get model id from {instance}!") + except requests.RequestException as e: + raise ValueError( + f"Error communicating with {instance}: {str(e)}") from e + + def run_server(self): + app = FastAPI() + app.include_router(self.proxy_instance.router) + config = uvicorn.Config(app, port=self.port, loop="uvloop") + server = uvicorn.Server(config) + server.run() + + +if __name__ == "__main__": + # Todo: allow more config + parser = argparse.ArgumentParser("vLLM disaggregated proxy server.") + parser.add_argument("--model", + "-m", + type=str, + required=True, + help="Model name") + + parser.add_argument( + "--prefill", + "-p", + type=str, + nargs="+", + help="List of prefill node URLs (host:port)", + ) + + parser.add_argument( + "--decode", + "-d", + type=str, + nargs="+", + help="List of decode node URLs (host:port)", + ) + + parser.add_argument( + "--port", + type=int, + default=8000, + help="Server port 
number", + ) + args = parser.parse_args() + proxy_server = ProxyServer(args=args) + proxy_server.run_server() diff --git a/examples/online_serving/gradio_openai_chatbot_webserver.py b/examples/online_serving/gradio_openai_chatbot_webserver.py index ee01e1eae6281..314f1c5b73951 100644 --- a/examples/online_serving/gradio_openai_chatbot_webserver.py +++ b/examples/online_serving/gradio_openai_chatbot_webserver.py @@ -1,52 +1,32 @@ # SPDX-License-Identifier: Apache-2.0 +"""Example for starting a Gradio OpenAI Chatbot Webserver +Start vLLM API server: + vllm serve meta-llama/Llama-2-7b-chat-hf +Start Gradio OpenAI Chatbot Webserver: + python examples/online_serving/gradio_openai_chatbot_webserver.py \ + -m meta-llama/Llama-2-7b-chat-hf + +Note that `pip install --upgrade gradio` is needed to run this example. +More details: https://github.com/gradio-app/gradio + +If your antivirus software blocks the download of frpc for gradio, +you can install it manually by following these steps: + +1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64 +2. Rename the downloaded file to: frpc_linux_amd64_v0.3 +3. 
Move the file to this location: /home/user/.cache/huggingface/gradio/frpc +""" import argparse import gradio as gr from openai import OpenAI -# Argument parser setup -parser = argparse.ArgumentParser( - description='Chatbot Interface with Customizable Parameters') -parser.add_argument('--model-url', - type=str, - default='http://localhost:8000/v1', - help='Model URL') -parser.add_argument('-m', - '--model', - type=str, - required=True, - help='Model name for the chatbot') -parser.add_argument('--temp', - type=float, - default=0.8, - help='Temperature for text generation') -parser.add_argument('--stop-token-ids', - type=str, - default='', - help='Comma-separated stop token IDs') -parser.add_argument("--host", type=str, default=None) -parser.add_argument("--port", type=int, default=8001) -# Parse the arguments -args = parser.parse_args() - -# Set OpenAI's API key and API base to use vLLM's API server. -openai_api_key = "EMPTY" -openai_api_base = args.model_url - -# Create an OpenAI client to interact with the API server -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) - - -def predict(message, history): - # Convert chat history to OpenAI format +def format_history_to_openai(history): history_openai_format = [{ "role": "system", - "content": "You are a great ai assistant." + "content": "You are a great AI assistant." 
}] for human, assistant in history: history_openai_format.append({"role": "user", "content": human}) @@ -54,31 +34,92 @@ def predict(message, history): "role": "assistant", "content": assistant }) + return history_openai_format + + +def predict(message, history, client, model_name, temp, stop_token_ids): + # Format history to OpenAI chat format + history_openai_format = format_history_to_openai(history) history_openai_format.append({"role": "user", "content": message}) - # Create a chat completion request and send it to the API server + # Send request to OpenAI API (vLLM server) stream = client.chat.completions.create( - model=args.model, # Model name to use - messages=history_openai_format, # Chat history - temperature=args.temp, # Temperature for text generation - stream=True, # Stream response + model=model_name, + messages=history_openai_format, + temperature=temp, + stream=True, extra_body={ 'repetition_penalty': 1, - 'stop_token_ids': [ - int(id.strip()) for id in args.stop_token_ids.split(',') - if id.strip() - ] if args.stop_token_ids else [] + 'stop_token_ids': + [int(id.strip()) + for id in stop_token_ids.split(',')] if stop_token_ids else [] }) - # Read and return generated text from response stream - partial_message = "" + # Collect all chunks and concatenate them into a full message + full_message = "" for chunk in stream: - partial_message += (chunk.choices[0].delta.content or "") - yield partial_message + full_message += (chunk.choices[0].delta.content or "") + + # Return the full message as a single response + return full_message -# Create and launch a chat interface with Gradio -gr.ChatInterface(predict).queue().launch(server_name=args.host, - server_port=args.port, - share=True) +def parse_args(): + parser = argparse.ArgumentParser( + description='Chatbot Interface with Customizable Parameters') + parser.add_argument('--model-url', + type=str, + default='http://localhost:8000/v1', + help='Model URL') + parser.add_argument('-m', + '--model', + 
type=str, + required=True, + help='Model name for the chatbot') + parser.add_argument('--temp', + type=float, + default=0.8, + help='Temperature for text generation') + parser.add_argument('--stop-token-ids', + type=str, + default='', + help='Comma-separated stop token IDs') + parser.add_argument("--host", type=str, default=None) + parser.add_argument("--port", type=int, default=8001) + return parser.parse_args() + + +def build_gradio_interface(client, model_name, temp, stop_token_ids): + + def chat_predict(message, history): + return predict(message, history, client, model_name, temp, + stop_token_ids) + + return gr.ChatInterface(fn=chat_predict, + title="Chatbot Interface", + description="A simple chatbot powered by vLLM") + + +def main(): + # Parse the arguments + args = parse_args() + + # Set OpenAI's API key and API base to use vLLM's API server + openai_api_key = "EMPTY" + openai_api_base = args.model_url + + # Create an OpenAI client + client = OpenAI(api_key=openai_api_key, base_url=openai_api_base) + + # Define the Gradio chatbot interface using the predict function + gradio_interface = build_gradio_interface(client, args.model, args.temp, + args.stop_token_ids) + + gradio_interface.queue().launch(server_name=args.host, + server_port=args.port, + share=True) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/gradio_webserver.py b/examples/online_serving/gradio_webserver.py index 85a9119c6aa2f..2e7c2a0c5838c 100644 --- a/examples/online_serving/gradio_webserver.py +++ b/examples/online_serving/gradio_webserver.py @@ -1,5 +1,22 @@ # SPDX-License-Identifier: Apache-2.0 +"""Example for starting a Gradio Webserver +Start vLLM API server: + python -m vllm.entrypoints.api_server \ + --model meta-llama/Llama-2-7b-chat-hf +Start Webserver: + python examples/online_serving/gradio_webserver.py + +Note that `pip install --upgrade gradio` is needed to run this example. 
+More details: https://github.com/gradio-app/gradio + +If your antivirus software blocks the download of frpc for gradio, +you can install it manually by following these steps: + +1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64 +2. Rename the downloaded file to: frpc_linux_amd64_v0.3 +3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc +""" import argparse import json @@ -39,16 +56,23 @@ def build_demo(): return demo -if __name__ == "__main__": +def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default=None) parser.add_argument("--port", type=int, default=8001) parser.add_argument("--model-url", type=str, default="http://localhost:8000/generate") - args = parser.parse_args() + return parser.parse_args() + +def main(args): demo = build_demo() demo.queue().launch(server_name=args.host, server_port=args.port, share=True) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/online_serving/jinaai_rerank_client.py b/examples/online_serving/jinaai_rerank_client.py index 3e760e1717883..3076bba765ce5 100644 --- a/examples/online_serving/jinaai_rerank_client.py +++ b/examples/online_serving/jinaai_rerank_client.py @@ -23,12 +23,19 @@ data = { "The capital of France is Paris.", "Horses and cows are both animals" ] } -response = requests.post(url, headers=headers, json=data) -# Check the response -if response.status_code == 200: - print("Request successful!") - print(json.dumps(response.json(), indent=2)) -else: - print(f"Request failed with status code: {response.status_code}") - print(response.text) + +def main(): + response = requests.post(url, headers=headers, json=data) + + # Check the response + if response.status_code == 200: + print("Request successful!") + print(json.dumps(response.json(), indent=2)) + else: + print(f"Request failed with status code: {response.status_code}") + print(response.text) + + +if __name__ == 
"__main__": + main() diff --git a/examples/online_serving/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py index a815620411309..74e0c045d6214 100644 --- a/examples/online_serving/openai_chat_completion_client.py +++ b/examples/online_serving/openai_chat_completion_client.py @@ -1,38 +1,49 @@ # SPDX-License-Identifier: Apache-2.0 - +"""Example Python client for OpenAI Chat Completion using vLLM API server +NOTE: start a supported chat completion model server with `vllm serve`, e.g. + vllm serve meta-llama/Llama-2-7b-chat-hf +""" from openai import OpenAI # Modify OpenAI's API key and API base to use vLLM's API server. openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") - api_key=openai_api_key, - base_url=openai_api_base, -) +messages = [{ + "role": "system", + "content": "You are a helpful assistant." +}, { + "role": "user", + "content": "Who won the world series in 2020?" +}, { + "role": "assistant", + "content": "The Los Angeles Dodgers won the World Series in 2020." +}, { + "role": "user", + "content": "Where was it played?" +}] -models = client.models.list() -model = models.data[0].id -chat_completion = client.chat.completions.create( - messages=[{ - "role": "system", - "content": "You are a helpful assistant." - }, { - "role": "user", - "content": "Who won the world series in 2020?" - }, { - "role": - "assistant", - "content": - "The Los Angeles Dodgers won the World Series in 2020." - }, { - "role": "user", - "content": "Where was it played?" 
- }], - model=model, -) +def main(): + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) -print("Chat completion results:") -print(chat_completion) + models = client.models.list() + model = models.data[0].id + + chat_completion = client.chat.completions.create( + messages=messages, + model=model, + ) + + print("-" * 50) + print("Chat completion results:") + print(chat_completion) + print("-" * 50) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py index ecfcf05a90d16..70db4d95e6494 100644 --- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -9,7 +9,7 @@ vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja (multi-image inference with Phi-3.5-vision-instruct) vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ - --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 + --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}' (audio inference with Ultravox) vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b --max-model-len 4096 @@ -303,12 +303,7 @@ example_function_map = { } -def main(args) -> None: - chat_type = args.chat_type - example_function_map[chat_type]() - - -if __name__ == "__main__": +def parse_args(): parser = FlexibleArgumentParser( description='Demo on using OpenAI client for online serving with ' 'multimodal language models served with vLLM.') @@ -318,5 +313,14 @@ if __name__ == "__main__": default="single-image", choices=list(example_function_map.keys()), help='Conversation type with multimodal data.') - args = parser.parse_args() + return parser.parse_args() + + +def main(args) -> None: + chat_type = args.chat_type + example_function_map[chat_type]() + + +if __name__ 
== "__main__": + args = parse_args() main(args) diff --git a/examples/online_serving/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py index 416fb61ca8bb5..c25203860ff39 100644 --- a/examples/online_serving/openai_chat_completion_client_with_tools.py +++ b/examples/online_serving/openai_chat_completion_client_with_tools.py @@ -17,6 +17,7 @@ vllm serve --model NousResearch/Hermes-2-Pro-Llama-3-8B \ --enable-auto-tool-choice --tool-call-parser hermes """ import json +from typing import Any from openai import OpenAI @@ -24,15 +25,6 @@ from openai import OpenAI openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") - api_key=openai_api_key, - base_url=openai_api_base, -) - -models = client.models.list() -model = models.data[0].id - tools = [{ "type": "function", "function": { @@ -78,86 +70,123 @@ messages = [{ "Can you tell me what the temperate will be in Dallas, in fahrenheit?" 
}] -chat_completion = client.chat.completions.create(messages=messages, - model=model, - tools=tools) -print("Chat completion results:") -print(chat_completion) -print("\n\n") - -tool_calls_stream = client.chat.completions.create(messages=messages, - model=model, - tools=tools, - stream=True) - -chunks = [] -for chunk in tool_calls_stream: - chunks.append(chunk) - if chunk.choices[0].delta.tool_calls: - print(chunk.choices[0].delta.tool_calls[0]) - else: - print(chunk.choices[0].delta) - -arguments = [] -tool_call_idx = -1 -for chunk in chunks: - - if chunk.choices[0].delta.tool_calls: - tool_call = chunk.choices[0].delta.tool_calls[0] - - if tool_call.index != tool_call_idx: - if tool_call_idx >= 0: - print( - f"streamed tool call arguments: {arguments[tool_call_idx]}" - ) - tool_call_idx = chunk.choices[0].delta.tool_calls[0].index - arguments.append("") - if tool_call.id: - print(f"streamed tool call id: {tool_call.id} ") - - if tool_call.function: - if tool_call.function.name: - print(f"streamed tool call name: {tool_call.function.name}") - - if tool_call.function.arguments: - arguments[tool_call_idx] += tool_call.function.arguments - -if len(arguments): - print(f"streamed tool call arguments: {arguments[-1]}") - -print("\n\n") - -messages.append({ - "role": "assistant", - "tool_calls": chat_completion.choices[0].message.tool_calls -}) - - -# Now, simulate a tool call def get_current_weather(city: str, state: str, unit: 'str'): return ("The weather in Dallas, Texas is 85 degrees fahrenheit. 
It is " "partly cloudly, with highs in the 90's.") -available_tools = {"get_current_weather": get_current_weather} +def handle_tool_calls_stream( + client: OpenAI, + messages: list[dict[str, str]], + model: str, + tools: list[dict[str, Any]], +) -> list[Any]: + tool_calls_stream = client.chat.completions.create(messages=messages, + model=model, + tools=tools, + stream=True) + chunks = [] + print("chunks: ") + for chunk in tool_calls_stream: + chunks.append(chunk) + if chunk.choices[0].delta.tool_calls: + print(chunk.choices[0].delta.tool_calls[0]) + else: + print(chunk.choices[0].delta) + return chunks -completion_tool_calls = chat_completion.choices[0].message.tool_calls -for call in completion_tool_calls: - tool_to_call = available_tools[call.function.name] - args = json.loads(call.function.arguments) - result = tool_to_call(**args) - print(result) + +def handle_tool_calls_arguments(chunks: list[Any]) -> list[str]: + arguments = [] + tool_call_idx = -1 + print("arguments: ") + for chunk in chunks: + if chunk.choices[0].delta.tool_calls: + tool_call = chunk.choices[0].delta.tool_calls[0] + if tool_call.index != tool_call_idx: + if tool_call_idx >= 0: + print(f"streamed tool call arguments: " + f"{arguments[tool_call_idx]}") + tool_call_idx = chunk.choices[0].delta.tool_calls[0].index + arguments.append("") + if tool_call.id: + print(f"streamed tool call id: {tool_call.id} ") + + if tool_call.function: + if tool_call.function.name: + print( + f"streamed tool call name: {tool_call.function.name}") + + if tool_call.function.arguments: + arguments[tool_call_idx] += tool_call.function.arguments + + return arguments + + +def main(): + # Initialize OpenAI client + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) + + # Get available models and select one + models = client.models.list() + model = models.data[0].id + + chat_completion = client.chat.completions.create(messages=messages, + model=model, 
+ tools=tools) + + print("-" * 70) + print("Chat completion results:") + print(chat_completion) + print("-" * 70) + + # Stream tool calls + chunks = handle_tool_calls_stream(client, messages, model, tools) + print("-" * 70) + + # Handle arguments from streamed tool calls + arguments = handle_tool_calls_arguments(chunks) + + if len(arguments): + print(f"streamed tool call arguments: {arguments[-1]}\n") + + print("-" * 70) + + # Add tool call results to the conversation messages.append({ - "role": "tool", - "content": result, - "tool_call_id": call.id, - "name": call.function.name + "role": "assistant", + "tool_calls": chat_completion.choices[0].message.tool_calls }) -chat_completion_2 = client.chat.completions.create(messages=messages, - model=model, - tools=tools, - stream=False) -print("\n\n") -print(chat_completion_2) + # Now, simulate a tool call + available_tools = {"get_current_weather": get_current_weather} + + completion_tool_calls = chat_completion.choices[0].message.tool_calls + for call in completion_tool_calls: + tool_to_call = available_tools[call.function.name] + args = json.loads(call.function.arguments) + result = tool_to_call(**args) + print("tool_to_call result: ", result) + messages.append({ + "role": "tool", + "content": result, + "tool_call_id": call.id, + "name": call.function.name + }) + + chat_completion_2 = client.chat.completions.create(messages=messages, + model=model, + tools=tools, + stream=False) + print("Chat completion2 results:") + print(chat_completion_2) + print("-" * 70) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_chat_completion_client_with_tools_required.py b/examples/online_serving/openai_chat_completion_client_with_tools_required.py new file mode 100644 index 0000000000000..97d900bb75f1a --- /dev/null +++ b/examples/online_serving/openai_chat_completion_client_with_tools_required.py @@ -0,0 +1,142 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +To run this example, you can start the 
vLLM server +without any specific flags: + +```bash +VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \ + --guided-decoding-backend outlines +``` + +This example demonstrates how to generate chat completions +using the OpenAI Python client library. +""" + +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": + "string", + "description": + "The city to find the weather for" + ", e.g. 'San Francisco'", + }, + "state": { + "type": + "string", + "description": + "the two-letter abbreviation for the state that the " + "city is in, e.g. 'CA' which would mean 'California'", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["city", "state", "unit"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_forecast", + "description": "Get the weather forecast for a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": + "string", + "description": + "The city to get the forecast for, e.g. 'New York'", + }, + "state": { + "type": + "string", + "description": + "The two-letter abbreviation for the state, e.g. 'NY'", + }, + "days": { + "type": + "integer", + "description": + "Number of days to get the forecast for (1-7)", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["city", "state", "days", "unit"], + }, + }, + }, +] + +messages = [ + { + "role": "user", + "content": "Hi! How are you doing today?" + }, + { + "role": "assistant", + "content": "I'm doing well! 
How can I help you?" + }, + { + "role": + "user", + "content": + "Can you tell me what the current weather is in Dallas \ + and the forecast for the next 5 days, in fahrenheit?", + }, +] + + +def main(): + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) + + models = client.models.list() + model = models.data[0].id + + chat_completion = client.chat.completions.create( + messages=messages, + model=model, + tools=tools, + tool_choice="required", + stream=True # Enable streaming response + ) + + for chunk in chat_completion: + if chunk.choices and chunk.choices[0].delta.tool_calls: + print(chunk.choices[0].delta.tool_calls) + + chat_completion = client.chat.completions.create(messages=messages, + model=model, + tools=tools, + tool_choice="required") + + print(chat_completion.choices[0].message.tool_calls) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py index 9e7a69c6c87d6..8c6470aa3dd41 100644 --- a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py @@ -31,14 +31,6 @@ available_tools = {"get_current_weather": get_current_weather} openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) - -models = client.models.list() -model = models.data[0].id - tools = [{ "type": "function", "function": { @@ -109,69 +101,87 @@ def extract_reasoning_and_calls(chunks: list): return reasoning_content, arguments, function_names -print("---------Full Generate With Automatic Function Calling-------------") -tool_calls = client.chat.completions.create(messages=messages, - model=model, - tools=tools) -print(f"reasoning_content: 
{tool_calls.choices[0].message.reasoning_content}") -print(f"function name: " - f"{tool_calls.choices[0].message.tool_calls[0].function.name}") -print(f"function arguments: " - f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}") +def main(): + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) -print("----------Stream Generate With Automatic Function Calling-----------") -tool_calls_stream = client.chat.completions.create(messages=messages, - model=model, - tools=tools, - stream=True) -chunks = [] -for chunk in tool_calls_stream: - chunks.append(chunk) + models = client.models.list() + model = models.data[0].id -reasoning_content, arguments, function_names = extract_reasoning_and_calls( - chunks) + print( + "---------Full Generate With Automatic Function Calling-------------") + tool_calls = client.chat.completions.create(messages=messages, + model=model, + tools=tools) + print( + f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}" + ) + print(f"function name: " + f"{tool_calls.choices[0].message.tool_calls[0].function.name}") + print(f"function arguments: " + f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}") -print(f"reasoning_content: {reasoning_content}") -print(f"function name: {function_names[0]}") -print(f"function arguments: {arguments[0]}") + print( + "----------Stream Generate With Automatic Function Calling-----------") + tool_calls_stream = client.chat.completions.create(messages=messages, + model=model, + tools=tools, + stream=True) -print("----------Full Generate With Named Function Calling-----------------") -tool_calls = client.chat.completions.create(messages=messages, - model=model, - tools=tools, - tool_choice={ - "type": "function", - "function": { - "name": - "get_current_weather" - } - }) + chunks = list(tool_calls_stream) -tool_call = tool_calls.choices[0].message.tool_calls[0].function -print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}") 
-print(f"function name: {tool_call.name}") -print(f"function arguments: {tool_call.arguments}") -print("----------Stream Generate With Named Function Calling--------------") + reasoning_content, arguments, function_names = extract_reasoning_and_calls( + chunks) -tool_calls_stream = client.chat.completions.create( - messages=messages, - model=model, - tools=tools, - tool_choice={ - "type": "function", - "function": { - "name": "get_current_weather" - } - }, - stream=True) + print(f"reasoning_content: {reasoning_content}") + print(f"function name: {function_names[0]}") + print(f"function arguments: {arguments[0]}") -chunks = [] -for chunk in tool_calls_stream: - chunks.append(chunk) + print( + "----------Full Generate With Named Function Calling-----------------") + tool_calls = client.chat.completions.create(messages=messages, + model=model, + tools=tools, + tool_choice={ + "type": "function", + "function": { + "name": + "get_current_weather" + } + }) -reasoning_content, arguments, function_names = extract_reasoning_and_calls( - chunks) -print(f"reasoning_content: {reasoning_content}") -print(f"function name: {function_names[0]}") -print(f"function arguments: {arguments[0]}") -print("\n\n") + tool_call = tool_calls.choices[0].message.tool_calls[0].function + print( + f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}" + ) + print(f"function name: {tool_call.name}") + print(f"function arguments: {tool_call.arguments}") + print( + "----------Stream Generate With Named Function Calling--------------") + + tool_calls_stream = client.chat.completions.create( + messages=messages, + model=model, + tools=tools, + tool_choice={ + "type": "function", + "function": { + "name": "get_current_weather" + } + }, + stream=True) + + chunks = list(tool_calls_stream) + + reasoning_content, arguments, function_names = extract_reasoning_and_calls( + chunks) + print(f"reasoning_content: {reasoning_content}") + print(f"function name: {function_names[0]}") + 
print(f"function arguments: {arguments[0]}") + print("\n\n") + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py index b5dbed1205d35..6f5f7b5fa20ba 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning.py @@ -3,8 +3,8 @@ An example shows how to generate chat completions from reasoning models like DeepSeekR1. -To run this example, you need to start the vLLM server with the reasoning -parser: +To run this example, you need to start the vLLM server +with the reasoning parser: ```bash vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ @@ -21,34 +21,44 @@ from openai import OpenAI openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) -models = client.models.list() -model = models.data[0].id +def main(): + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) -# Round 1 -messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] -response = client.chat.completions.create(model=model, messages=messages) + models = client.models.list() + model = models.data[0].id -reasoning_content = response.choices[0].message.reasoning_content -content = response.choices[0].message.content + # Round 1 + messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] + # ruff: noqa: E501 + # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` + response = client.chat.completions.create(model=model, messages=messages) -print("reasoning_content for Round 1:", reasoning_content) -print("content for Round 1:", content) + reasoning_content = response.choices[0].message.reasoning_content + content = response.choices[0].message.content -# Round 2 -messages.append({"role": "assistant", "content": content}) 
-messages.append({ - "role": "user", - "content": "How many Rs are there in the word 'strawberry'?", -}) -response = client.chat.completions.create(model=model, messages=messages) + print("reasoning_content for Round 1:", reasoning_content) + print("content for Round 1:", content) -reasoning_content = response.choices[0].message.reasoning_content -content = response.choices[0].message.content + # Round 2 + messages.append({"role": "assistant", "content": content}) + messages.append({ + "role": + "user", + "content": + "How many Rs are there in the word 'strawberry'?", + }) + response = client.chat.completions.create(model=model, messages=messages) -print("reasoning_content for Round 2:", reasoning_content) -print("content for Round 2:", content) + reasoning_content = response.choices[0].message.reasoning_content + content = response.choices[0].message.content + + print("reasoning_content for Round 2:", reasoning_content) + print("content for Round 2:", content) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py index fe4332576d438..90481cdc0fb79 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py @@ -3,7 +3,7 @@ An example shows how to generate chat completions from reasoning models like DeepSeekR1. 
-To run this example, you need to start the vLLM server with the reasoning +To run this example, you need to start the vLLM server with the reasoning parser: ```bash @@ -29,40 +29,49 @@ from openai import OpenAI openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) - -models = client.models.list() -model = models.data[0].id - messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] -stream = client.chat.completions.create(model=model, - messages=messages, - stream=True) -print("client: Start streaming chat completions...") -printed_reasoning_content = False -printed_content = False -for chunk in stream: - reasoning_content = None - content = None - # Check the content is reasoning_content or content - if hasattr(chunk.choices[0].delta, "reasoning_content"): - reasoning_content = chunk.choices[0].delta.reasoning_content - elif hasattr(chunk.choices[0].delta, "content"): - content = chunk.choices[0].delta.content +def main(): + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) - if reasoning_content is not None: - if not printed_reasoning_content: - printed_reasoning_content = True - print("reasoning_content:", end="", flush=True) - print(reasoning_content, end="", flush=True) - elif content is not None: - if not printed_content: - printed_content = True - print("\ncontent:", end="", flush=True) - # Extract and print the content - print(content, end="", flush=True) + models = client.models.list() + model = models.data[0].id + + # ruff: noqa: E501 + # For granite: add: `extra_body={"chat_template_kwargs": {"thinking": True}}` + stream = client.chat.completions.create(model=model, + messages=messages, + stream=True) + + print("client: Start streaming chat completions...") + printed_reasoning_content = False + printed_content = False + + for chunk in stream: + reasoning_content = None + content = None + # Check the content is 
reasoning_content or content + if hasattr(chunk.choices[0].delta, "reasoning_content"): + reasoning_content = chunk.choices[0].delta.reasoning_content + elif hasattr(chunk.choices[0].delta, "content"): + content = chunk.choices[0].delta.content + + if reasoning_content is not None: + if not printed_reasoning_content: + printed_reasoning_content = True + print("reasoning_content:", end="", flush=True) + print(reasoning_content, end="", flush=True) + elif content is not None: + if not printed_content: + printed_content = True + print("\ncontent:", end="", flush=True) + # Extract and print the content + print(content, end="", flush=True) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py index 2c63c5ec370e3..c850b5aa2f800 100644 --- a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py @@ -98,7 +98,7 @@ def dse_qwen2_vl(inp: dict): print("Embedding output:", response_json["data"][0]["embedding"]) -if __name__ == '__main__': +def parse_args(): parser = argparse.ArgumentParser( "Script to call a specified VLM through the API. 
Make sure to serve " "the model with --task embed before running this.") @@ -107,8 +107,10 @@ if __name__ == '__main__': choices=["vlm2vec", "dse_qwen2_vl"], required=True, help="Which model to call.") - args = parser.parse_args() + return parser.parse_args() + +def main(args): if args.model == "vlm2vec": vlm2vec() elif args.model == "dse_qwen2_vl": @@ -120,3 +122,8 @@ if __name__ == '__main__': "type": "text", "content": "What is the weather like today?", }) + + +if __name__ == '__main__': + args = parse_args() + main(args) diff --git a/examples/online_serving/openai_completion_client.py b/examples/online_serving/openai_completion_client.py index 06b93d7d19315..6ab7619bff192 100644 --- a/examples/online_serving/openai_completion_client.py +++ b/examples/online_serving/openai_completion_client.py @@ -6,28 +6,36 @@ from openai import OpenAI openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") - api_key=openai_api_key, - base_url=openai_api_base, -) -models = client.models.list() -model = models.data[0].id +def main(): + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) -# Completion API -stream = False -completion = client.completions.create( - model=model, - prompt="A robot may not injure a human being", - echo=False, - n=2, - stream=stream, - logprobs=3) + models = client.models.list() + model = models.data[0].id -print("Completion results:") -if stream: - for c in completion: - print(c) -else: - print(completion) + # Completion API + stream = False + completion = client.completions.create( + model=model, + prompt="A robot may not injure a human being", + echo=False, + n=2, + stream=stream, + logprobs=3) + + print("-" * 50) + print("Completion results:") + if stream: + for c in completion: + print(c) + else: + print(completion) + print("-" * 50) + + +if __name__ == "__main__": + main() diff --git 
a/examples/online_serving/openai_cross_encoder_score.py b/examples/online_serving/openai_cross_encoder_score.py index 67c5fc91bc65b..20a64ddb21413 100644 --- a/examples/online_serving/openai_cross_encoder_score.py +++ b/examples/online_serving/openai_cross_encoder_score.py @@ -16,13 +16,15 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response: return response -if __name__ == "__main__": +def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) parser.add_argument("--model", type=str, default="BAAI/bge-reranker-v2-m3") + return parser.parse_args() - args = parser.parse_args() + +def main(args): api_url = f"http://{args.host}:{args.port}/score" model_name = args.model @@ -30,9 +32,9 @@ if __name__ == "__main__": text_2 = "The capital of Brazil is Brasilia." prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} score_response = post_http_request(prompt=prompt, api_url=api_url) - print("Prompt when text_1 and text_2 are both strings:") + print("\nPrompt when text_1 and text_2 are both strings:") pprint.pprint(prompt) - print("Score Response:") + print("\nScore Response:") pprint.pprint(score_response.json()) text_1 = "What is the capital of France?" 
@@ -41,9 +43,9 @@ if __name__ == "__main__": ] prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} score_response = post_http_request(prompt=prompt, api_url=api_url) - print("Prompt when text_1 is string and text_2 is a list:") + print("\nPrompt when text_1 is string and text_2 is a list:") pprint.pprint(prompt) - print("Score Response:") + print("\nScore Response:") pprint.pprint(score_response.json()) text_1 = [ @@ -54,7 +56,12 @@ if __name__ == "__main__": ] prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} score_response = post_http_request(prompt=prompt, api_url=api_url) - print("Prompt when text_1 and text_2 are both lists:") + print("\nPrompt when text_1 and text_2 are both lists:") pprint.pprint(prompt) - print("Score Response:") + print("\nScore Response:") pprint.pprint(score_response.json()) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/online_serving/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py index b7c5651e3bab2..bc217f7ca7a0b 100644 --- a/examples/online_serving/openai_embedding_client.py +++ b/examples/online_serving/openai_embedding_client.py @@ -6,22 +6,29 @@ from openai import OpenAI openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") - api_key=openai_api_key, - base_url=openai_api_base, -) -models = client.models.list() -model = models.data[0].id +def main(): + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) -responses = client.embeddings.create( - input=[ - "Hello my name is", - "The best thing about vLLM is that it supports many different models" - ], - model=model, -) + models = client.models.list() + model = models.data[0].id -for data in responses.data: - print(data.embedding) # List of float of len 4096 + responses = client.embeddings.create( + # ruff: noqa: E501 + input=[ 
+ "Hello my name is", + "The best thing about vLLM is that it supports many different models" + ], + model=model, + ) + + for data in responses.data: + print(data.embedding) # List of float of len 4096 + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_embedding_matryoshka_fy.py b/examples/online_serving/openai_embedding_matryoshka_fy.py new file mode 100644 index 0000000000000..27ab8cb64037b --- /dev/null +++ b/examples/online_serving/openai_embedding_matryoshka_fy.py @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Example Python client for embedding API dimensions using vLLM API server +NOTE: + start a supported Matryoshka Embeddings model server with `vllm serve`, e.g. + vllm serve jinaai/jina-embeddings-v3 --trust-remote-code +""" + +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + + +def main(): + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) + + models = client.models.list() + model = models.data[0].id + + responses = client.embeddings.create( + input=["Follow the white rabbit."], + model=model, + dimensions=1, + ) + + for data in responses.data: + print(data.embedding) # List of float of len 1 + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_pooling_client.py b/examples/online_serving/openai_pooling_client.py index e17f9c5efd659..abcfe27c27699 100644 --- a/examples/online_serving/openai_pooling_client.py +++ b/examples/online_serving/openai_pooling_client.py @@ -17,7 +17,7 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response: return response -if __name__ == "__main__": +def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) @@ -25,15 +25,20 @@ if 
__name__ == "__main__": type=str, default="jason9693/Qwen2.5-1.5B-apeach") - args = parser.parse_args() + return parser.parse_args() + + +def main(args): api_url = f"http://{args.host}:{args.port}/pooling" model_name = args.model # Input like Completions API prompt = {"model": model_name, "input": "vLLM is great!"} pooling_response = post_http_request(prompt=prompt, api_url=api_url) + print("-" * 50) print("Pooling Response:") pprint.pprint(pooling_response.json()) + print("-" * 50) # Input like Chat API prompt = { @@ -50,3 +55,9 @@ if __name__ == "__main__": pooling_response = post_http_request(prompt=prompt, api_url=api_url) print("Pooling Response:") pprint.pprint(pooling_response.json()) + print("-" * 50) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/online_serving/openai_transcription_client.py b/examples/online_serving/openai_transcription_client.py index 494e7c8ebe12a..5fcb7c5264162 100644 --- a/examples/online_serving/openai_transcription_client.py +++ b/examples/online_serving/openai_transcription_client.py @@ -23,10 +23,15 @@ def sync_openai(): with open(str(mary_had_lamb), "rb") as f: transcription = client.audio.transcriptions.create( file=f, - model="openai/whisper-small", + model="openai/whisper-large-v3", language="en", response_format="json", - temperature=0.0) + temperature=0.0, + # Additional sampling params not provided by OpenAI API. 
+ extra_body=dict( + seed=4419, + repetition_penalty=1.3, + )) print("transcription result:", transcription.text) diff --git a/examples/template_florence2.jinja b/examples/template_florence2.jinja new file mode 100644 index 0000000000000..d257aed6a85b0 --- /dev/null +++ b/examples/template_florence2.jinja @@ -0,0 +1,7 @@ +{%- for message in messages -%} + {%- if message['role'] == 'user' -%} + {{- message['content'] -}} + {%- elif message['role'] == 'assistant' -%} + {{- message['content'] -}} + {%- endif -%} +{%- endfor -%} diff --git a/examples/tool_chat_template_llama3.2_pythonic.jinja b/examples/tool_chat_template_llama3.2_pythonic.jinja index 8c38de6c6a907..e4ec2353b3509 100644 --- a/examples/tool_chat_template_llama3.2_pythonic.jinja +++ b/examples/tool_chat_template_llama3.2_pythonic.jinja @@ -76,7 +76,7 @@ {{- tool_call.name + '(' -}} {%- for param in tool_call.arguments %} {{- param + '=' -}} - {{- "%sr" | format(tool_call.arguments[param]) -}} + {{- "%s" | format(tool_call.arguments[param]) -}} {% if not loop.last %}, {% endif %} {%- endfor %} {{- ')' -}} diff --git a/examples/tool_chat_template_llama4_pythonic.jinja b/examples/tool_chat_template_llama4_pythonic.jinja new file mode 100644 index 0000000000000..bd18a35bdda93 --- /dev/null +++ b/examples/tool_chat_template_llama4_pythonic.jinja @@ -0,0 +1,139 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = false %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. 
#} +{%- if messages[0]['role'] == 'system' %} + {%- if messages[0]['content'] is string %} + {%- set system_message = messages[0]['content']|trim %} + {%- else %} + {%- set system_message = messages[0]['content'][0]['text']|trim %} + {%- endif %} + {%- set messages = messages[1:] %} +{%- else %} + {%- if tools is not none %} + {#- Add default tool system message when tools are provided #} + {%- set system_message = "You are a helpful assistant with tool calling " + "capabilities. Only reply with a tool call if the function exists in the " + "library provided by the user. If it doesn't exist, just reply directly in " + "natural language. When you receive a tool call response, use the output to " + "format an answer to the original user question." %} + {%- else %} + {%- set system_message = "" %} + {%- endif %} +{%- endif %} + +{#- System message if the user supplied one, or if tools are used (default tool system message) #} +{%- if system_message %} + {#- always use user provided system message to override default tool system message #} + {{- "<|header_start|>system<|header_end|>\n\n" }} + {{- system_message }} + {%- if tools is not none and not tools_in_user_message %} + {{- "Tools: You have access to the following tools. You might need to use one " + "or more function/tool calls to fulfill the task. 
\n" + "If none are needed, then proceed to the response.\n\n" + "Tool Call Syntax: You can call tools using the following syntax:\n" + "[func_name1(params_name1=params_value1, params_name2=params_value2, ...), ...]\n" + "Do not include anything else when calling the tools with the syntax above.\n\n" + "Here is a list of functions in JSON format that you can invoke.\n " }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {%- endif %} + {{- "<|eot|>" }} +{%- endif %} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and tools is not none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- if messages[0]['content'] is string %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- else %} + {%- set first_user_message = messages[0]['content'] | selectattr('type', 'equalto', 'text') | map(attribute='text') | map('trim') | join('\n') %} + {%- endif %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|header_start|>user<|header_end|>\n\n' -}} + {{- first_user_message}} + {{- "\nHere is a list of functions in JSON format that you can invoke:"}} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- "Should you decide to return the function call(s), put them in the format " + "of [func_name1(params_name1=params_value1, params_name2=params_value2, " + "...), ...]\nDo not include anything else when calling the tools with the " + "syntax above." 
}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }} + {%- if message['content'] is string %} + {{- message['content'] }} + {%- else %} + {%- for content in message['content'] %} + {%- if content['type'] == 'image' %} + {{- '<|image|>' }} + {%- elif content['type'] == 'text' %} + {{- content['text'] | trim }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- "<|eot|>" }} + {%- elif 'tool_calls' in message and message.tool_calls|length > 0 %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|header_start|>assistant<|header_end|>\n\n' -}} + {%- if message['content'] is string %} + {{- message['content'] }} + {%- else %} + {%- for content in message['content'] %} + {%- if content['type'] == 'image' %} + {{- '<|image|>' }} + {%- elif content['type'] == 'text' %} + {{- content['text'] }} + {%- endif %} + {%- endfor %} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- tool_call.name + '(' -}} + {%- for param in tool_call.arguments %} + {{- param + '=' -}} + {{- "%s" | format(tool_call.arguments[param]) -}} + {% if not loop.last %}, {% endif %} + {%- endfor %} + {{- ')' -}} + {% if not loop.last %}, {% endif %} + {%- endfor %} + {{- "<|eom|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|header_start|>ipython<|header_end|>\n\n" }} + {%- if message.content is string %} + {{- message.content | tojson }} + {%- else %} + {%- for content in message['content'] %} + {%- if content['type'] == 'text' %} + {{- content['text'] | tojson }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- "<|eom|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|header_start|>assistant<|header_end|>\n\n' }} +{%- endif %} diff --git 
a/examples/tool_chat_template_phi4_mini.jinja b/examples/tool_chat_template_phi4_mini.jinja new file mode 100644 index 0000000000000..36423b6c4240a --- /dev/null +++ b/examples/tool_chat_template_phi4_mini.jinja @@ -0,0 +1,60 @@ +{%- if messages %} + {%- if system_message or tools %} +<|system|> + +{%- if system_message %} +{{ system_message }} +{%- endif %} +In addition to plain text responses, you can choose to call one or more of the provided functions. + +Use the following rules to decide when to call a function: + * if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so + * if you need external information that can be obtained by calling one or more of the provided functions, generate function calls + +If you decide to call functions: + * prefix function calls with functools marker (no closing marker required) + * all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...] + * follow the provided JSON schema. Do not hallucinate arguments or values. Do not blindly copy values from the provided samples + * respect the argument type formatting. 
E.g., if the type is number and format is float, write value 7 as 7.0 + * make sure you pick the right functions that match the user intent + + +{%- if tools %} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %}<|end|> + {%- endif %} + + {%- for message in messages %} + {%- if message.role != "system" %} +<|{{ message.role }}|> + {%- if message.content and message.role == "tools" %} +{"result": {{ message.content }}} + {%- elif message.content %} +{{ message.content }} + {%- elif message.tool_calls %} + {%- for call in message.tool_calls %} +{"name": "{{ call.function.name }}", "arguments": {{ call.function.arguments }}} + {%- if not loop.last %},{% endif %} + {%- endfor %} + {%- endif %}<|end|> + {%- endif %} + {%- endfor %}<|assistant|> + +{%- else %} + {%- if system_message %} +<|system|> + +{{ system_message }}<|end|> + {%- endif %} + {%- if prompt %} +<|user|> + +{{ prompt }}<|end|> + {%- endif %}<|assistant|> + +{%- endif %} +{{ response }} +{%- if response %}<|user|>{% endif %} \ No newline at end of file diff --git a/examples/tool_chat_template_toolace.jinja b/examples/tool_chat_template_toolace.jinja index a9b3b7189dddf..da0f25cdcb337 100644 --- a/examples/tool_chat_template_toolace.jinja +++ b/examples/tool_chat_template_toolace.jinja @@ -44,7 +44,7 @@ {{- tool_call.name + '(' -}} {%- for param in tool_call.arguments %} {{- param + '=' -}} - {{- "%sr" | format(tool_call.arguments[param]) -}} + {{- "%s" | format(tool_call.arguments[param]) -}} {% if not loop.last %}, {% endif %} {%- endfor %} {{- ')' -}} diff --git a/format.sh b/format.sh index fb503ec4bbfcd..6ba93e0a19ba8 100755 --- a/format.sh +++ b/format.sh @@ -1,6 +1,6 @@ #!/bin/bash -echo "vLLM linting system has been moved from format.sh to pre-commit hook." +echo "vLLM linting system has been moved from format.sh to pre-commit hooks." 
echo "Please run 'pip install -r requirements/lint.txt', followed by" -echo "'pre-commit install --hook-type pre-commit --hook-type commit-msg' to install the pre-commit hook." -echo "Then linters will run automatically before each commit." +echo "'pre-commit install' to install the pre-commit hooks." +echo "Then linters will run automatically before each commit." \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 07616c858f1f3..167e975c70fdb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Scientific/Engineering :: Information Analysis", ] -requires-python = ">=3.9" +requires-python = ">=3.9,<3.13" dynamic = [ "version", "dependencies", "optional-dependencies"] [project.urls] diff --git a/python_only_dev.py b/python_only_dev.py deleted file mode 100644 index a303697b780a6..0000000000000 --- a/python_only_dev.py +++ /dev/null @@ -1,16 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -msg = """Old style python only build (without compilation) is deprecated, please check https://docs.vllm.ai/en/latest/getting_started/installation.html#python-only-build-without-compilation for the new way to do python only build (without compilation). - -TL;DR: - -VLLM_USE_PRECOMPILED=1 pip install -e . - -or - -export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl -pip install -e . -""" # noqa - -print(msg) diff --git a/requirements/common.txt b/requirements/common.txt index 14084b79121bb..33c4c3219f159 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -1,13 +1,14 @@ cachetools psutil sentencepiece # Required for LLaMA tokenizer. 
-numpy < 2.0.0 +numpy requests >= 2.26.0 tqdm blake3 py-cpuinfo -transformers >= 4.48.2 # Required for Bamba model and Transformers backend. -tokenizers >= 0.19.1 # Required for Llama 3. +transformers >= 4.51.1 +huggingface-hub[hf_xet] >= 0.30.0 # Required for Xet downloads. +tokenizers >= 0.21.1 # Required for fast incremental detokenization. protobuf # Required by LlamaTokenizer. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. aiohttp @@ -21,23 +22,28 @@ lm-format-enforcer >= 0.10.11, < 0.11 llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64" outlines == 0.1.11 lark == 1.2.2 -xgrammar == 0.1.16; platform_machine == "x86_64" or platform_machine == "aarch64" +xgrammar == 0.1.18; platform_machine == "x86_64" or platform_machine == "aarch64" typing_extensions >= 4.10 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 partial-json-parser # used for parsing partial JSON outputs -pyzmq +pyzmq >= 25.0.0 msgspec -gguf == 0.10.0 +gguf >= 0.13.0 importlib_metadata mistral_common[opencv] >= 1.5.4 +opencv-python-headless >= 4.11.0 # required for video IO pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. 
-compressed-tensors == 0.9.2 # required for compressed-tensors +compressed-tensors == 0.9.3 # required for compressed-tensors depyf==0.18.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py watchfiles # required for http server to monitor the updates of TLS files python-json-logger # Used by logging as per examples/other/logging_configuration.md scipy # Required for phi-4-multimodal-instruct ninja # Required for xgrammar, rocm, tpu, xpu +opentelemetry-sdk>=1.26.0,<1.27.0 # vllm.tracing +opentelemetry-api>=1.26.0,<1.27.0 # vllm.tracing +opentelemetry-exporter-otlp>=1.26.0,<1.27.0 # vllm.tracing +opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0 # vllm.tracing diff --git a/requirements/cpu.txt b/requirements/cpu.txt index fc09083781e6f..69f732c2417a1 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -12,6 +12,9 @@ torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x" torchaudio==2.6.0; platform_machine == "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch -torchvision; platform_machine != "ppc64le" and platform_machine != "s390x" +torchvision; platform_machine != "ppc64le" and platform_machine != "s390x" torchvision==0.21.0; platform_machine == "ppc64le" datasets # for benchmark scripts + +# cpu cannot use triton 3.3.0 +triton==3.2.0; platform_machine == "x86_64" diff --git a/requirements/cuda.txt b/requirements/cuda.txt index 702d4b0bb320c..cdc6ee75afbcd 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -1,10 +1,11 @@ # Common dependencies -r common.txt -numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding +numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. 
Required for N-gram speculative decoding +numba == 0.61.2; python_version > '3.9' # Dependencies for NVIDIA GPUs -ray[cgraph]>=2.43.0 # Ray Compiled Graph, required for pipeline parallelism in V1. +ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1. torch==2.6.0 torchaudio==2.6.0 # These must be updated alongside torch diff --git a/requirements/docs.txt b/requirements/docs.txt index 416ca503b36c0..99fb87def6dd2 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -18,6 +18,7 @@ transformers mistral_common >= 1.5.4 aiohttp starlette +scipy openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args diff --git a/requirements/hpu.txt b/requirements/hpu.txt index a61d72d04f409..830f6ef3f50cb 100644 --- a/requirements/hpu.txt +++ b/requirements/hpu.txt @@ -5,6 +5,7 @@ ray triton==3.1.0 pandas +numpy==1.26.4 tabulate setuptools>=61 setuptools-scm>=8 diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt new file mode 100644 index 0000000000000..20372a9b2ef16 --- /dev/null +++ b/requirements/nightly_torch_test.txt @@ -0,0 +1,28 @@ +# Dependency that able to run entrypoints test +# pytest and its extensions +pytest +pytest-asyncio +pytest-forked +pytest-mock +pytest-rerunfailures +pytest-shard +pytest-timeout + + +librosa # required by audio tests in entrypoints/openai +sentence-transformers +numba == 0.61.2; python_version > '3.9' +# testing utils +awscli +boto3 +botocore +datasets +ray >= 2.10.0 +peft +runai-model-streamer==0.11.0 +runai-model-streamer-s3==0.11.0 +tensorizer>=2.9.0 +lm-eval==0.4.8 +buildkite-test-collector==0.1.9 + +lm-eval[api]==0.4.8 # required for model evaluation test diff --git 
a/requirements/rocm-build.txt b/requirements/rocm-build.txt index 6af78da4993db..05de4ff168453 100644 --- a/requirements/rocm-build.txt +++ b/requirements/rocm-build.txt @@ -6,7 +6,8 @@ torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 -cmake>=3.26 +triton==3.2 +cmake>=3.26,<4 packaging setuptools>=61 setuptools-scm>=8 diff --git a/requirements/rocm.txt b/requirements/rocm.txt index 345c84b0f6cf2..4df92aab3749e 100644 --- a/requirements/rocm.txt +++ b/requirements/rocm.txt @@ -1,7 +1,8 @@ # Common dependencies -r common.txt -numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding +numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding +numba == 0.61.2; python_version > '3.9' # Dependencies for AMD GPUs awscli diff --git a/requirements/test.in b/requirements/test.in index 5c59bbd1ac7ae..3be580db0674c 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -5,11 +5,12 @@ pytest-forked pytest-asyncio pytest-rerunfailures pytest-shard +pytest-timeout # testing utils awscli backoff # required for phi4mm test -decord # required for video tests +blobfile # required for kimi-vl test einops # required for MPT, qwen-vl and Mamba httpx librosa # required for audio tests @@ -17,7 +18,7 @@ vector_quantize_pytorch # required for minicpmo_26 test vocos # required for minicpmo_26 test peft pqdm -ray[cgraph]>=2.43.0 # Ray Compiled Graph, required by pipeline parallelism tests +ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required by pipeline parallelism tests sentence-transformers # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests @@ -26,11 +27,17 @@ torch==2.6.0 torchaudio==2.6.0 torchvision==0.21.0 transformers_stream_generator # required for qwen-vl test +mamba_ssm # required for plamo2 test matplotlib # required for qwen-vl test mistral_common[opencv] >= 1.5.4 # required for pixtral test +num2words # required for 
smolvlm test +opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test -lm-eval[api]==0.4.4 # required for model evaluation test -transformers==4.48.2 +lm-eval[api]==0.4.8 # required for model evaluation test +transformers==4.51.1 +tokenizers==0.21.1 +huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads. +schemathesis>=3.39.15 # Required for openai schema test. # quantization bitsandbytes>=0.45.3 buildkite-test-collector==0.1.9 @@ -38,7 +45,9 @@ buildkite-test-collector==0.1.9 genai_perf==0.0.8 tritonclient==2.51.0 -numpy < 2.0.0 +numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding +numba == 0.61.2; python_version > '3.9' +numpy runai-model-streamer==0.11.0 runai-model-streamer-s3==0.11.0 fastsafetensors>=0.1.10 diff --git a/requirements/test.txt b/requirements/test.txt index b0ae479604a1e..6dcd4ff01460c 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -20,25 +20,35 @@ aiosignal==1.3.1 annotated-types==0.7.0 # via pydantic anyio==4.6.2.post1 - # via httpx + # via + # httpx + # starlette argcomplete==3.5.1 # via datamodel-code-generator +arrow==1.3.0 + # via isoduration attrs==24.2.0 # via # aiohttp + # hypothesis # jsonlines # jsonschema + # pytest-subtests # referencing audioread==3.0.1 # via librosa awscli==1.35.23 # via -r requirements/test.in backoff==2.2.1 - # via -r requirements/test.in + # via + # -r requirements/test.in + # schemathesis bitsandbytes==0.45.3 # via -r requirements/test.in black==24.10.0 # via datamodel-code-generator +blobfile==3.0.0 + # via -r requirements/test.in boto3==1.35.57 # via tensorizer botocore==1.35.57 @@ -67,11 +77,13 @@ click==8.1.7 # jiwer # nltk # ray + # schemathesis # typer colorama==0.4.6 # via # awscli # sacrebleu + # schemathesis # tqdm-multiprocess contourpy==1.3.0 # via matplotlib @@ -93,8 +105,6 @@ datasets==3.0.2 # lm-eval decorator==5.1.1 # via librosa -decord==0.6.0 - # 
via -r requirements/test.in dill==0.3.8 # via # datasets @@ -103,12 +113,15 @@ dill==0.3.8 # multiprocess dnspython==2.7.0 # via email-validator +docopt==0.6.2 + # via num2words docutils==0.16 # via awscli einops==0.8.0 # via # -r requirements/test.in # encodec + # mamba-ssm # vector-quantize-pytorch # vocos einx==0.3.0 @@ -127,6 +140,7 @@ fastsafetensors==0.1.10 # via -r requirements/test.in filelock==3.16.1 # via + # blobfile # datasets # huggingface-hub # ray @@ -134,6 +148,8 @@ filelock==3.16.1 # transformers fonttools==4.54.1 # via matplotlib +fqdn==1.5.1 + # via jsonschema frozendict==2.4.6 # via einx frozenlist==1.5.0 @@ -152,16 +168,25 @@ genai-perf==0.0.8 # via -r requirements/test.in genson==1.3.0 # via datamodel-code-generator +graphql-core==3.2.6 + # via hypothesis-graphql h11==0.14.0 # via httpcore +harfile==0.3.0 + # via schemathesis +hf-xet==0.1.4 + # via huggingface-hub hiredis==3.0.0 # via tensorizer httpcore==1.0.6 # via httpx httpx==0.27.2 - # via -r requirements/test.in -huggingface-hub==0.26.2 # via + # -r requirements/test.in + # schemathesis +huggingface-hub==0.30.1 + # via + # -r requirements/test.in # accelerate # datasets # evaluate @@ -173,17 +198,29 @@ huggingface-hub==0.26.2 # vocos humanize==4.11.0 # via runai-model-streamer +hypothesis==6.131.0 + # via + # hypothesis-graphql + # hypothesis-jsonschema + # schemathesis +hypothesis-graphql==0.11.1 + # via schemathesis +hypothesis-jsonschema==0.23.1 + # via schemathesis idna==3.10 # via # anyio # email-validator # httpx + # jsonschema # requests # yarl inflect==5.6.2 # via datamodel-code-generator iniconfig==2.0.0 # via pytest +isoduration==20.11.0 + # via jsonschema isort==5.13.2 # via datamodel-code-generator jinja2==3.1.6 @@ -203,12 +240,18 @@ joblib==1.4.2 # scikit-learn jsonlines==4.0.0 # via lm-eval +jsonpointer==3.0.0 + # via jsonschema jsonschema==4.23.0 # via + # hypothesis-jsonschema # mistral-common # ray + # schemathesis jsonschema-specifications==2024.10.1 # via jsonschema 
+junit-xml==1.9 + # via schemathesis kaleido==0.2.1 # via genai-perf kiwisolver==1.4.7 @@ -219,16 +262,22 @@ libnacl==2.1.0 # via tensorizer librosa==0.10.2.post1 # via -r requirements/test.in -llvmlite==0.43.0 +llvmlite==0.44.0 # via numba -lm-eval==0.4.4 +lm-eval==0.4.8 # via -r requirements/test.in lxml==5.3.0 - # via sacrebleu + # via + # blobfile + # sacrebleu +mamba-ssm==2.2.4 + # via -r requirements/test.in markdown-it-py==3.0.0 # via rich markupsafe==3.0.2 - # via jinja2 + # via + # jinja2 + # werkzeug matplotlib==3.9.2 # via -r requirements/test.in mbstrdecoder==1.1.3 @@ -260,10 +309,16 @@ mypy-extensions==1.0.0 # via black networkx==3.2.1 # via torch +ninja==1.11.1.3 + # via mamba-ssm nltk==3.9.1 # via rouge-score -numba==0.60.0 - # via librosa +num2words==0.5.14 + # via -r requirements/test.in +numba==0.61.2 + # via + # -r requirements/test.in + # librosa numexpr==2.10.1 # via lm-eval numpy==1.26.4 @@ -274,7 +329,6 @@ numpy==1.26.4 # contourpy # cupy-cuda12x # datasets - # decord # einx # encodec # evaluate @@ -335,8 +389,10 @@ nvidia-nvjitlink-cu12==12.4.127 # torch nvidia-nvtx-cu12==12.4.127 # via torch -opencv-python-headless==4.10.0.84 - # via mistral-common +opencv-python-headless==4.11.0.86 + # via + # -r requirements/test.in + # mistral-common packaging==24.1 # via # accelerate @@ -347,6 +403,7 @@ packaging==24.1 # fastparquet # huggingface-hub # lazy-loader + # mamba-ssm # matplotlib # peft # plotly @@ -418,6 +475,8 @@ pybind11==2.13.6 # via lm-eval pycparser==2.22 # via cffi +pycryptodomex==3.22.0 + # via blobfile pydantic==2.9.2 # via # datamodel-code-generator @@ -428,6 +487,8 @@ pygments==2.18.0 # via rich pyparsing==3.2.0 # via matplotlib +pyrate-limiter==3.7.0 + # via schemathesis pytablewriter==1.2.0 # via lm-eval pytest==8.3.3 @@ -440,6 +501,9 @@ pytest==8.3.3 # pytest-mock # pytest-rerunfailures # pytest-shard + # pytest-subtests + # pytest-timeout + # schemathesis pytest-asyncio==0.24.0 # via -r requirements/test.in pytest-forked==1.6.0 
@@ -450,8 +514,13 @@ pytest-rerunfailures==14.0 # via -r requirements/test.in pytest-shard==0.1.2 # via -r requirements/test.in +pytest-subtests==0.14.1 + # via schemathesis +pytest-timeout==2.3.1 + # via -r requirements/test.in python-dateutil==2.9.0.post0 # via + # arrow # botocore # matplotlib # pandas @@ -473,6 +542,7 @@ pyyaml==6.0.2 # peft # ray # responses + # schemathesis # timm # transformers # vocos @@ -503,10 +573,16 @@ requests==2.32.3 # pooch # ray # responses + # schemathesis + # starlette-testclient # tiktoken # transformers responses==0.25.3 # via genai-perf +rfc3339-validator==0.1.4 + # via jsonschema +rfc3987==1.3.8 + # via jsonschema rich==13.9.4 # via # genai-perf @@ -535,6 +611,8 @@ safetensors==0.4.5 # peft # timm # transformers +schemathesis==3.39.15 + # via -r requirements/test.in scikit-learn==1.5.2 # via # librosa @@ -553,18 +631,23 @@ sentencepiece==0.2.0 # via mistral-common setuptools==75.8.0 # via + # mamba-ssm # pytablewriter # torch shellingham==1.5.4 # via typer six==1.16.0 # via + # junit-xml # python-dateutil + # rfc3339-validator # rouge-score sniffio==1.3.1 # via # anyio # httpx +sortedcontainers==2.4.0 + # via hypothesis soundfile==0.12.1 # via # -r requirements/test.in @@ -573,6 +656,12 @@ soxr==0.5.0.post1 # via librosa sqlitedict==2.1.0 # via lm-eval +starlette==0.46.2 + # via + # schemathesis + # starlette-testclient +starlette-testclient==0.4.1 + # via schemathesis statsmodels==0.14.4 # via genai-perf sympy==1.13.1 @@ -599,8 +688,14 @@ tiktoken==0.7.0 # mistral-common timm==1.0.11 # via -r requirements/test.in -tokenizers==0.21.0 - # via transformers +tokenizers==0.21.1 + # via + # -r requirements/test.in + # transformers +tomli==2.2.1 + # via schemathesis +tomli-w==1.2.0 + # via schemathesis torch==2.6.0 # via # -r requirements/test.in @@ -609,6 +704,7 @@ torch==2.6.0 # encodec # fastsafetensors # lm-eval + # mamba-ssm # peft # runai-model-streamer # sentence-transformers @@ -641,11 +737,12 @@ tqdm==4.66.6 # transformers 
tqdm-multiprocess==0.0.11 # via lm-eval -transformers==4.48.2 +transformers==4.51.1 # via # -r requirements/test.in # genai-perf # lm-eval + # mamba-ssm # peft # sentence-transformers # transformers-stream-generator @@ -664,6 +761,8 @@ typepy==1.3.2 # tabledata typer==0.15.2 # via fastsafetensors +types-python-dateutil==2.9.0.20241206 + # via arrow typing-extensions==4.12.2 # via # huggingface-hub @@ -676,8 +775,11 @@ typing-extensions==4.12.2 # typer tzdata==2024.2 # via pandas +uri-template==1.3.0 + # via jsonschema urllib3==2.2.3 # via + # blobfile # botocore # requests # responses @@ -686,6 +788,10 @@ vector-quantize-pytorch==1.21.2 # via -r requirements/test.in vocos==0.1.0 # via -r requirements/test.in +webcolors==24.11.1 + # via jsonschema +werkzeug==3.1.3 + # via schemathesis word2number==1.1 # via lm-eval xxhash==3.5.0 @@ -693,6 +799,8 @@ xxhash==3.5.0 # datasets # evaluate yarl==1.17.1 - # via aiohttp + # via + # aiohttp + # schemathesis zstandard==0.23.0 # via lm-eval diff --git a/requirements/tpu.txt b/requirements/tpu.txt index 35d5db6c46006..b63993ba1ee45 100644 --- a/requirements/tpu.txt +++ b/requirements/tpu.txt @@ -17,9 +17,9 @@ ray[data] --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" -torch_xla[tpu, pallas] @ 
https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" +torch==2.8.0.dev20250408 +torchvision==0.22.0.dev20250408 +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" + diff --git a/setup.py b/setup.py index 37f3e78926c6e..b0cc2f48163c3 100755 --- a/setup.py +++ b/setup.py @@ -201,6 +201,9 @@ class cmake_build_ext(build_ext): else: # Default build tool to whatever cmake picks. 
build_tool = [] + # Make sure we use the nvcc from CUDA_HOME + if _is_cuda(): + cmake_args += [f'-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc'] subprocess.check_call( ['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args], cwd=self.build_temp) @@ -592,9 +595,8 @@ def get_requirements() -> list[str]: for line in requirements: if line.startswith("-r "): resolved_requirements += _read_requirements(line.split()[1]) - elif line.startswith("--"): - continue - else: + elif not line.startswith("--") and not line.startswith( + "#") and line.strip() != "": resolved_requirements.append(line) return resolved_requirements @@ -640,11 +642,10 @@ if _is_hip(): if _is_cuda(): ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C")) - if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.0"): - # FA3 requires CUDA 12.0 or later + if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"): + # FA3 requires CUDA 12.3 or later ext_modules.append( CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C")) - if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"): # Optional since this doesn't get built (produce an .so file) when # not targeting a hopper system ext_modules.append( @@ -683,7 +684,7 @@ setup( "fastsafetensors": ["fastsafetensors >= 0.1.10"], "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"], "audio": ["librosa", "soundfile"], # Required for audio processing - "video": ["decord"] # Required for video processing + "video": [] # Kept for backwards compatibility }, cmdclass=cmdclass, package_data=package_data, diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py index 436e43638a3dd..be3ad12396b4b 100644 --- a/tests/basic_correctness/test_cpu_offload.py +++ b/tests/basic_correctness/test_cpu_offload.py @@ -1,15 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 -import pytest - from ..utils import compare_two_settings 
-@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - monkeypatch.setenv('VLLM_USE_V1', '0') - - def test_cpu_offload(): compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [], ["--cpu-offload-gb", "1"]) diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index 31aa898282004..76b266aada684 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -155,6 +155,24 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool): llm.wake_up() output2 = llm.generate(prompt, sampling_params) - # cmp output assert output[0].outputs[0].text == output2[0].outputs[0].text + + llm.sleep(level=1) + llm.wake_up(tags=["weights"]) + + free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info() + used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline + + # should just reallocate memory for weights (1B model, ~2GiB weights) + if use_v1: + assert used_bytes < 10 * GiB_bytes + else: + assert used_bytes < 6 * GiB_bytes + + # now allocate kv cache memory + llm.wake_up(tags=["kv_cache"]) + output3 = llm.generate(prompt, sampling_params) + + # cmp output + assert output[0].outputs[0].text == output3[0].outputs[0].text diff --git a/tests/entrypoints/openai/reasoning_parsers/__init__.py b/tests/benchmarks/__init__.py similarity index 100% rename from tests/entrypoints/openai/reasoning_parsers/__init__.py rename to tests/benchmarks/__init__.py diff --git a/tests/benchmarks/test_latency_cli.py b/tests/benchmarks/test_latency_cli.py new file mode 100644 index 0000000000000..8537459b9f94d --- /dev/null +++ b/tests/benchmarks/test_latency_cli.py @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: Apache-2.0 +import subprocess + +import pytest + +MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" + + +@pytest.mark.benchmark +def test_bench_latency(): + command = [ + "vllm", "bench", "latency", "--model", MODEL_NAME, "--input-len", "32", + "--output-len", "1", 
"--enforce-eager", "--load-format", "dummy" + ] + result = subprocess.run(command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + + assert result.returncode == 0, f"Benchmark failed: {result.stderr}" diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py new file mode 100644 index 0000000000000..b746d6b7853c9 --- /dev/null +++ b/tests/benchmarks/test_serve_cli.py @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: Apache-2.0 +import subprocess + +import pytest + +from ..utils import RemoteOpenAIServer + +MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--max-model-len", "1024", "--enforce-eager", "--load-format", "dummy" + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest.mark.benchmark +def test_bench_serve(server): + command = [ + "vllm", + "bench", + "serve", + "--model", + MODEL_NAME, + "--host", + server.host, + "--port", + str(server.port), + "--random-input-len", + "32", + "--random-output-len", + "4", + "--num-prompts", + "5", + ] + result = subprocess.run(command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + + assert result.returncode == 0, f"Benchmark failed: {result.stderr}" diff --git a/tests/benchmarks/test_throughput_cli.py b/tests/benchmarks/test_throughput_cli.py new file mode 100644 index 0000000000000..2045b36293565 --- /dev/null +++ b/tests/benchmarks/test_throughput_cli.py @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: Apache-2.0 +import subprocess + +import pytest + +MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" + + +@pytest.mark.benchmark +def test_bench_throughput(): + command = [ + "vllm", "bench", "throughput", "--model", MODEL_NAME, "--input-len", + "32", "--output-len", "1", "--enforce-eager", "--load-format", "dummy" + ] + result = subprocess.run(command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + 
+ assert result.returncode == 0, f"Benchmark failed: {result.stderr}" diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 7307f44b6184e..d4551b1cc3aec 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -63,7 +63,8 @@ class LlamaConfig: factors.append((k, v)) factors.sort() import hashlib - return hashlib.md5(str(factors).encode()).hexdigest() + return hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() def __post_init__(self): assert self.mlp_size >= self.hidden_size diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 3a45c35442ca8..579133ec0c3f6 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -2,21 +2,20 @@ from __future__ import annotations -from typing import Any +from typing import Any, Optional, Union import pytest import torch from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams -from vllm.config import CompilationLevel +from vllm.config import CompilationConfig, CompilationLevel from vllm.platforms import current_platform from ..utils import create_new_process_for_each_test -@pytest.fixture(params=None, name="model_info") -def models_list_fixture(request): +def models_list(*, all: bool = True, keywords: Optional[list[str]] = None): TEST_MODELS: list[tuple[str, dict[str, Any]]] = [ ("facebook/opt-125m", {}), ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { @@ -33,51 +32,57 @@ def models_list_fixture(request): ("meta-llama/Llama-3.2-1B-Instruct", {}), ] - if is_quant_method_supported("aqlm"): - TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", { - "quantization": "aqlm" - })) - - # TODO: figure out why this fails. 
- if False and is_quant_method_supported("gguf"): # noqa: SIM223 - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", { - "quantization": "gguf" - })) - - if is_quant_method_supported("gptq"): - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", { - "quantization": "gptq" - })) - - if is_quant_method_supported("gptq_marlin"): - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", { - "quantization": "gptq_marlin" - })) - - if is_quant_method_supported("gptq_marlin_24"): - TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", { - "quantization": "gptq_marlin_24" - })) - - if is_quant_method_supported("marlin"): - TEST_MODELS.append( - ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", { - "quantization": "marlin" + if all: + if is_quant_method_supported("aqlm"): + TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", { + "quantization": "aqlm" })) - if not current_platform.is_rocm() and is_quant_method_supported("awq"): - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { - "quantization": "AWQ" - })) + # TODO: figure out why this fails. 
+ if False and is_quant_method_supported("gguf"): # noqa: SIM223 + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", { + "quantization": "gguf" + })) - return TEST_MODELS + if is_quant_method_supported("gptq"): + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", { + "quantization": "gptq" + })) + + if is_quant_method_supported("gptq_marlin"): + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", { + "quantization": "gptq_marlin" + })) + + if is_quant_method_supported("gptq_marlin_24"): + TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", { + "quantization": "gptq_marlin_24" + })) + + if is_quant_method_supported("marlin"): + TEST_MODELS.append( + ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", { + "quantization": "marlin" + })) + + if not current_platform.is_rocm() and is_quant_method_supported("awq"): + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { + "quantization": "AWQ" + })) + + if keywords is None: + return TEST_MODELS + + # filter by keywords + pred = lambda model: any(keyword in model[0] for keyword in keywords) + return list(filter(pred, TEST_MODELS)) @pytest.mark.parametrize( "optimization_level", [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE], ) -@pytest.mark.parametrize("model_info", "", indirect=True) +@pytest.mark.parametrize("model_info", models_list(all=True)) @create_new_process_for_each_test() def test_full_graph( monkeypatch: pytest.MonkeyPatch, @@ -91,25 +96,60 @@ def test_full_graph( m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") print(f"MODEL={model}") - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0) - llm = LLM( - model=model, - enforce_eager=True, - tensor_parallel_size=1, - disable_custom_all_reduce=True, - compilation_config=optimization_level, - **model_kwargs, - ) - outputs = llm.generate(prompts, sampling_params) + 
run_model(optimization_level, model, model_kwargs) - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +PassConfig = CompilationConfig.PassConfig + + +# TODO(luka) add other supported compilation config scenarios here +@pytest.mark.parametrize( + "compilation_config, model_info", + [ + # additional compile sizes, only some of the models + (CompilationConfig(level=CompilationLevel.PIECEWISE, + compile_sizes=[1, 2]), model) + for model in models_list(all=False) + ] + [ + # RMSNorm + quant fusion, only 8-bit quant models + (CompilationConfig(level=CompilationLevel.PIECEWISE, + custom_ops=["+rms_norm"], + pass_config=PassConfig(enable_fusion=True, + enable_noop=True)), model) + for model in models_list(keywords=["FP8-dynamic", "quantized.w8a8"]) + ]) +# only test some of the models +@create_new_process_for_each_test() +def test_custom_compile_config( + compilation_config: CompilationConfig, + model_info: tuple[str, dict[str, Any]], +): + model, model_kwargs = model_info + print(f"MODEL={model}") + run_model(compilation_config, model, model_kwargs) + + +def run_model(compile_config: Union[int, CompilationConfig], model: str, + model_kwargs: dict[str, Any]): + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0) + llm = LLM( + model=model, + enforce_eager=True, + tensor_parallel_size=1, + disable_custom_all_reduce=True, + compilation_config=compile_config, + **model_kwargs, + ) + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. 
+ for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index aaf0277810907..efebf05b6b047 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -2,7 +2,6 @@ import pytest import torch -from compressed_tensors.quantization import FP8_DTYPE import vllm.envs as envs import vllm.plugins @@ -14,9 +13,12 @@ from vllm.config import CompilationConfig, CompilationLevel, VllmConfig from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( CUTLASS_FP8_SUPPORTED, Fp8LinearOp, maybe_create_device_identity) +from vllm.platforms import current_platform from .backend import TestBackend +FP8_DTYPE = current_platform.fp8_dtype() + class TestModel(torch.nn.Module): @@ -42,12 +44,17 @@ class TestModel(torch.nn.Module): resid = torch.sqrt(x) y = self.norm[0](x) - x2 = self.fp8_linear.apply(y, self.w[0], self.wscale[0], self.scale[0]) + x2 = self.fp8_linear.apply(y, + self.w[0], + self.wscale[0], + input_scale=self.scale[0]) # make sure resid is used for replacement to work y2, resid = self.norm[1](x2, resid) - x3 = self.fp8_linear.apply(y2, self.w[1], self.wscale[1], - self.scale[1]) + x3 = self.fp8_linear.apply(y2, + self.w[1], + self.wscale[1], + input_scale=self.scale[1]) y3, resid = self.norm[2](x3, resid) # use resid here return y3 @@ -59,8 +66,8 @@ class TestModel(torch.nn.Module): @pytest.mark.parametrize("static", [True, False]) @pytest.mark.parametrize("cutlass_fp8_enabled", [True, False] if CUTLASS_FP8_SUPPORTED else [False]) -@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda", - reason="Only test on CUDA") +@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], + reason="Only test on CUDA and ROCm") def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static, 
cutlass_fp8_enabled): torch.set_default_device("cuda") diff --git a/tests/data/test_config.yaml b/tests/config/test_config.yaml similarity index 100% rename from tests/data/test_config.yaml rename to tests/config/test_config.yaml diff --git a/tests/config/test_config_with_model.yaml b/tests/config/test_config_with_model.yaml new file mode 100644 index 0000000000000..d8c8c7bc8162a --- /dev/null +++ b/tests/config/test_config_with_model.yaml @@ -0,0 +1,7 @@ +# Same as test_config.yaml but with model specified +model: config-model +port: 12312 +served_model_name: mymodel +tensor_parallel_size: 2 +trust_remote_code: true +multi_step_stream_outputs: false diff --git a/tests/conftest.py b/tests/conftest.py index cc48fceb8eff0..25e70319e2cc8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -29,12 +29,11 @@ from vllm.distributed import (cleanup_dist_env_and_memory, init_distributed_environment, initialize_model_parallel) from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, - TokensPrompt, to_enc_dec_tuple_list, - zip_enc_dec_prompts) + to_enc_dec_tuple_list, zip_enc_dec_prompts) from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams -from vllm.utils import cuda_device_count_stateless, is_list_of +from vllm.utils import cuda_device_count_stateless logger = init_logger(__name__) @@ -469,12 +468,19 @@ class HfRunner: prompts: list[str], beam_width: int, max_tokens: int, + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, ) -> list[tuple[list[list[int]], list[str]]]: outputs = self.generate(prompts, do_sample=False, max_new_tokens=max_tokens, num_beams=beam_width, - num_return_sequences=beam_width) + num_return_sequences=beam_width, + images=images, + videos=videos, + audios=audios) + for i in range(len(outputs)): output_ids, output_str = outputs[i] for j in range(len(output_ids)): @@ -671,8 +677,9 @@ 
class HfRunner: return [(output_ids, output_str, output_logprobs) for output_ids, output_str, output_logprobs in outputs] - def encode(self, prompts: list[str]) -> list[list[torch.Tensor]]: - return self.model.encode(prompts) + def encode(self, prompts: list[str], *args, + **kwargs) -> list[list[torch.Tensor]]: + return self.model.encode(prompts, *args, **kwargs) def predict(self, prompts: list[list[str]]) -> torch.Tensor: return self.model.predict(prompts, convert_to_tensor=True) @@ -747,30 +754,27 @@ class VllmRunner: videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, ) -> list[TextPrompt]: - if images is not None: - assert len(prompts) == len(images) - if videos is not None: - assert len(prompts) == len(videos) + if any(x is not None and len(x) != len(prompts) + for x in [images, videos, audios]): + raise ValueError( + "All non-None multimodal inputs must have the same length as " + "prompts") - if audios is not None: - assert len(prompts) == len(audios) + inputs = [] + for i, prompt in enumerate(prompts): + multi_modal_data = {} + if images is not None and (image := images[i]) is not None: + multi_modal_data["image"] = image + if videos is not None and (video := videos[i]) is not None: + multi_modal_data["video"] = video + if audios is not None and (audio := audios[i]) is not None: + multi_modal_data["audio"] = audio - inputs = [TextPrompt(prompt=prompt) for prompt in prompts] - if images is not None: - for i, image in enumerate(images): - if image is not None: - inputs[i]["multi_modal_data"] = {"image": image} - - if videos is not None: - for i, video in enumerate(videos): - if video is not None: - inputs[i]["multi_modal_data"] = {"video": video} - - if audios is not None: - for i, audio in enumerate(audios): - if audio is not None: - inputs[i]["multi_modal_data"] = {"audio": audio} + inputs.append( + TextPrompt(prompt=prompt, + multi_modal_data=multi_modal_data + if multi_modal_data else None)) return inputs @@ -921,6 +925,7 
@@ class VllmRunner: max_tokens: int, num_logprobs: int, num_prompt_logprobs: Optional[int] = None, + skip_special_tokens: bool = True, ) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]: greedy_logprobs_params = SamplingParams( @@ -928,6 +933,7 @@ class VllmRunner: max_tokens=max_tokens, logprobs=num_logprobs, prompt_logprobs=(num_prompt_logprobs), + skip_special_tokens=skip_special_tokens, ) ''' Greedy logprobs generation for vLLM encoder/decoder models @@ -938,18 +944,20 @@ class VllmRunner: def generate_beam_search( self, - prompts: Union[list[str], list[list[int]]], + prompts: list[str], beam_width: int, max_tokens: int, + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, ) -> list[tuple[list[list[int]], list[str]]]: - if is_list_of(prompts, str, check="all"): - prompts = [TextPrompt(prompt=prompt) for prompt in prompts] - else: - prompts = [ - TokensPrompt(prompt_token_ids=tokens) for tokens in prompts - ] + inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + outputs = self.model.beam_search( - prompts, + inputs, BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens)) returned_outputs = [] for output in outputs: @@ -962,19 +970,19 @@ class VllmRunner: req_outputs = self.model.classify(prompts) return [req_output.outputs.probs for req_output in req_outputs] - def encode( - self, - prompts: list[str], - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, - ) -> list[list[float]]: + def encode(self, + prompts: list[str], + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + *args, + **kwargs) -> list[list[float]]: inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios) - req_outputs = self.model.embed(inputs) + req_outputs = 
self.model.embed(inputs, *args, **kwargs) return [req_output.outputs.embedding for req_output in req_outputs] def score( @@ -1120,3 +1128,15 @@ def pytest_collection_modifyitems(config, items): for item in items: if "optional" in item.keywords: item.add_marker(skip_optional) + + +@pytest.fixture(scope="session") +def cli_config_file(): + """Return the path to the CLI config file.""" + return os.path.join(_TEST_DIR, "config", "test_config.yaml") + + +@pytest.fixture(scope="session") +def cli_config_file_with_model(): + """Return the path to the CLI config file with model.""" + return os.path.join(_TEST_DIR, "config", "test_config_with_model.yaml") diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index e9b537ed5150e..9e8e315d87b18 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -195,15 +195,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, ]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{ - "block_size": 8, + "block_size": 16, "max_num_batched_tokens": 2, "max_num_seqs": 2, }, { - "block_size": 8, + "block_size": 16, "max_num_batched_tokens": 3, "max_num_seqs": 2, }, { - "block_size": 8, + "block_size": 16, "max_num_batched_tokens": 256, "max_num_seqs": 10, }]) diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py index e23b8718cb632..039b5e739892a 100644 --- a/tests/core/block/e2e/test_correctness_sliding_window.py +++ b/tests/core/block/e2e/test_correctness_sliding_window.py @@ -129,12 +129,16 @@ def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed, check_answers(indices, answer, test_texts) -def prep_prompts(batch_size: int): +def prep_prompts(batch_size: int, ln_range: tuple[int, int] = (800, 1100)): """ Generate prompts which a bunch of assignments, then asking for the value of one of them. 
The prompt is just under 10k tokens; sliding window is 4k so the answer is outside sliding window, but should still be correct. + + Args: + batch_size: number of prompts to generate + ln_range: an argument to control the length of the prompt """ prompts: list[str] = [] answer: list[int] = [] @@ -145,7 +149,7 @@ def prep_prompts(batch_size: int): indices.append(idx) prompt = "```python\n# We set a number of variables, " + \ f"x{idx} will be important later\n" - ln = random.randint(800, 1100) + ln = random.randint(*ln_range) for k in range(30, ln): v = random.randint(10, 99) if k == idx: @@ -157,7 +161,10 @@ def prep_prompts(batch_size: int): return prompts, answer, indices -def check_answers(indices: list[int], answer: list[int], outputs: list[str]): +def check_answers(indices: list[int], + answer: list[int], + outputs: list[str], + accept_rate: float = 0.7): answer2 = [int(text[0:2].strip()) for text in outputs] print(list(zip(indices, zip(answer, answer2)))) numok = 0 @@ -166,7 +173,7 @@ def check_answers(indices: list[int], answer: list[int], outputs: list[str]): numok += 1 frac_ok = numok / len(answer) print(f"Num OK: {numok}/{len(answer)} {frac_ok}") - assert frac_ok > 0.7 + assert frac_ok >= accept_rate def check_window(prompts: list[str]): diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index bfa7d06c4d075..a7ba45c9e546e 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -106,7 +106,7 @@ def eager_allreduce( # communicate independently num_communication = rank // tp_size + 1 sz = 1024 - fa = get_tp_group().ca_comm + fa = get_tp_group().device_communicator.ca_comm inp = torch.ones(sz, dtype=torch.float32, device=device) out = inp for _ in range(num_communication): diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index e757db45c8cf5..05e30f855ced2 100644 --- 
a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -175,7 +175,7 @@ TEXT_GENERATION_MODELS = { "inceptionai/jais-13b-chat": PPTestSettings.fast(), "ai21labs/Jamba-tiny-dev": PPTestSettings.fast(), "meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(), - # Tests TransformersModel + # Tests TransformersForCausalLM "ArthurZ/Ilama-3.2-1B": PPTestSettings.fast(), "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(), "openbmb/MiniCPM3-4B": PPTestSettings.fast(), @@ -217,7 +217,7 @@ EMBEDDING_MODELS = { # type: ignore[var-annotated] MULTIMODAL_MODELS = { # [Decoder-only] - "Salesforce/blip2-opt-2.7b": PPTestSettings.fast(), + "Salesforce/blip2-opt-6.7b": PPTestSettings.fast(), "facebook/chameleon-7b": PPTestSettings.fast(), "adept/fuyu-8b": PPTestSettings.fast(), "THUDM/glm-4v-9b": PPTestSettings.fast(), @@ -245,7 +245,7 @@ TEST_MODELS = [ # [LANGUAGE GENERATION] "microsoft/Phi-3.5-MoE-instruct", "meta-llama/Llama-3.2-1B-Instruct", - # "ArthurZ/Ilama-3.2-1B", NOTE: Uncomment after #13905 + "ArthurZ/Ilama-3.2-1B", "ibm/PowerLM-3b", # [LANGUAGE EMBEDDING] "intfloat/e5-mistral-7b-instruct", diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 8698d124e73ff..7902011519d90 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from argparse import ArgumentTypeError +from argparse import ArgumentError, ArgumentTypeError import pytest @@ -10,7 +10,7 @@ from vllm.utils import FlexibleArgumentParser @pytest.mark.parametrize(("arg", "expected"), [ - (None, None), + (None, dict()), ("image=16", { "image": 16 }), @@ -24,6 +24,10 @@ from vllm.utils import FlexibleArgumentParser }), ]) def test_limit_mm_per_prompt_parser(arg, expected): + """This functionality is deprecated and will be removed in the future. + This argument should be passed as JSON string instead. 
+ + TODO: Remove with nullable_kvs.""" parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) if arg is None: args = parser.parse_args([]) @@ -53,12 +57,20 @@ def test_compilation_config(): assert args.compilation_config.level == 3 # set to string form of a dict - args = parser.parse_args(["--compilation-config", "{'level': 3}"]) - assert args.compilation_config.level == 3 + args = parser.parse_args([ + "--compilation-config", + "{'level': 3, 'cudagraph_capture_sizes': [1, 2, 4, 8]}", + ]) + assert (args.compilation_config.level == 3 and + args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]) # set to string form of a dict - args = parser.parse_args(["--compilation-config={'level': 3}"]) - assert args.compilation_config.level == 3 + args = parser.parse_args([ + "--compilation-config=" + "{'level': 3, 'cudagraph_capture_sizes': [1, 2, 4, 8]}", + ]) + assert (args.compilation_config.level == 3 and + args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]) def test_prefix_cache_default(): @@ -142,3 +154,39 @@ def test_composite_arg_parser(arg, expected, option): else: args = parser.parse_args([f"--{option}", arg]) assert getattr(args, option.replace("-", "_")) == expected + + +def test_human_readable_model_len(): + # `exit_on_error` disabled to test invalid values below + parser = EngineArgs.add_cli_args( + FlexibleArgumentParser(exit_on_error=False)) + + args = parser.parse_args([]) + assert args.max_model_len is None + + args = parser.parse_args(["--max-model-len", "1024"]) + assert args.max_model_len == 1024 + + # Lower + args = parser.parse_args(["--max-model-len", "1m"]) + assert args.max_model_len == 1_000_000 + args = parser.parse_args(["--max-model-len", "10k"]) + assert args.max_model_len == 10_000 + + # Capital + args = parser.parse_args(["--max-model-len", "3K"]) + assert args.max_model_len == 1024 * 3 + args = parser.parse_args(["--max-model-len", "10M"]) + assert args.max_model_len == 2**20 * 10 + + # Decimal values + args = 
parser.parse_args(["--max-model-len", "10.2k"]) + assert args.max_model_len == 10200 + # ..truncated to the nearest int + args = parser.parse_args(["--max-model-len", "10.212345k"]) + assert args.max_model_len == 10212 + + # Invalid (do not allow decimals with binary multipliers) + for invalid in ["1a", "pwd", "10.24", "1.23M"]: + with pytest.raises(ArgumentError): + args = parser.parse_args(["--max-model-len", invalid]) diff --git a/tests/engine/test_short_mm_context.py b/tests/engine/test_short_mm_context.py index d5111e3fda8fd..b29d6362f571b 100644 --- a/tests/engine/test_short_mm_context.py +++ b/tests/engine/test_short_mm_context.py @@ -18,7 +18,8 @@ models = ["llava-hf/llava-1.5-7b-hf"] def test_context_length_too_short(vllm_runner, image_assets, model): images = [asset.pil_image for asset in image_assets] - with pytest.raises(ValueError, match="too long to fit into the model"): + with pytest.raises(ValueError, + match="longer than the maximum model length"): vllm_model = vllm_runner( model, max_model_len=128, # LLaVA has a feature size of 576 diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py index 77fbb5827da9e..95657455bd7bb 100644 --- a/tests/entrypoints/llm/test_accuracy.py +++ b/tests/entrypoints/llm/test_accuracy.py @@ -13,18 +13,24 @@ import pytest from vllm.platforms import current_platform -MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct" +MODEL_NAMES = [ + "Qwen/Qwen2-1.5B-Instruct", + "google/gemma-3-1b-it", +] NUM_CONCURRENT = 500 TASK = "gsm8k" FILTER = "exact_match,strict-match" RTOL = 0.03 -EXPECTED_VALUE = 0.58 +EXPECTED_VALUES = { + "Qwen/Qwen2-1.5B-Instruct": 0.58, + "google/gemma-3-1b-it": 0.25, +} -def run_test(more_args=None): +def run_test(model_name, more_args=None): """Run the end to end accuracy test.""" - model_args = f"pretrained={MODEL_NAME},max_model_len=4096" + model_args = f"pretrained={model_name},max_model_len=4096" if more_args is not None: model_args = "{},{}".format(model_args, more_args) 
@@ -37,9 +43,12 @@ def run_test(more_args=None): ) measured_value = results["results"][TASK][FILTER] - assert (measured_value - RTOL < EXPECTED_VALUE - and measured_value + RTOL > EXPECTED_VALUE - ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" + assert model_name in EXPECTED_VALUES, ( + f"Cannot find the expected value for the model {model_name=}") + expected_value = EXPECTED_VALUES[model_name] + assert (measured_value - RTOL < expected_value + and measured_value + RTOL > expected_value + ), f"Expected: {expected_value} | Measured: {measured_value}" # TODO: [AlexM] Fix it with new CI/CD tests @@ -49,7 +58,8 @@ TPU_TP_TEST_STR = "" #"tensor_parallel_size=4" @pytest.mark.skipif(not current_platform.is_cuda() and not current_platform.is_tpu(), reason="V1 is currently only supported on CUDA and TPU") -def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch): +@pytest.mark.parametrize("model", MODEL_NAMES) +def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch): """Run with the V1 Engine.""" with monkeypatch.context() as m: @@ -58,13 +68,13 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch): more_args = None if current_platform.is_tpu(): # Limit compilation time for TPU V1 - more_args = "max_num_seqs=64" + more_args = "max_model_len=2048,max_num_seqs=64" # Add TP test (if provided) if TPU_TP_TEST_STR: more_args += ",{}".format(TPU_TP_TEST_STR) - run_test(more_args) + run_test(model, more_args) def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch): @@ -72,4 +82,4 @@ def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "0") - run_test() + run_test("Qwen/Qwen2-1.5B-Instruct") diff --git a/tests/entrypoints/llm/test_generate_multiple_loras.py b/tests/entrypoints/llm/test_generate_multiple_loras.py index 90e1d58141378..099af0f36088b 100644 --- a/tests/entrypoints/llm/test_generate_multiple_loras.py +++ 
b/tests/entrypoints/llm/test_generate_multiple_loras.py @@ -23,7 +23,19 @@ LORA_NAME = "typeof/zephyr-7b-beta-lora" @pytest.fixture(scope="module") -def llm(): +def monkeypatch_module(): + from _pytest.monkeypatch import MonkeyPatch + mpatch = MonkeyPatch() + yield mpatch + mpatch.undo() + + +@pytest.fixture(scope="module", params=[False, True]) +def llm(request, monkeypatch_module): + + use_v1 = request.param + monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') + # pytest caches the fixture so we use weakref.proxy to # enable garbage collection llm = LLM(model=MODEL_NAME, diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index 5f1a91cb2b19f..e43e9826e8f9b 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -3,6 +3,7 @@ import json import re import weakref +from enum import Enum import jsonschema import pytest @@ -15,7 +16,10 @@ from vllm.sampling_params import GuidedDecodingParams, SamplingParams MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct" GUIDED_DECODING_BACKENDS = [ - "outlines", "lm-format-enforcer", "xgrammar", "guidance" + "outlines", + "lm-format-enforcer", + "xgrammar:disable-any-whitespace", + "guidance:disable-any-whitespace", ] @@ -282,15 +286,26 @@ def test_validation_against_both_guided_decoding_options(sample_regex, llm): @pytest.mark.skip_global_cleanup def test_disable_guided_decoding_fallback(sample_regex, llm): + # see has_xgrammar_unsupported_json_features() + unsupported_json = { + "type": "object", + "properties": { + "example": { + "type": "string", + "minLength": 5 # unsupported by xgrammar + } + } + } sampling_params = SamplingParams(temperature=0.8, top_p=0.95, guided_decoding=GuidedDecodingParams( - regex=sample_regex, + json=unsupported_json, backend="xgrammar:no-fallback")) with pytest.raises( ValueError, - match="xgrammar does not support regex guided decoding"): + match="xgrammar does not support advanced 
JSON schema features " + "like enums, patterns or numeric ranges."): llm.generate(prompts="This should fail", sampling_params=sampling_params, use_tqdm=True) @@ -322,59 +337,50 @@ def test_guided_json_object(llm, guided_decoding_backend: str): print(generated_text) assert generated_text is not None + if 'disable-any-whitespace' in guided_decoding_backend: + assert "\n" not in generated_text + # Parse to verify it is valid JSON parsed_json = json.loads(generated_text) assert isinstance(parsed_json, dict) +class CarType(str, Enum): + sedan = "sedan" + suv = "SUV" + truck = "Truck" + coupe = "Coupe" + + +class CarDescription(BaseModel): + brand: str + model: str + car_type: CarType + + @pytest.mark.skip_global_cleanup -def test_json_with_any_whitespace_disabled(llm): - - class ResponseSchema(BaseModel): - clarifying_question: str - cost_per_serving: str - calories: str - type_dish_ids: str - type_meal_ids: str - product_ids: list[str] - exclude_product_ids: list[str] - allergen_ids: list[str] - total_cooking_time: str - kitchen_ids: str - holiday_ids: str - - # Note: Without this setting, the response is sometimes full of `\n` - # for some models. This option prevents that. - guided_decoding_backend = 'xgrammar:disable-any-whitespace' - - schema = ResponseSchema.model_json_schema() - guided_params = GuidedDecodingParams(json=schema, - backend=\ - guided_decoding_backend) - sampling_params = SamplingParams(max_tokens=2000, - frequency_penalty=0, - presence_penalty=-1.1, - repetition_penalty=1.3, - guided_decoding=guided_params) - - prompt = ("<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. 
You" - "are a helpful assistant.<|im_end|>\n<|im_start|>user\nI want a " - "quick launch fast with $10.<|im_end|>\n<|im_start|>assistant\n") - outputs = llm.generate(prompts=prompt, - sampling_params=sampling_params, - use_tqdm=True) +@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) +def test_guided_json_completion_with_enum(llm, guided_decoding_backend: str): + json_schema = CarDescription.model_json_schema() + sampling_params = SamplingParams(temperature=1.0, + max_tokens=1000, + guided_decoding=GuidedDecodingParams( + json=json_schema, + backend=guided_decoding_backend)) + outputs = llm.generate( + prompts="Generate a JSON with the brand, model and car_type of" + "the most iconic car from the 90's", + sampling_params=sampling_params, + use_tqdm=True) assert outputs is not None - for output in outputs: assert output is not None assert isinstance(output, RequestOutput) + prompt = output.prompt generated_text = output.outputs[0].text assert generated_text is not None - assert "\n" not in generated_text - - # Parse to verify it is valid JSON - parsed_json = json.loads(generated_text) - assert isinstance(parsed_json, dict) - jsonschema.validate(instance=parsed_json, schema=schema) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + output_json = json.loads(generated_text) + jsonschema.validate(instance=output_json, schema=json_schema) \ No newline at end of file diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py index 61bd1d462a50f..665c6ea1e6994 100644 --- a/tests/entrypoints/llm/test_prompt_validation.py +++ b/tests/entrypoints/llm/test_prompt_validation.py @@ -15,7 +15,7 @@ def v1(run_with_both_engines): def test_empty_prompt(): llm = LLM(model="openai-community/gpt2", enforce_eager=True) - with pytest.raises(ValueError, match='Prompt cannot be empty'): + with pytest.raises(ValueError, match='decoder prompt cannot be empty'): llm.generate([""]) diff --git 
a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py index eca5d184f5d60..642c204b9ff00 100644 --- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py +++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py @@ -150,6 +150,7 @@ def test_wer_correctness(model_name, expected_wer, n_examples=-1, max_concurrent_request=None): + # TODO refactor to use `ASRDataset` with RemoteOpenAIServer(model_name, ['--enforce-eager']) as remote_server: dataset = load_hf_dataset(dataset_repo) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index 3267dcc15e4a9..72e616656775e 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +import json + import openai import pytest import pytest_asyncio @@ -12,7 +14,9 @@ from ...utils import RemoteOpenAIServer MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b" TEST_AUDIO_URLS = [ AudioAsset("winning_call").url, + AudioAsset("mary_had_lamb").url, ] +MAXIMUM_AUDIOS = 2 @pytest.fixture(scope="module") @@ -24,6 +28,8 @@ def server(): "5", "--enforce-eager", "--trust-remote-code", + "--limit-mm-per-prompt", + json.dumps({"audio": MAXIMUM_AUDIOS}), ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -46,7 +52,7 @@ def base64_encoded_audio() -> dict[str, str]: @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) +@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) async def test_single_chat_session_audio(client: openai.AsyncOpenAI, model_name: str, audio_url: str): messages = [{ @@ -100,7 +106,36 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) 
-@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) +@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) +async def test_error_on_invalid_audio_url_type(client: openai.AsyncOpenAI, + model_name: str, + audio_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "audio_url", + "audio_url": audio_url + }, + { + "type": "text", + "text": "What's happening in this audio?" + }, + ], + }] + + # audio_url should be a dict {"url": "some url"}, not directly a string + with pytest.raises(openai.BadRequestError): + _ = await client.chat.completions.create(model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) async def test_single_chat_session_audio_base64encoded( client: openai.AsyncOpenAI, model_name: str, audio_url: str, base64_encoded_audio: dict[str, str]): @@ -158,7 +193,7 @@ async def test_single_chat_session_audio_base64encoded( @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) +@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) async def test_single_chat_session_input_audio( client: openai.AsyncOpenAI, model_name: str, audio_url: str, base64_encoded_audio: dict[str, str]): @@ -330,28 +365,21 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) +@pytest.mark.parametrize( + "audio_urls", [TEST_AUDIO_URLS, TEST_AUDIO_URLS + [TEST_AUDIO_URLS[0]]]) async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str, - audio_url: str, - base64_encoded_audio: dict[str, str]): + audio_urls: list[str]): messages = [{ "role": "user", "content": [ - { + *({ "type": "audio_url", "audio_url": { "url": audio_url } - }, - { - "type": 
"input_audio", - "input_audio": { - "data": base64_encoded_audio[audio_url], - "format": "wav" - } - }, + } for audio_url in audio_urls), { "type": "text", "text": "What's happening in this audio?" @@ -359,20 +387,30 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str, ], }] - with pytest.raises(openai.BadRequestError): # test multi-audio input - await client.chat.completions.create( + if len(audio_urls) > MAXIMUM_AUDIOS: + with pytest.raises(openai.BadRequestError): # test multi-audio input + await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0, + ) + + # the server should still work afterwards + completion = await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + ) + completion = completion.choices[0].text + assert completion is not None and len(completion) >= 0 + else: + chat_completion = await client.chat.completions.create( model=model_name, messages=messages, max_completion_tokens=10, temperature=0.0, ) - - # the server should still work afterwards - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - ) - completion = completion.choices[0].text - assert completion is not None and len(completion) >= 0 + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 0 diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 25e4595cef6f6..a10b42ea3a4b5 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -11,7 +11,7 @@ import pytest import pytest_asyncio import requests import torch -from openai import BadRequestError +from openai import BadRequestError, OpenAI from ...utils import RemoteOpenAIServer from .test_completion import zephyr_lora_added_tokens_files # noqa: F401 @@ -20,11 +20,25 @@ from 
.test_completion import zephyr_lora_files # noqa: F401 # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" -GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"] - @pytest.fixture(scope="module") -def server(zephyr_lora_files, zephyr_lora_added_tokens_files): # noqa: F811 +def monkeypatch_module(): + from _pytest.monkeypatch import MonkeyPatch + mpatch = MonkeyPatch() + yield mpatch + mpatch.undo() + + +@pytest.fixture(scope="module", params=[False, True]) +def server( + request, + monkeypatch_module, + zephyr_lora_files, #noqa: F811 + zephyr_lora_added_tokens_files): # noqa: F811 + + use_v1 = request.param + monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') + args = [ # use half precision for speed and memory savings in CI environment "--dtype", @@ -49,6 +63,13 @@ def server(zephyr_lora_files, zephyr_lora_added_tokens_files): # noqa: F811 yield remote_server +@pytest.fixture +def is_v1_server(server): + import os + assert os.environ['VLLM_USE_V1'] in ['0', '1'] + return os.environ['VLLM_USE_V1'] == '1' + + @pytest_asyncio.fixture async def client(server): async with server.get_async_client() as async_client: @@ -464,14 +485,8 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, assert last_completion_tokens == 10 -# NOTE: Not sure why, but when I place this after `test_guided_regex_chat` -# (i.e. 
using the same ordering as in the Completions API tests), the test -# will fail on the second `guided_decoding_backend` even when I swap their order -# (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256) @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_guided_choice_chat(client: openai.AsyncOpenAI, - guided_decoding_backend: str, sample_guided_choice): messages = [{ "role": "system", @@ -487,8 +502,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, messages=messages, max_completion_tokens=10, temperature=0.7, - extra_body=dict(guided_choice=sample_guided_choice, - guided_decoding_backend=guided_decoding_backend)) + extra_body=dict(guided_choice=sample_guided_choice)) choice1 = chat_completion.choices[0].message.content assert choice1 in sample_guided_choice @@ -502,18 +516,16 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, messages=messages, max_completion_tokens=10, temperature=0.7, - extra_body=dict(guided_choice=sample_guided_choice, - guided_decoding_backend=guided_decoding_backend)) + extra_body=dict(guided_choice=sample_guided_choice)) choice2 = chat_completion.choices[0].message.content assert choice2 in sample_guided_choice assert choice1 != choice2 @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_guided_json_chat(client: openai.AsyncOpenAI, - guided_decoding_backend: str, sample_json_schema): + messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -528,8 +540,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, model=MODEL_NAME, messages=messages, max_completion_tokens=1000, - extra_body=dict(guided_json=sample_json_schema, - guided_decoding_backend=guided_decoding_backend)) + extra_body=dict(guided_json=sample_json_schema)) message = chat_completion.choices[0].message assert message.content is not None json1 = 
json.loads(message.content) @@ -546,8 +557,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, model=MODEL_NAME, messages=messages, max_completion_tokens=1000, - extra_body=dict(guided_json=sample_json_schema, - guided_decoding_backend=guided_decoding_backend)) + extra_body=dict(guided_json=sample_json_schema)) message = chat_completion.choices[0].message assert message.content is not None json2 = json.loads(message.content) @@ -557,9 +567,8 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -async def test_guided_regex_chat(client: openai.AsyncOpenAI, - guided_decoding_backend: str, sample_regex): +async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex): + messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -573,8 +582,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, model=MODEL_NAME, messages=messages, max_completion_tokens=20, - extra_body=dict(guided_regex=sample_regex, - guided_decoding_backend=guided_decoding_backend)) + extra_body=dict(guided_regex=sample_regex)) ip1 = chat_completion.choices[0].message.content assert ip1 is not None assert re.fullmatch(sample_regex, ip1) is not None @@ -585,8 +593,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, model=MODEL_NAME, messages=messages, max_completion_tokens=20, - extra_body=dict(guided_regex=sample_regex, - guided_decoding_backend=guided_decoding_backend)) + extra_body=dict(guided_regex=sample_regex)) ip2 = chat_completion.choices[0].message.content assert ip2 is not None assert re.fullmatch(sample_regex, ip2) is not None @@ -615,10 +622,9 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI): @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, - guided_decoding_backend: str, 
sample_guided_choice): + messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -634,8 +640,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, max_completion_tokens=10, logprobs=True, top_logprobs=5, - extra_body=dict(guided_choice=sample_guided_choice, - guided_decoding_backend=guided_decoding_backend)) + extra_body=dict(guided_choice=sample_guided_choice)) assert chat_completion.choices[0].logprobs is not None assert chat_completion.choices[0].logprobs.content is not None @@ -647,10 +652,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -async def test_named_tool_use(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_json_schema): +async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -682,7 +684,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, "name": "dummy_function_name" } }, - extra_body=dict(guided_decoding_backend=guided_decoding_backend)) + ) message = chat_completion.choices[0].message assert len(message.content) == 0 json_string = message.tool_calls[0].function.arguments @@ -717,7 +719,6 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, "name": "dummy_function_name" } }, - extra_body=dict(guided_decoding_backend=guided_decoding_backend), stream=True) output = [] @@ -740,48 +741,128 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_required_tool_use_not_yet_supported(client: openai.AsyncOpenAI, - sample_json_schema): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - f"Give an example JSON for an employee profile that " - f"fits this schema: {sample_json_schema}" - }] +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async 
def test_required_tool_use(client: openai.AsyncOpenAI, + is_v1_server: bool, model_name: str): + if is_v1_server: + pytest.skip( + "tool_choice='required' requires features unsupported on V1") - with pytest.raises(openai.BadRequestError): - await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_completion_tokens=1000, - tools=[{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": sample_json_schema - } - }], - tool_choice="required") + tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": + "The city to find the weather for, e.g. 'Vienna'", + "default": "Vienna", + }, + "country": { + "type": + "string", + "description": + "The country that the city is in, e.g. 'Austria'", + }, + "unit": { + "type": "string", + "description": + "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["country", "unit"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_forecast", + "description": "Get the weather forecast for a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": + "The city to get the forecast for, e.g. 'Vienna'", + "default": "Vienna", + }, + "country": { + "type": + "string", + "description": + "The country that the city is in, e.g. 
'Austria'", + }, + "days": { + "type": + "integer", + "description": + "Number of days to get the forecast for (1-7)", + }, + "unit": { + "type": "string", + "description": + "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["country", "days", "unit"], + }, + }, + }, + ] - with pytest.raises(openai.BadRequestError): - await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_completion_tokens=1000, - tools=[{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": sample_json_schema - } - }], - tool_choice="auto") + messages = [ + { + "role": "user", + "content": "Hi! How are you doing today?" + }, + { + "role": "assistant", + "content": "I'm doing well! How can I help you?" + }, + { + "role": + "user", + "content": + "Can you tell me what the current weather is in Berlin and the "\ + "forecast for the next 5 days, in fahrenheit?", + }, + ] + + # Non-streaming test + chat_completion = await client.chat.completions.create( + messages=messages, + model=model_name, + tools=tools, + tool_choice="required", + ) + + assert chat_completion.choices[0].message.tool_calls is not None + assert len(chat_completion.choices[0].message.tool_calls) > 0 + + # Streaming test + stream = await client.chat.completions.create( + messages=messages, + model=model_name, + tools=tools, + tool_choice="required", + stream=True, + ) + + output = [] + async for chunk in stream: + if chunk.choices and chunk.choices[0].delta.tool_calls: + output.extend(chunk.choices[0].delta.tool_calls) + + assert len(output) > 0 @pytest.mark.asyncio @@ -1000,7 +1081,7 @@ async def test_long_seed(client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_http_chat_wo_model_name(server: RemoteOpenAIServer): +async def test_http_chat_no_model_name_with_curl(server: RemoteOpenAIServer): url = f"http://localhost:{server.port}/v1/chat/completions" headers = { 
"Content-Type": "application/json", @@ -1021,10 +1102,35 @@ async def test_http_chat_wo_model_name(server: RemoteOpenAIServer): response = requests.post(url, headers=headers, json=data) response_data = response.json() print(response_data) - + assert response_data.get("model") == MODEL_NAME choice = response_data.get("choices")[0] message = choice.get("message") assert message is not None content = message.get("content") assert content is not None assert len(content) > 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME, ""]) +async def test_http_chat_no_model_name_with_openai(server: RemoteOpenAIServer, + model_name: str): + + openai_api_key = "EMPTY" + openai_api_base = f"http://localhost:{server.port}/v1" + + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) + messages = [ + { + "role": "user", + "content": "Hello, vLLM!" + }, + ] + response = client.chat.completions.create( + model="", # empty string + messages=messages, + ) + assert response.model == MODEL_NAME diff --git a/tests/entrypoints/openai/test_chat_logit_bias_validation.py b/tests/entrypoints/openai/test_chat_logit_bias_validation.py new file mode 100644 index 0000000000000..9dab524ea4801 --- /dev/null +++ b/tests/entrypoints/openai/test_chat_logit_bias_validation.py @@ -0,0 +1,88 @@ +# SPDX-License-Identifier: Apache-2.0 + +import openai +import pytest +import pytest_asyncio + +from vllm.config import ModelConfig + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct" + + +def get_vocab_size(model_name): + config = ModelConfig( + model=model_name, + task="auto", + tokenizer=model_name, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="bfloat16", + ) + return config.get_vocab_size() + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--dtype", + "bfloat16", + "--max-model-len", + "1024", + "--enforce-eager", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield 
remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +async def test_chat_logit_bias_valid(client): + """Test that valid logit_bias values are accepted in chat completions.""" + vocab_size = get_vocab_size(MODEL_NAME) + valid_token_id = vocab_size - 1 + + completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=[{ + "role": "user", + "content": "Testing valid logit bias" + }], + max_tokens=5, + logit_bias={str(valid_token_id): 1.0}, + ) + + assert completion.choices[0].message.content is not None + + +@pytest.mark.asyncio +async def test_chat_logit_bias_invalid(client): + """Test that invalid logit_bias values are rejected in chat completions.""" + vocab_size = get_vocab_size(MODEL_NAME) + invalid_token_id = vocab_size + 1 + + with pytest.raises(openai.BadRequestError) as excinfo: + await client.chat.completions.create( + model=MODEL_NAME, + messages=[{ + "role": "user", + "content": "Testing invalid logit bias" + }], + max_tokens=5, + logit_bias={str(invalid_token_id): 1.0}, + ) + + error = excinfo.value + error_message = str(error) + + assert error.status_code == 400 + assert str(invalid_token_id) in error_message + assert str(vocab_size) in error_message diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 0d1c936da7597..2cdeb684f75de 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -11,6 +11,7 @@ import requests from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.transformers_utils.tokenizer import get_tokenizer +from ...models.embedding.utils import check_embeddings_close from ...utils import RemoteOpenAIServer MODEL_NAME = "intfloat/multilingual-e5-small" @@ -190,30 +191,35 @@ async def test_batch_base64_embedding(client: openai.AsyncOpenAI, responses_float = await 
client.embeddings.create(input=input_texts, model=model_name, encoding_format="float") + float_data = [d.embedding for d in responses_float.data] responses_base64 = await client.embeddings.create(input=input_texts, model=model_name, encoding_format="base64") - - decoded_responses_base64_data = [] + base64_data = [] for data in responses_base64.data: - decoded_responses_base64_data.append( + base64_data.append( np.frombuffer(base64.b64decode(data.embedding), dtype="float32").tolist()) - assert responses_float.data[0].embedding == decoded_responses_base64_data[ - 0] - assert responses_float.data[1].embedding == decoded_responses_base64_data[ - 1] + check_embeddings_close( + embeddings_0_lst=float_data, + embeddings_1_lst=base64_data, + name_0="float", + name_1="base64", + ) # Default response is float32 decoded from base64 by OpenAI Client responses_default = await client.embeddings.create(input=input_texts, model=model_name) + default_data = [d.embedding for d in responses_default.data] - assert responses_float.data[0].embedding == responses_default.data[ - 0].embedding - assert responses_float.data[1].embedding == responses_default.data[ - 1].embedding + check_embeddings_close( + embeddings_0_lst=float_data, + embeddings_1_lst=default_data, + name_0="float", + name_1="default", + ) @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_embedding_dimensions.py b/tests/entrypoints/openai/test_embedding_dimensions.py new file mode 100644 index 0000000000000..43d109f74f5da --- /dev/null +++ b/tests/entrypoints/openai/test_embedding_dimensions.py @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`. 
+""" + +import openai +import pytest + +from vllm.entrypoints.openai.protocol import EmbeddingResponse + +from ...models.embedding.utils import EmbedModelInfo +from ...utils import RemoteOpenAIServer + +MODELS = [ + EmbedModelInfo(name="BAAI/bge-m3", is_matryoshka=False), + EmbedModelInfo(name="jinaai/jina-embeddings-v3", is_matryoshka=True), +] + +input_texts = [ + "The chef prepared a delicious meal.", +] * 3 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model", MODELS) +async def test_validating_dimensions(model: EmbedModelInfo): + args = [ + "--task", + "embed", + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--enforce-eager", + "--max-model-len", + "512", + "--trust_remote_code" + ] + with RemoteOpenAIServer(model.name, args) as remote_server: + client = remote_server.get_async_client() + + async def make_request(dimensions): + embedding_response = await client.embeddings.create( + model=model.name, + input=input_texts, + dimensions=dimensions, + encoding_format="float", + ) + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + + assert embeddings.id is not None + assert len(embeddings.data) == 3 + assert len(embeddings.data[0].embedding) > 0 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens > 0 + assert embeddings.usage.total_tokens > 0 + + if dimensions is not None: + assert len(embeddings.data[0].embedding) == dimensions + + if model.is_matryoshka: + for dimensions in [None, 16]: + await make_request(dimensions) + + with pytest.raises(openai.BadRequestError): + for dimensions in [-1]: + await make_request(dimensions) + + else: + for dimensions in [None]: + await make_request(dimensions) + + with pytest.raises(openai.BadRequestError): + for dimensions in [-1, 16]: + await make_request(dimensions) diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py index 
1a62157acc478..2fc08b47513e6 100644 --- a/tests/entrypoints/openai/test_lora_adapters.py +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -53,7 +53,20 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") -def server_with_lora_modules_json(zephyr_lora_files): +def monkeypatch_module(): + from _pytest.monkeypatch import MonkeyPatch + mpatch = MonkeyPatch() + yield mpatch + mpatch.undo() + + +@pytest.fixture(scope="module", params=[False, True]) +def server_with_lora_modules_json(request, monkeypatch_module, + zephyr_lora_files): + + use_v1 = request.param + monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') + # Define the json format LoRA module configurations lora_module_1 = { "name": "zephyr-lora", diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py new file mode 100644 index 0000000000000..c96151349eb3f --- /dev/null +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -0,0 +1,209 @@ +# SPDX-License-Identifier: Apache-2.0 + +from contextlib import suppress +from dataclasses import dataclass, field +from http import HTTPStatus +from typing import Optional +from unittest.mock import MagicMock + +import pytest + +from vllm.config import MultiModalConfig +from vllm.engine.multiprocessing.client import MQLLMEngineClient +from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse +from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion +from vllm.entrypoints.openai.serving_models import (BaseModelPath, + OpenAIServingModels) +from vllm.lora.request import LoRARequest +from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry +from vllm.transformers_utils.tokenizer import get_tokenizer + +MODEL_NAME = "openai-community/gpt2" +BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)] + +MOCK_RESOLVER_NAME = "mock_test_resolver" + + +@dataclass +class MockHFConfig: + model_type: str = "any" + + +@dataclass +class 
MockModelConfig: + """Minimal mock ModelConfig for testing.""" + model: str = MODEL_NAME + tokenizer: str = MODEL_NAME + trust_remote_code: bool = False + tokenizer_mode: str = "auto" + max_model_len: int = 100 + tokenizer_revision: Optional[str] = None + multimodal_config: MultiModalConfig = field( + default_factory=MultiModalConfig) + hf_config: MockHFConfig = field(default_factory=MockHFConfig) + logits_processor_pattern: Optional[str] = None + diff_sampling_param: Optional[dict] = None + allowed_local_media_path: str = "" + encoder_config = None + generation_config: str = "auto" + + def get_diff_sampling_param(self): + return self.diff_sampling_param or {} + + +class MockLoRAResolver(LoRAResolver): + + async def resolve_lora(self, base_model_name: str, + lora_name: str) -> Optional[LoRARequest]: + if lora_name == "test-lora": + return LoRARequest(lora_name="test-lora", + lora_int_id=1, + lora_local_path="/fake/path/test-lora") + elif lora_name == "invalid-lora": + return LoRARequest(lora_name="invalid-lora", + lora_int_id=2, + lora_local_path="/fake/path/invalid-lora") + return None + + +@pytest.fixture(autouse=True) +def register_mock_resolver(): + """Fixture to register and unregister the mock LoRA resolver.""" + resolver = MockLoRAResolver() + LoRAResolverRegistry.register_resolver(MOCK_RESOLVER_NAME, resolver) + yield + # Cleanup: remove the resolver after the test runs + if MOCK_RESOLVER_NAME in LoRAResolverRegistry.resolvers: + del LoRAResolverRegistry.resolvers[MOCK_RESOLVER_NAME] + + +@pytest.fixture +def mock_serving_setup(): + """Provides a mocked engine and serving completion instance.""" + mock_engine = MagicMock(spec=MQLLMEngineClient) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + + def mock_add_lora_side_effect(lora_request: LoRARequest): + """Simulate engine behavior when adding LoRAs.""" + if lora_request.lora_name == "test-lora": + # Simulate successful addition + return + elif 
lora_request.lora_name == "invalid-lora": + # Simulate failure during addition (e.g. invalid format) + raise ValueError(f"Simulated failure adding LoRA: " + f"{lora_request.lora_name}") + + mock_engine.add_lora.side_effect = mock_add_lora_side_effect + mock_engine.generate.reset_mock() + mock_engine.add_lora.reset_mock() + + mock_model_config = MockModelConfig() + models = OpenAIServingModels(engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config) + + serving_completion = OpenAIServingCompletion(mock_engine, + mock_model_config, + models, + request_logger=None) + + return mock_engine, serving_completion + + +@pytest.mark.asyncio +async def test_serving_completion_with_lora_resolver(mock_serving_setup, + monkeypatch): + monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true") + + mock_engine, serving_completion = mock_serving_setup + + lora_model_name = "test-lora" + req_found = CompletionRequest( + model=lora_model_name, + prompt="Generate with LoRA", + ) + + # Suppress potential errors during the mocked generate call, + # as we are primarily checking for add_lora and generate calls + with suppress(Exception): + await serving_completion.create_completion(req_found) + + mock_engine.add_lora.assert_called_once() + called_lora_request = mock_engine.add_lora.call_args[0][0] + assert isinstance(called_lora_request, LoRARequest) + assert called_lora_request.lora_name == lora_model_name + + mock_engine.generate.assert_called_once() + called_lora_request = mock_engine.generate.call_args[1]['lora_request'] + assert isinstance(called_lora_request, LoRARequest) + assert called_lora_request.lora_name == lora_model_name + + +@pytest.mark.asyncio +async def test_serving_completion_resolver_not_found(mock_serving_setup, + monkeypatch): + monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true") + + mock_engine, serving_completion = mock_serving_setup + + non_existent_model = "non-existent-lora-adapter" + req = 
CompletionRequest( + model=non_existent_model, + prompt="what is 1+1?", + ) + + response = await serving_completion.create_completion(req) + + mock_engine.add_lora.assert_not_called() + mock_engine.generate.assert_not_called() + + assert isinstance(response, ErrorResponse) + assert response.code == HTTPStatus.NOT_FOUND.value + assert non_existent_model in response.message + + +@pytest.mark.asyncio +async def test_serving_completion_resolver_add_lora_fails( + mock_serving_setup, monkeypatch): + monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true") + + mock_engine, serving_completion = mock_serving_setup + + invalid_model = "invalid-lora" + req = CompletionRequest( + model=invalid_model, + prompt="what is 1+1?", + ) + + response = await serving_completion.create_completion(req) + + # Assert add_lora was called before the failure + mock_engine.add_lora.assert_called_once() + called_lora_request = mock_engine.add_lora.call_args[0][0] + assert isinstance(called_lora_request, LoRARequest) + assert called_lora_request.lora_name == invalid_model + + # Assert generate was *not* called due to the failure + mock_engine.generate.assert_not_called() + + # Assert the correct error response + assert isinstance(response, ErrorResponse) + assert response.code == HTTPStatus.BAD_REQUEST.value + assert invalid_model in response.message + + +@pytest.mark.asyncio +async def test_serving_completion_flag_not_set(mock_serving_setup): + mock_engine, serving_completion = mock_serving_setup + + lora_model_name = "test-lora" + req_found = CompletionRequest( + model=lora_model_name, + prompt="Generate with LoRA", + ) + + await serving_completion.create_completion(req_found) + + mock_engine.add_lora.assert_not_called() + mock_engine.generate.assert_not_called() diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 2bffd0ce138e6..42f7b098f917d 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ 
b/tests/entrypoints/openai/test_metrics.py @@ -13,9 +13,12 @@ import requests from prometheus_client.parser import text_string_to_metric_families from transformers import AutoTokenizer +from vllm import version + from ...utils import RemoteOpenAIServer MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" +PREV_MINOR_VERSION = version._prev_minor_version() @pytest.fixture(scope="module", params=[True, False]) @@ -55,6 +58,7 @@ def default_server_args(): "", "--enable-chunked-prefill", "--disable-frontend-multiprocessing", + f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}", ]) def server(use_v1, default_server_args, request): if request.param: @@ -129,7 +133,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer, # Loop over all expected metric_families for metric_family, suffix_values_list in EXPECTED_VALUES.items(): - if use_v1 and metric_family not in EXPECTED_METRICS_V1: + if ((use_v1 and metric_family not in EXPECTED_METRICS_V1) + or (not server.show_hidden_metrics + and metric_family in HIDDEN_DEPRECATED_METRICS)): continue found_metric = False @@ -165,10 +171,10 @@ async def test_metrics_counts(server: RemoteOpenAIServer, EXPECTED_METRICS = [ "vllm:num_requests_running", - "vllm:num_requests_swapped", + "vllm:num_requests_swapped", # deprecated "vllm:num_requests_waiting", "vllm:gpu_cache_usage_perc", - "vllm:cpu_cache_usage_perc", + "vllm:cpu_cache_usage_perc", # deprecated "vllm:time_to_first_token_seconds_sum", "vllm:time_to_first_token_seconds_bucket", "vllm:time_to_first_token_seconds_count", @@ -268,6 +274,11 @@ EXPECTED_METRICS_V1 = [ "vllm:request_decode_time_seconds_count", ] +HIDDEN_DEPRECATED_METRICS = [ + "vllm:num_requests_swapped", + "vllm:cpu_cache_usage_perc", +] + @pytest.mark.asyncio async def test_metrics_exist(server: RemoteOpenAIServer, @@ -282,7 +293,9 @@ async def test_metrics_exist(server: RemoteOpenAIServer, assert response.status_code == HTTPStatus.OK for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS): - 
assert metric in response.text + if (not server.show_hidden_metrics + and metric not in HIDDEN_DEPRECATED_METRICS): + assert metric in response.text def test_metrics_exist_run_batch(use_v1: bool): diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py new file mode 100644 index 0000000000000..1ccb803a328d6 --- /dev/null +++ b/tests/entrypoints/openai/test_openai_schema.py @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest +import schemathesis +from schemathesis import GenerationConfig + +from ...utils import RemoteOpenAIServer + +schemathesis.experimental.OPEN_API_3_1.enable() + +MODEL_NAME = "HuggingFaceTB/SmolVLM-256M-Instruct" +MAXIMUM_IMAGES = 2 + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--task", + "generate", + "--max-model-len", + "2048", + "--max-num-seqs", + "5", + "--enforce-eager", + "--trust-remote-code", + "--limit-mm-per-prompt", + f"image={MAXIMUM_IMAGES}", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest.fixture(scope="module") +def get_schema(server): + # avoid generating null (\x00) bytes in strings during test case generation + return schemathesis.openapi.from_uri( + f"{server.url_root}/openapi.json", + generation_config=GenerationConfig(allow_x00=False), + ) + + +schema = schemathesis.from_pytest_fixture("get_schema") + + +@schema.parametrize() +@schema.override(headers={"Content-Type": "application/json"}) +async def test_openapi_stateless(case): + #No need to verify SSL certificate for localhost + await case.call_and_validate(verify=False) diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index 64a1eb6a63eef..f889189a99681 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -17,7 +17,7 @@ async def test_empty_prompt(): client = remote_server.get_async_client() 
with pytest.raises(openai.BadRequestError, - match=re.compile('.+Prompt cannot be empty.+')): + match="decoder prompt cannot be empty"): await client.completions.create(model=model_name, prompt="", max_tokens=5, diff --git a/tests/entrypoints/openai/test_sleep.py b/tests/entrypoints/openai/test_sleep.py index 8bdf00bcee126..3ca8a9a410ffd 100644 --- a/tests/entrypoints/openai/test_sleep.py +++ b/tests/entrypoints/openai/test_sleep.py @@ -25,15 +25,37 @@ def test_sleep_mode(): "VLLM_SERVER_DEV_MODE": "1", "CUDA_VISIBLE_DEVICES": "0" }) as remote_server: - response = requests.post(remote_server.url_for("/sleep"), - data={"level": "1"}) + response = requests.post(remote_server.url_for("sleep"), + params={"level": "1"}) assert response.status_code == 200 - response = requests.get(remote_server.url_for("/is_sleeping")) + response = requests.get(remote_server.url_for("is_sleeping")) assert response.status_code == 200 assert response.json().get("is_sleeping") is True - response = requests.post(remote_server.url_for("/wake_up")) + response = requests.post(remote_server.url_for("wake_up")) assert response.status_code == 200 - response = requests.get(remote_server.url_for("/is_sleeping")) + response = requests.get(remote_server.url_for("is_sleeping")) + assert response.status_code == 200 + assert response.json().get("is_sleeping") is False + + # test wake up with tags + response = requests.post(remote_server.url_for("sleep"), + params={"level": "1"}) + assert response.status_code == 200 + + response = requests.post(remote_server.url_for("wake_up"), + params={"tags": ["weights"]}) + assert response.status_code == 200 + + # is sleeping should be false after waking up any part of the engine + response = requests.get(remote_server.url_for("is_sleeping")) + assert response.status_code == 200 + assert response.json().get("is_sleeping") is True + + response = requests.post(remote_server.url_for("wake_up"), + params={"tags": ["kv_cache"]}) + assert response.status_code == 200 + + 
response = requests.get(remote_server.url_for("is_sleeping")) assert response.status_code == 200 assert response.json().get("is_sleeping") is False diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index 29571bcd7649b..5c48df3cebbc2 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -192,3 +192,36 @@ async def test_stream_options(winning_call): else: continuous = continuous and hasattr(chunk, 'usage') assert final and continuous + + +@pytest.mark.asyncio +async def test_sampling_params(mary_had_lamb): + """ + Compare sampling with params and greedy sampling to assert results + are different when extreme sampling parameters values are picked. + """ + model_name = "openai/whisper-small" + server_args = ["--enforce-eager"] + with RemoteOpenAIServer(model_name, server_args) as remote_server: + client = remote_server.get_async_client() + transcription = await client.audio.transcriptions.create( + model=model_name, + file=mary_had_lamb, + language="en", + temperature=0.8, + extra_body=dict(seed=42, + repetition_penalty=1.9, + top_k=12, + top_p=0.4, + min_p=0.5, + frequency_penalty=1.8, + presence_penalty=2.0)) + + greedy_transcription = await client.audio.transcriptions.create( + model=model_name, + file=mary_had_lamb, + language="en", + temperature=0.0, + extra_body=dict(seed=42)) + + assert greedy_transcription.text != transcription.text diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index f9ccce9c1c332..53f057a294c0a 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +import json + import openai import pytest import pytest_asyncio @@ -31,7 +33,7 @@ def server(): "--enforce-eager", "--trust-remote-code", "--limit-mm-per-prompt", - f"video={MAXIMUM_VIDEOS}", + 
json.dumps({"video": MAXIMUM_VIDEOS}), ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -106,6 +108,35 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI, assert message.content is not None and len(message.content) >= 0 +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +async def test_error_on_invalid_video_url_type(client: openai.AsyncOpenAI, + model_name: str, + video_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "video_url", + "video_url": video_url + }, + { + "type": "text", + "text": "What's in this video?" + }, + ], + }] + + # video_url should be a dict {"url": "some url"}, not directly a string + with pytest.raises(openai.BadRequestError): + _ = await client.chat.completions.create(model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0) + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index bb100e573b878..1ab50b41c7ecb 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -1,8 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 +import json + import openai import pytest import pytest_asyncio +import requests +from PIL import Image +from transformers import AutoProcessor from vllm.multimodal.utils import encode_image_base64, fetch_image @@ -32,7 +37,7 @@ def server(): "--enforce-eager", "--trust-remote-code", "--limit-mm-per-prompt", - f"image={MAXIMUM_IMAGES}", + json.dumps({"image": MAXIMUM_IMAGES}), ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -53,11 +58,31 @@ def base64_encoded_image() -> dict[str, str]: } +def get_hf_prompt_tokens(model_name, content, image_url): + processor = AutoProcessor.from_pretrained(model_name, + 
trust_remote_code=True, + num_crops=4) + + placeholder = "<|image_1|>\n" + messages = [{ + "role": "user", + "content": f"{placeholder}{content}", + }] + images = [Image.open(requests.get(image_url, stream=True).raw)] + + prompt = processor.tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True) + inputs = processor(prompt, images, return_tensors="pt") + + return inputs.input_ids.shape[1] + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) async def test_single_chat_session_image(client: openai.AsyncOpenAI, model_name: str, image_url: str): + content_text = "What's in this image?" messages = [{ "role": "user", @@ -70,16 +95,17 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, }, { "type": "text", - "text": "What's in this image?" + "text": content_text }, ], }] + max_completion_tokens = 10 # test single completion chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_completion_tokens=10, + max_completion_tokens=max_completion_tokens, logprobs=True, temperature=0.0, top_logprobs=5) @@ -87,8 +113,12 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, choice = chat_completion.choices[0] assert choice.finish_reason == "length" + hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, + image_url) assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=774, total_tokens=784) + completion_tokens=max_completion_tokens, + prompt_tokens=hf_prompt_tokens, + total_tokens=hf_prompt_tokens + max_completion_tokens) message = choice.message message = chat_completion.choices[0].message @@ -107,6 +137,36 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, assert message.content is not None and len(message.content) >= 0 +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) 
+@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI, + model_name: str, + image_url: str): + content_text = "What's in this image?" + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": image_url + }, + { + "type": "text", + "text": content_text + }, + ], + }] + + # image_url should be a dict {"url": "some url"}, not directly a string + with pytest.raises(openai.BadRequestError): + _ = await client.chat.completions.create(model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0) + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @@ -150,6 +210,7 @@ async def test_single_chat_session_image_base64encoded( client: openai.AsyncOpenAI, model_name: str, image_url: str, base64_encoded_image: dict[str, str]): + content_text = "What's in this image?" messages = [{ "role": "user", @@ -163,16 +224,17 @@ async def test_single_chat_session_image_base64encoded( }, { "type": "text", - "text": "What's in this image?" 
+ "text": content_text }, ], }] + max_completion_tokens = 10 # test single completion chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_completion_tokens=10, + max_completion_tokens=max_completion_tokens, logprobs=True, temperature=0.0, top_logprobs=5) @@ -180,8 +242,12 @@ async def test_single_chat_session_image_base64encoded( choice = chat_completion.choices[0] assert choice.finish_reason == "length" + hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, + image_url) assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=774, total_tokens=784) + completion_tokens=max_completion_tokens, + prompt_tokens=hf_prompt_tokens, + total_tokens=hf_prompt_tokens + max_completion_tokens) message = choice.message message = chat_completion.choices[0].message diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index 74e5c4cc7ea4a..26c68e06c199f 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -1,7 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 +import json + import pytest import requests +from PIL import Image +from transformers import AutoProcessor from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.multimodal.utils import encode_image_base64, fetch_image @@ -35,7 +39,7 @@ def server(): "--enforce-eager", "--trust-remote-code", "--limit-mm-per-prompt", - f"image={MAXIMUM_IMAGES}", + json.dumps({"image": MAXIMUM_IMAGES}), "--chat-template", str(vlm2vec_jinja_path), ] @@ -52,11 +56,24 @@ def base64_encoded_image() -> dict[str, str]: } +def get_hf_prompt_tokens(model_name, content, image_url): + processor = AutoProcessor.from_pretrained(model_name, + trust_remote_code=True, + num_crops=4) + + placeholder = "<|image_1|> " + prompt = f"{placeholder}{content}" + images = [Image.open(requests.get(image_url, stream=True).raw)] 
+ inputs = processor(prompt, images, return_tensors="pt") + return inputs.input_ids.shape[1] + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, image_url: str): + content_text = "Represent the given image." messages = [{ "role": "user", @@ -69,7 +86,7 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, }, { "type": "text", - "text": "Represent the given image." + "text": content_text }, ], }] @@ -85,9 +102,12 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, response.raise_for_status() embeddings = EmbeddingResponse.model_validate(response.json()) + hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, + image_url) + assert embeddings.id is not None assert len(embeddings.data) == 1 assert len(embeddings.data[0].embedding) == 3072 assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens == 763 - assert embeddings.usage.total_tokens == 763 + assert embeddings.usage.prompt_tokens == hf_prompt_tokens + assert embeddings.usage.total_tokens == hf_prompt_tokens diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 6efed990b1893..92c1e0fec6b74 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -9,11 +9,11 @@ from transformers import __version__ as TRANSFORMERS_VERSION from vllm.assets.image import ImageAsset from vllm.config import ModelConfig -from vllm.entrypoints.chat_utils import (_resolve_hf_chat_template, - _try_extract_ast, load_chat_template, +from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template, parse_chat_messages, parse_chat_messages_futures, - resolve_chat_template_content_format) + resolve_chat_template_content_format, + resolve_hf_chat_template) from vllm.entrypoints.llm import 
apply_hf_chat_template from vllm.multimodal import MultiModalDataDict from vllm.multimodal.utils import encode_image_base64 @@ -25,11 +25,13 @@ EXAMPLES_DIR = VLLM_PATH / "examples" PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct" ULTRAVOX_MODEL_ID = "fixie-ai/ultravox-v0_5-llama-3_2-1b" +QWEN2AUDIO_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct" QWEN2VL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct" QWEN25VL_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct" MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B" HERMES_MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B" +MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" @pytest.fixture(scope="function") @@ -80,6 +82,30 @@ def mllama_tokenizer(): ) +@pytest.fixture(scope="function") +def mistral_model_config(): + return ModelConfig(MISTRAL_MODEL_ID, + task="generate", + tokenizer=MISTRAL_MODEL_ID, + tokenizer_mode="auto", + trust_remote_code=True, + dtype="auto", + seed=0, + limit_mm_per_prompt={ + "image": 2, + }) + + +@pytest.fixture(scope="module") +def mistral_tokenizer(): + return TokenizerGroup( + tokenizer_id=MISTRAL_MODEL_ID, + enable_lora=False, + max_num_seqs=5, + max_input_length=None, + ) + + @pytest.fixture(scope="module") def image_url(): image = ImageAsset('cherry_blossom') @@ -131,6 +157,66 @@ def test_parse_chat_messages_single_image( _assert_mm_data_is_image_input(mm_data, 1) +def test_parse_chat_messages_empty_system( + mistral_model_config, + mistral_tokenizer, +): + # Test string format + conversation, _ = parse_chat_messages( + [{ + "role": "system", + "content": "" + }, { + "role": "user", + "content": [{ + "type": "text", + "text": "Who are you?" + }] + }], + mistral_model_config, + mistral_tokenizer, + content_format="string", + ) + assert conversation == [{ + "role": "system", + "content": "" + }, { + "role": "user", + "content": "Who are you?" 
+ }] + + # Test openai format + conversation, _ = parse_chat_messages( + [{ + "role": "system", + "content": "" + }, { + "role": "user", + "content": [{ + "type": "text", + "text": "Who are you?" + }] + }], + mistral_model_config, + mistral_tokenizer, + content_format="openai", + ) + assert conversation == [{ + "role": "system", + "content": [{ + "type": "text", + "text": "" + }] + }, { + "role": + "user", + "content": [{ + "type": "text", + "text": "Who are you?" + }] + }] + + @pytest.mark.asyncio async def test_parse_chat_messages_single_image_async( phi3v_model_config, @@ -671,7 +757,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url): # Build a config for the model model_config = ModelConfig(model, task="generate", - tokenizer=MLLAMA_MODEL_ID, + tokenizer=model, tokenizer_mode="auto", trust_remote_code=True, dtype="auto", @@ -682,7 +768,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url): # Build the tokenizer group and grab the underlying tokenizer tokenizer_group = TokenizerGroup( - MLLAMA_MODEL_ID, + model, enable_lora=False, max_num_seqs=5, max_input_length=None, @@ -747,7 +833,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): }] if use_tools else None # Test detecting the tokenizer's chat_template - chat_template = _resolve_hf_chat_template( + chat_template = resolve_hf_chat_template( tokenizer, chat_template=None, tools=tools, @@ -756,6 +842,8 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): assert isinstance(chat_template, str) +# NOTE: Qwen2-Audio default chat template is specially defined inside +# processor class instead of using `tokenizer_config.json` # yapf: disable @pytest.mark.parametrize( ("model", "expected_format"), @@ -763,6 +851,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): (QWEN2VL_MODEL_ID, "openai"), (QWEN25VL_MODEL_ID, "openai"), (ULTRAVOX_MODEL_ID, "string"), + (QWEN2AUDIO_MODEL_ID, "openai"), (MLLAMA_MODEL_ID, 
"openai"), (LLAMA_GUARD_MODEL_ID, "openai")], ) @@ -781,7 +870,7 @@ def test_resolve_content_format_hf_defined(model, expected_format): tokenizer = tokenizer_group.tokenizer # Test detecting the tokenizer's chat_template - chat_template = _resolve_hf_chat_template( + chat_template = resolve_hf_chat_template( tokenizer, chat_template=None, tools=None, @@ -815,10 +904,13 @@ def test_resolve_content_format_hf_defined(model, expected_format): ("template_chatglm2.jinja", "string"), ("template_chatml.jinja", "string"), ("template_deepseek_vl2.jinja", "string"), + ("template_dse_qwen2_vl.jinja", "openai"), ("template_falcon_180b.jinja", "string"), ("template_falcon.jinja", "string"), + ("template_florence2.jinja", "string"), ("template_inkbot.jinja", "string"), ("template_llava.jinja", "string"), + ("template_teleflm.jinja", "string"), ("template_vlm2vec.jinja", "openai"), ("tool_chat_template_granite_20b_fc.jinja", "string"), ("tool_chat_template_hermes.jinja", "string"), diff --git a/tests/kernels/conftest.py b/tests/kernels/attention/conftest.py similarity index 100% rename from tests/kernels/conftest.py rename to tests/kernels/attention/conftest.py diff --git a/tests/kernels/test_attention.py b/tests/kernels/attention/test_attention.py similarity index 99% rename from tests/kernels/test_attention.py rename to tests/kernels/attention/test_attention.py index 0d7898a900e48..e5650136f2584 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -6,13 +6,12 @@ from typing import Optional import pytest import torch +from tests.kernels.allclose_default import get_default_atol, get_default_rtol from tests.kernels.utils import opcheck from vllm import _custom_ops as ops from vllm.platforms import current_platform from vllm.utils import get_max_shared_memory_bytes -from .allclose_default import get_default_atol, get_default_rtol - if not current_platform.is_rocm(): from xformers import ops as xops from xformers.ops.fmha.attn_bias import 
BlockDiagonalCausalMask diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py new file mode 100644 index 0000000000000..b0414244c2151 --- /dev/null +++ b/tests/kernels/attention/test_attention_selector.py @@ -0,0 +1,252 @@ +# SPDX-License-Identifier: Apache-2.0 + +from unittest.mock import patch + +import pytest +import torch + +from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend +from vllm.platforms.cpu import CpuPlatform +from vllm.platforms.cuda import CudaPlatform +from vllm.platforms.rocm import RocmPlatform +from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL + + +@pytest.fixture(autouse=True) +def clear_cache(): + """Clear lru cache to ensure each test case runs without caching. + """ + _cached_get_attn_backend.cache_clear() + + +# Define MLA and non-MLA backends separately +DEVICE_MLA_BACKENDS = { + "cuda": ["TRITON_MLA", "FLASHMLA"], + "hip": ["TRITON_MLA", "ROCM_AITER_MLA"], + "cpu": [], +} + +DEVICE_REGULAR_ATTN_BACKENDS = { + "cuda": ["XFORMERS", "FLASHINFER"], + "hip": ["ROCM_FLASH"], + "cpu": ["TORCH_SDPA"], +} + +DEVICE_MLA_BLOCK_SIZES = { + "cuda": [16, 64], # CUDA supports both standard and extended block sizes + "hip": [16, 1], # HIP requires special handling for block_size=1 + "cpu": [16] # CPU uses fixed block size from test cases +} + + +def generate_params(): + params = [] + for use_mla in [True, False]: + for device in ["cuda", "hip", "cpu"]: + backends = DEVICE_MLA_BACKENDS[ + device] if use_mla else DEVICE_REGULAR_ATTN_BACKENDS[device] + for name in backends: + block_sizes = DEVICE_MLA_BLOCK_SIZES[device] if use_mla else [ + 16 + ] + for block_size in block_sizes: + params.append( + pytest.param( + device, + name, + use_mla, + block_size, + id= + f"{device}_{name}_mla_{str(use_mla)[0]}_blks{block_size}" + )) + return params + + +@pytest.mark.parametrize("device, name, use_mla, block_size", + generate_params()) 
+@pytest.mark.parametrize("use_v1", [True, False]) +def test_env( + device: str, + name: str, + use_mla: bool, + block_size: int, + use_v1: bool, + monkeypatch: pytest.MonkeyPatch, +): + """Test attention backend selection with valid device-backend pairs.""" + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + m.setenv(STR_BACKEND_ENV_VAR, name) + m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0") + + if device == "cpu": + with patch("vllm.attention.selector.current_platform", + CpuPlatform()): + backend = get_attn_backend(16, torch.float16, torch.float16, + block_size, False) + assert backend.get_name() == "TORCH_SDPA" + + elif device == "hip": + with patch("vllm.attention.selector.current_platform", + RocmPlatform()): + if use_mla: + # Validate HIP MLA backend-block_size combinations + valid_combination = ( + (name == "TRITON_MLA" and block_size != 1) + or (name == "ROCM_AITER_MLA" and block_size == 1)) + + if valid_combination: + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + assert backend.get_name() == name + else: + with pytest.raises(ValueError) as exc_info: + get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + assert f"The selected backend, {name}" in str( + exc_info.value) + else: + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + expected = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" + assert backend.get_name() == expected + + elif device == "cuda": + with patch("vllm.attention.selector.current_platform", + CudaPlatform()): + if use_mla: + if name == "FLASHMLA" and block_size == 64: + from vllm.attention.backends.flashmla import ( + is_flashmla_supported) + + # only on cuda platforms with specific capability. + is_supported, _ = is_flashmla_supported() + + if not is_supported: + # if platform is not supported then skip this case. 
+ pytest.skip() + else: + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + expected = f"{name}_VLLM_V1" if use_v1 else name + assert backend.get_name() == expected + else: + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + expected = ("TRITON_MLA_VLLM_V1" + if use_v1 else "TRITON_MLA") + assert backend.get_name() == expected + elif name == "FLASHINFER": + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + expected = "FLASHINFER_VLLM_V1" if use_v1 else name + assert backend.get_name() == expected + else: + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name + assert backend.get_name() == expected + + +def test_flash_attn(monkeypatch: pytest.MonkeyPatch): + """Test FlashAttn validation.""" + # TODO: When testing for v1, pipe in `use_v1` as an argument to + # get_attn_backend + + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL) + + # Unsupported CUDA arch + monkeypatch.setattr(torch.cuda, "get_device_capability", lambda: + (7, 5)) + backend = get_attn_backend(16, torch.float16, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # Reset the monkeypatch for subsequent tests + monkeypatch.undo() + + # Unsupported data type + backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # Unsupported kv cache data type + backend = get_attn_backend(16, torch.float16, "fp8", 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # Unsupported block size + backend = get_attn_backend(16, torch.float16, None, 8, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # flash-attn is not installed + import sys + original_module = sys.modules.get('vllm_flash_attn') + 
monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None) + backend = get_attn_backend(16, torch.float16, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # Restore the original module if it existed + if original_module is not None: + monkeypatch.setitem(sys.modules, 'vllm_flash_attn', + original_module) + else: + monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False) + + # Unsupported head size + backend = get_attn_backend(17, torch.float16, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # Attention-free models should bypass env and use PlaceholderAttention + backend = get_attn_backend(16, torch.float16, torch.float16, 16, True) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + +@pytest.mark.parametrize("use_v1", [True, False]) +def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch): + + with monkeypatch.context() as m, patch( + "vllm.attention.selector.current_platform", CudaPlatform()): + m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) + + # Test with head size 32 + backend = get_attn_backend(32, torch.float16, None, 16, False) + EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN" + assert backend.get_name() == EXPECTED + + # when block size == 16, backend will fall back to XFORMERS + # this behavior is not yet supported on V1. + if use_v1: + # TODO: support fallback on V1! 
+ # https://github.com/vllm-project/vllm/issues/14524 + pass + else: + backend = get_attn_backend(16, torch.float16, None, 16, False) + assert backend.get_name() == "XFORMERS" diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/attention/test_blocksparse_attention.py similarity index 99% rename from tests/kernels/test_blocksparse_attention.py rename to tests/kernels/attention/test_blocksparse_attention.py index 3025ae0f921a4..82d038257575c 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/attention/test_blocksparse_attention.py @@ -6,14 +6,13 @@ from typing import Optional import pytest import torch +from tests.kernels.allclose_default import get_default_atol, get_default_rtol from vllm import _custom_ops as ops from vllm.attention.ops.blocksparse_attention.interface import ( LocalStridedBlockSparseAttn) from vllm.platforms import current_platform from vllm.utils import get_max_shared_memory_bytes -from .allclose_default import get_default_atol, get_default_rtol - FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. 
# - 512 as a buffer diff --git a/tests/kernels/test_cache.py b/tests/kernels/attention/test_cache.py similarity index 100% rename from tests/kernels/test_cache.py rename to tests/kernels/attention/test_cache.py diff --git a/tests/kernels/test_cascade_flash_attn.py b/tests/kernels/attention/test_cascade_flash_attn.py similarity index 100% rename from tests/kernels/test_cascade_flash_attn.py rename to tests/kernels/attention/test_cascade_flash_attn.py diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/attention/test_encoder_decoder_attn.py similarity index 100% rename from tests/kernels/test_encoder_decoder_attn.py rename to tests/kernels/attention/test_encoder_decoder_attn.py diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py similarity index 100% rename from tests/kernels/test_flash_attn.py rename to tests/kernels/attention/test_flash_attn.py diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py similarity index 100% rename from tests/kernels/test_flashinfer.py rename to tests/kernels/attention/test_flashinfer.py diff --git a/tests/kernels/test_flashmla.py b/tests/kernels/attention/test_flashmla.py similarity index 98% rename from tests/kernels/test_flashmla.py rename to tests/kernels/attention/test_flashmla.py index 21c1079fc8eb3..3985c6834f60e 100644 --- a/tests/kernels/test_flashmla.py +++ b/tests/kernels/attention/test_flashmla.py @@ -124,7 +124,7 @@ def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal, cal_diff(out_flash, out_torch, "out") cal_diff(lse_flash, lse_torch, "lse") - t = triton.testing.do_bench(flash_mla, fast_flush=False) + t = triton.testing.do_bench(flash_mla) FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) diff --git a/tests/kernels/attention/test_lightning_attn.py b/tests/kernels/attention/test_lightning_attn.py new 
file mode 100644 index 0000000000000..fbad52987dd2b --- /dev/null +++ b/tests/kernels/attention/test_lightning_attn.py @@ -0,0 +1,286 @@ +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import torch + +from vllm.model_executor.layers.lightning_attn import ( + linear_decode_forward_triton) +from vllm.platforms import current_platform + +NUM_HEADS = [4, 8] +HEAD_SIZES = [64] +BATCH_SIZES = [1, 2] +SEQ_LENGTHS = [16] +DTYPES = [torch.float32] + + +def reference_lightning_attention(q, k, v, ed, block_size, kv_history): + """Reference implementation of lightning attention core algorithm + + The difference from the main implementation is that this processes + each step sequentially, instead of using parallelized triton kernels + """ + B, H, S, D = q.shape + E = v.shape[-1] + dtype = q.dtype + output = torch.zeros((B, H, S, E), dtype=dtype, device=q.device) + + # Use clone() to ensure an independent copy + if kv_history is None: + kv_cache = torch.zeros((B, H, D, E), dtype=dtype, device=q.device) + else: + kv_cache = kv_history.clone() + + # More efficient implementation + # Convert decay factors to matrix form + if ed.dim() == 1: + decay = torch.exp(-ed).view(1, -1, 1, 1) + else: + decay = torch.exp(-ed) + + for b in range(B): + for step in range(S): + # Process all heads at once for this position + q_bs = q[b, :, step] # [H, D] + k_bs = k[b, :, step] # [H, D] + v_bs = v[b, :, step] # [H, E] + + # Calculate KV outer products for all heads + for h in range(H): + # Calculate KV outer product + kv_outer = torch.outer(k_bs[h], v_bs[h]) + + # Update KV cache with decay + # Note: Using the same order as in the Triton kernel + kv_cache[b, h] = decay[0, h, 0, 0] * kv_cache[b, h] + kv_outer + + # Calculate attention output + output[b, h, step] = torch.matmul(q_bs[h], kv_cache[b, h]) + + # Match the shape returned by the actual implementation + # The actual implementation returns a tensor of shape [B, H, 2, D, E] + # where dimension 2 contains both KV and KV history + 
kv_reshaped = kv_cache.unsqueeze(2) # [B, H, 1, D, E] + final_kv_cache = torch.cat([kv_reshaped, kv_reshaped], + dim=2) # [B, H, 2, D, E] + + return output, final_kv_cache + + +def reference_linear_decode(q, k, v, kv_caches, slope_rate, slot_idx): + """Reference implementation: linear attention decode function""" + B, H, _, D = q.shape + output = torch.zeros(B, H * D, dtype=q.dtype, device=q.device) + + # Calculate decay factors once (more efficient) + decay = torch.exp(-slope_rate).view(-1, 1, 1) # [H, 1, 1] + + # Process each batch + for b in range(B): + slot_id = slot_idx[b].item() + + # Skip padding positions + if slot_id == -1: + continue + + # Process all heads at once for this batch + q_b = q[b, :, 0] # [H, D] + k_b = k[b, :, 0] # [H, D] + v_b = v[b, :, 0] # [H, D] + + # Process each attention head + for h in range(H): + # Get current query, key and value + q_bh = q_b[h] + k_bh = k_b[h] + v_bh = v_b[h] + + # Get cache + kv_cache_old = kv_caches[b, h] + + # Calculate new key-value outer product + kv_outer = torch.outer(k_bh, v_bh) + + # Apply decay and update cache + kv_new = kv_outer + decay[h, 0, 0] * kv_cache_old + + # Calculate output + out_h = torch.matmul(q_bh, kv_new) + + # Update output and cache + output[b, h * D:(h + 1) * D] = out_h + kv_caches[b, h] = kv_new + + return output + + +@pytest.mark.parametrize("batch_size", BATCH_SIZES) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +@torch.inference_mode() +def test_linear_decode_forward_triton( + batch_size: int, + num_heads: int, + head_size: int, + dtype: torch.dtype, +): + torch.set_default_device("cuda") + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) + current_platform.seed_everything(42) + base = 0.01 + q = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype) + k = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype) + v = base * torch.randn(batch_size, 
num_heads, 1, head_size, dtype=dtype) + + kv_caches = base * torch.randn(batch_size, + num_heads, + head_size, + head_size, + dtype=dtype, + device="cuda") + + kv_caches_copy = kv_caches.clone() + + slope_rate = torch.zeros(num_heads, device="cuda") + for h in range(num_heads): + slope_rate[h] = 0.1 * (h + 1) + + slot_idx = torch.arange(batch_size, device="cuda") + + triton_output = linear_decode_forward_triton(q, k, v, kv_caches, + slope_rate, slot_idx) + + reference_output = reference_linear_decode(q, k, v, kv_caches_copy, + slope_rate, slot_idx) + torch.testing.assert_close(triton_output, + reference_output, + rtol=1e-1, + atol=1e-1) + torch.testing.assert_close(kv_caches, kv_caches_copy, rtol=1e-1, atol=1e-1) + + assert triton_output.shape == (batch_size, num_heads * head_size) + + +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +@torch.inference_mode() +def test_linear_decode_forward_triton_with_padding( + num_heads: int, + head_size: int, + dtype: torch.dtype, +): + torch.set_default_device("cuda") + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) + current_platform.seed_everything(42) + + batch_size = 4 + base = 0.01 + q = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype) + k = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype) + v = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype) + + kv_caches = base * torch.randn(batch_size, + num_heads, + head_size, + head_size, + dtype=dtype, + device="cuda") + + kv_caches_copy = kv_caches.clone() + + slope_rate = torch.zeros(num_heads, device="cuda") + for h in range(num_heads): + slope_rate[h] = 0.1 * (h + 1) + + slot_idx = torch.tensor([0, 1, -1, 2], device="cuda") + + triton_output = linear_decode_forward_triton(q, k, v, kv_caches, + slope_rate, slot_idx) + + reference_output = reference_linear_decode(q, k, v, kv_caches_copy, + slope_rate, slot_idx) + + 
padding_mask = (slot_idx + != -1).unsqueeze(1).expand(-1, num_heads * head_size) + + triton_masked = triton_output[padding_mask] + reference_masked = reference_output[padding_mask] + + atol, rtol = 1.5e-1, 1.5e-1 + + valid_indices = slot_idx != -1 + + for i in range(batch_size): + if valid_indices[i] > 0: + torch.testing.assert_close(kv_caches[i], + kv_caches_copy[i], + rtol=rtol, + atol=atol) + + torch.testing.assert_close(triton_masked, + reference_masked, + rtol=rtol, + atol=atol) + + assert triton_output.shape == (batch_size, num_heads * head_size) + + +@pytest.mark.parametrize("batch_size", BATCH_SIZES) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("seq_len", SEQ_LENGTHS) +@pytest.mark.parametrize("dtype", DTYPES) +@torch.inference_mode() +def test_lightning_attention_reference( + batch_size: int, + num_heads: int, + head_size: int, + seq_len: int, + dtype: torch.dtype, +): + torch.set_default_device("cuda") + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) + current_platform.seed_everything(42) + + base = 0.01 + q = base * torch.randn( + batch_size, num_heads, seq_len, head_size, dtype=dtype) + k = base * torch.randn( + batch_size, num_heads, seq_len, head_size, dtype=dtype) + v = base * torch.randn( + batch_size, num_heads, seq_len, head_size, dtype=dtype) + + ed = torch.zeros(num_heads, device="cuda") + for h in range(num_heads): + ed[h] = 0.1 * (h + 1) + + kv_history = base * torch.randn(batch_size, + num_heads, + head_size, + head_size, + dtype=dtype, + device="cuda") + + kv_history_clone = kv_history.clone() + + ref_output, ref_kv_cache = reference_lightning_attention( + q, k, v, ed, 256, kv_history) + + from vllm.model_executor.layers.lightning_attn import lightning_attention + actual_output, actual_kv_cache = lightning_attention( + q, k, v, ed, 256, kv_history_clone) + + atol, rtol = 1.5e-1, 1.5e-1 + torch.testing.assert_close(ref_output, actual_output, rtol=rtol, 
atol=atol) + torch.testing.assert_close(ref_kv_cache, + actual_kv_cache, + rtol=rtol, + atol=atol) + + assert ref_output.shape == (batch_size, num_heads, seq_len, head_size) + assert ref_kv_cache.shape == actual_kv_cache.shape diff --git a/tests/kernels/attention/test_merge_attn_states.py b/tests/kernels/attention/test_merge_attn_states.py new file mode 100644 index 0000000000000..7038fbea5c22e --- /dev/null +++ b/tests/kernels/attention/test_merge_attn_states.py @@ -0,0 +1,265 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional + +import pytest +import torch + +from vllm._custom_ops import merge_attn_states as merge_attn_states_cuda +from vllm.attention.ops.triton_merge_attn_states import ( + merge_attn_states as merge_attn_states_triton) +from vllm.platforms import current_platform + + +# Naive PyTorch Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005 +# can be used to combine partial attention results (in the split-KV case) +def merge_attn_states_torch( + output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] + prefix_output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] + prefix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS] + suffix_output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] + suffix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS] + output_lse: Optional[torch.Tensor] = None, # [NUM_HEADS, NUM_TOKENS] +): + p_lse = prefix_lse + s_lse = suffix_lse + # inf -> -inf + p_lse[p_lse == torch.inf] = -torch.inf + s_lse[s_lse == torch.inf] = -torch.inf + # max_lse [NUM_HEADS, NUM_TOKENS] + max_lse = torch.maximum(p_lse, s_lse) + p_lse = p_lse - max_lse + s_lse = s_lse - max_lse + p_lse_exp = torch.exp(p_lse) + s_lse_exp = torch.exp(s_lse) + out_se = (p_lse_exp + s_lse_exp) + if output_lse is not None: + output_lse = torch.log(out_se) + max_lse + p_scale = p_lse_exp / out_se # [NUM_HEADS, NUM_TOKENS] + s_scale = s_lse_exp / out_se # [NUM_HEADS, NUM_TOKENS] + p_scale = torch.transpose(p_scale, 0, + 1).unsqueeze(2) # 
[NUM_TOKENS, NUM_HEADS, 1] + s_scale = torch.transpose(s_scale, 0, + 1).unsqueeze(2) # [NUM_TOKENS, NUM_HEADS, 1] + output = prefix_output * p_scale + suffix_output * s_scale + return output, output_lse + + +NUM_BATCH_TOKENS = [256, 512, 613, 1024, 1536, 4096] +NUM_QUERY_HEADS = [4, 8, 16, 32, 48, 64] +HEAD_SIZES = [32, 48, 64, 96, 128, 256] +DTYPES = [torch.float32, torch.half, torch.bfloat16] + +all_case_info: list[tuple] = [] + + +def generate_markdown_table(): + global all_case_info + table_header = ("| tokens | heads | headsize | dtype " + "| device | torch | triton | cuda | speedup |") + table_separator = "| --- | --- | --- | --- | --- | --- | --- | --- | --- |" + + def shortly_dtype(dtype: torch.dtype) -> str: + return str(dtype).removeprefix("torch.") + + def shortly_device(device: str) -> str: + return device.removeprefix("NVIDIA").strip() + + print(table_header) + print(table_separator) + for info in all_case_info: + (num_tokens, num_heads, head_size, dtype, device, + avg_time_torch_kernel, avg_time_triton_kernel, avg_time_cuda_kernel, + performance_improved) = info + dtype = shortly_dtype(dtype) + device = shortly_device(device) + print(f"| {num_tokens} | {num_heads} | {head_size} " + f"| {dtype} | {device} | {avg_time_torch_kernel:.5f}ms " + f"| {avg_time_triton_kernel:.5f}ms " + f"| {avg_time_cuda_kernel:.5f}ms " + f"| {performance_improved:.4f}x |") + + +@pytest.mark.parametrize("num_tokens", NUM_BATCH_TOKENS) +@pytest.mark.parametrize("num_query_heads", NUM_QUERY_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("output_dtype", DTYPES) +@torch.inference_mode() +def test_merge_attn_states(num_tokens: int, num_query_heads: int, + head_size: int, output_dtype: torch.dtype): + if not current_platform.is_cuda(): + pytest.skip('Currently only support compare triton merge_attn_states ' + 'with custom cuda merge_attn_states kernel') + + NUM_TOKENS = num_tokens + NUM_HEADS = num_query_heads + HEAD_SIZE = head_size + + 
print(f"\nNUM_TOKENS:{NUM_TOKENS}, NUM_HEADS:{NUM_HEADS}, " + f"HEAD_SIZE:{HEAD_SIZE}, DTYPE: {output_dtype}, " + f"Device: {current_platform.get_device_name()}") + + # prefix_lse and suffix_lse contain inf and normal values + prefix_lse = torch.randn(NUM_HEADS, + NUM_TOKENS, + dtype=torch.float32, + device="cuda") + suffix_lse = torch.randn(NUM_HEADS, + NUM_TOKENS, + dtype=torch.float32, + device="cuda") + + # Generate boolean masks + mask_prefix = torch.rand(NUM_HEADS, NUM_TOKENS) < 0.1 + mask_suffix = torch.rand(NUM_HEADS, NUM_TOKENS) < 0.1 + # Ensure that the same position is not True at the same time + combined_mask = torch.logical_and(mask_prefix, mask_suffix) + mask_prefix = torch.logical_and(mask_prefix, ~combined_mask) + mask_suffix = torch.logical_and(mask_suffix, ~combined_mask) + + prefix_lse[mask_prefix] = float('inf') + suffix_lse[mask_suffix] = float('inf') + + # Other input tensors (need to be initialized but + # no actual calculation needed) + output = torch.zeros((NUM_TOKENS, NUM_HEADS, HEAD_SIZE), + dtype=output_dtype, + device="cuda") + output_lse = torch.zeros((NUM_HEADS, NUM_TOKENS), + dtype=torch.float32, + device="cuda") + prefix_output = torch.randn((NUM_TOKENS, NUM_HEADS, HEAD_SIZE), + dtype=output_dtype, + device="cuda") + suffix_output = torch.randn((NUM_TOKENS, NUM_HEADS, HEAD_SIZE), + dtype=output_dtype, + device="cuda") + + warmup_times = 2 + repeat_times = 20 + + output_torch = output.clone() + output_lse_torch = output_lse.clone() + total_time_torch_kernel = 0 + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + # 0. 
Run the Torch kernel + prefix_lse_torch = prefix_lse.clone() + suffix_lse_torch = suffix_lse.clone() + for _ in range(warmup_times): + output_torch, output_lse_torch = merge_attn_states_torch( + output_torch, prefix_output, prefix_lse_torch, suffix_output, + suffix_lse_torch, output_lse_torch) + torch.cuda.synchronize() + + for _ in range(repeat_times): + start.record() + output_torch, output_lse_torch = merge_attn_states_torch( + output_torch, prefix_output, prefix_lse_torch, suffix_output, + suffix_lse_torch, output_lse_torch) + end.record() + torch.cuda.synchronize() + total_time_torch_kernel += start.elapsed_time(end) + + avg_time_torch_kernel = total_time_torch_kernel / repeat_times + + # 1. Run the Triton kernel + output_ref_triton = output.clone() + output_lse_ref_triton = output_lse.clone() + + total_time_triton_kernel = 0 + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + for _ in range(warmup_times): + merge_attn_states_triton(output_ref_triton, prefix_output, prefix_lse, + suffix_output, suffix_lse, + output_lse_ref_triton) + torch.cuda.synchronize() + + for _ in range(repeat_times): + start.record() + merge_attn_states_triton(output_ref_triton, prefix_output, prefix_lse, + suffix_output, suffix_lse, + output_lse_ref_triton) + end.record() + torch.cuda.synchronize() + total_time_triton_kernel += start.elapsed_time(end) + + avg_time_triton_kernel = total_time_triton_kernel / repeat_times + + # 2. 
Run the CUDA kernel + total_time_cuda_kernel = 0 + output_cuda = output.clone() + output_lse_cuda = output_lse.clone() + + for _ in range(warmup_times): + merge_attn_states_cuda(output_cuda, prefix_output, prefix_lse, + suffix_output, suffix_lse, output_lse_cuda) + torch.cuda.synchronize() + + for _ in range(repeat_times): + start.record() + merge_attn_states_cuda(output_cuda, prefix_output, prefix_lse, + suffix_output, suffix_lse, output_lse_cuda) + end.record() + torch.cuda.synchronize() + total_time_cuda_kernel += start.elapsed_time(end) + + avg_time_cuda_kernel = total_time_cuda_kernel / repeat_times + + # 3. Performance compare + performance_improved = avg_time_triton_kernel / avg_time_cuda_kernel + print(f" Torch time: {avg_time_torch_kernel:.6f}ms") + print(f"Triton time: {avg_time_triton_kernel:.6f}ms") + print(f" CUDA time: {avg_time_cuda_kernel:.6f}ms, " + f"Performance: {performance_improved:.5f}x") + print("-" * 100) + + # 4. Correctness compare + # Liger Kernel: Efficient Triton Kernels for LLM Training + # https://arxiv.org/pdf/2410.10989, 3.3 Correctness + # use rtol = 1e-2 for bfloat16. + rtol = 1e-2 if output_dtype == torch.bfloat16 else 1e-3 + + def diff(a: torch.Tensor, b: torch.Tensor): + max_diff = torch.max(torch.abs(a.float() - b.float())) + return max_diff + + # Use Triton output as reference because we want to replace + # the Triton kernel with custom CUDA kernel for merge attn + # states operation. 
+ output_ref = output_ref_triton + output_lse_ref = output_lse_ref_triton + torch.testing.assert_close(output_cuda.float(), + output_ref.float(), + atol=1e-3, + rtol=rtol) + print("Output all match, max abs diff:") + print(f"(Triton vs Torch) : {diff(output_torch, output_ref)}") + print(f" (CUDA vs Torch) : {diff(output_torch, output_cuda)}") + print(f" (CUDA vs Triton): {diff(output_ref, output_cuda)}") + print("-" * 100) + + torch.testing.assert_close(output_lse_cuda.float(), + output_lse_ref.float(), + atol=1e-3, + rtol=rtol) + print("Output LSE all match, max abs diff:") + print(f"(Triton vs Torch) : {diff(output_lse_torch, output_lse_ref)}") + print(f" (CUDA vs Torch) : {diff(output_lse_torch, output_lse_cuda)}") + print(f" (CUDA vs Triton): {diff(output_lse_ref, output_lse_cuda)}") + print("-" * 100) + + print("All output values test passed! All inf values " + "are correctly replaced with -inf.") + print("-" * 100) + + device = current_platform.get_device_name() + all_case_info.append( + (NUM_TOKENS, NUM_HEADS, HEAD_SIZE, output_dtype, device, + avg_time_torch_kernel, avg_time_triton_kernel, avg_time_cuda_kernel, + performance_improved)) + if len(all_case_info) == (len(NUM_BATCH_TOKENS) * len(HEAD_SIZES) * + len(NUM_QUERY_HEADS) * len(DTYPES)): + generate_markdown_table() diff --git a/tests/kernels/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py similarity index 100% rename from tests/kernels/test_mha_attn.py rename to tests/kernels/attention/test_mha_attn.py diff --git a/tests/kernels/test_mla_decode_cpu.py b/tests/kernels/attention/test_mla_decode_cpu.py similarity index 100% rename from tests/kernels/test_mla_decode_cpu.py rename to tests/kernels/attention/test_mla_decode_cpu.py diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py similarity index 99% rename from tests/kernels/test_prefix_prefill.py rename to tests/kernels/attention/test_prefix_prefill.py index 50eaa92f59b5c..9333777d38ea0 100644 
--- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/attention/test_prefix_prefill.py @@ -164,6 +164,7 @@ def test_contexted_kv_attention( block_table, b_start_loc, b_seq_len, + MAX_CTX_LEN, max_input_len, k_scale, v_scale, @@ -180,6 +181,7 @@ def test_contexted_kv_attention( block_table, b_start_loc, b_seq_len, + MAX_CTX_LEN, max_input_len, k_scale, v_scale, @@ -397,6 +399,7 @@ def test_contexted_kv_attention_alibi( block_table, b_start_loc, b_seq_len, + MAX_CTX_LEN, max_input_len, k_scale, v_scale, @@ -413,6 +416,7 @@ def test_contexted_kv_attention_alibi( block_table, b_start_loc, b_seq_len, + MAX_CTX_LEN, max_input_len, k_scale, v_scale, diff --git a/tests/kernels/attention/test_rocm_attention_selector.py b/tests/kernels/attention/test_rocm_attention_selector.py new file mode 100644 index 0000000000000..4cf7bcb01d4d7 --- /dev/null +++ b/tests/kernels/attention/test_rocm_attention_selector.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import torch + +from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend +from vllm.platforms.rocm import RocmPlatform +from vllm.utils import STR_BACKEND_ENV_VAR + + +@pytest.fixture(autouse=True) +def clear_cache(): + """Clear lru cache to ensure each test case runs without caching. 
+ """ + _cached_get_attn_backend.cache_clear() + + +def test_selector(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH") + + # Set the current platform to ROCm using monkeypatch + monkeypatch.setattr("vllm.attention.selector.current_platform", + RocmPlatform()) + + # Test standard ROCm attention + backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) + assert (backend.get_name() == "ROCM_FLASH" + or backend.get_name() == "TRITON_ATTN_VLLM_V1") + + # MLA test for deepseek related + + # change the attention backend to triton MLA + m.setenv(STR_BACKEND_ENV_VAR, "TRITON_MLA") + backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, + False, True) + assert backend.get_name() == "TRITON_MLA" + + # If attention backend is None + # If use_mla is true + # The selected backend is triton MLA + m.setenv(STR_BACKEND_ENV_VAR, None) + backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, + False, True) + assert backend.get_name() == "TRITON_MLA" + + # change the attention backend to AITER MLA + m.setenv(STR_BACKEND_ENV_VAR, "ROCM_AITER_MLA") + backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, + False, True) + assert backend.get_name() == "ROCM_AITER_MLA" + + # If attention backend is None + # If use_mla is true + # If VLLM_ROCM_USE_AITER is enabled + # The selected backend is ROCM_AITER_MLA + m.setenv(STR_BACKEND_ENV_VAR, None) + m.setenv("VLLM_ROCM_USE_AITER", "1") + backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, + False, True) + assert backend.get_name() == "ROCM_AITER_MLA" diff --git a/tests/kernels/test_triton_decode_attention.py b/tests/kernels/attention/test_triton_decode_attention.py similarity index 100% rename from tests/kernels/test_triton_decode_attention.py rename to tests/kernels/attention/test_triton_decode_attention.py diff --git a/tests/kernels/test_activation.py b/tests/kernels/core/test_activation.py similarity index 97% 
rename from tests/kernels/test_activation.py rename to tests/kernels/core/test_activation.py index cf0f21ce06514..79f838a954e70 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/core/test_activation.py @@ -5,6 +5,7 @@ import random import pytest import torch +from tests.kernels.allclose_default import get_default_atol, get_default_rtol from tests.kernels.utils import opcheck from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul, GeluAndMul, MulAndSilu, @@ -12,8 +13,6 @@ from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul, SiluAndMul) from vllm.platforms import current_platform -from .allclose_default import get_default_atol, get_default_rtol - DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing D = [512, 13824] # Arbitrary values for testing diff --git a/tests/kernels/test_fused_quant_layernorm.py b/tests/kernels/core/test_fused_quant_layernorm.py similarity index 100% rename from tests/kernels/test_fused_quant_layernorm.py rename to tests/kernels/core/test_fused_quant_layernorm.py diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/core/test_layernorm.py similarity index 100% rename from tests/kernels/test_layernorm.py rename to tests/kernels/core/test_layernorm.py diff --git a/tests/kernels/core/test_opcheck.py b/tests/kernels/core/test_opcheck.py new file mode 100644 index 0000000000000..c9a9679c5d80f --- /dev/null +++ b/tests/kernels/core/test_opcheck.py @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Tests for miscellaneous utilities +""" + +import torch + +from tests.kernels.utils import opcheck + + +def test_convert_fp8_opcheck(): + data = torch.randn((256, 256), dtype=torch.float32, device="cuda") + result = torch.empty_like(data, dtype=torch.float8_e4m3fn) + opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8")) + + +# TODO: Add this back, currently fails with +# csrc/cuda_utils_kernels.cu:15 'invalid 
argument' +# @pytest.mark.skipif(not current_platform.is_cuda(), +# reason="Only supported for CUDA") +# def test_cuda_utils_opcheck(): +# opcheck(torch.ops._C_cuda_utils.get_device_attribute, (0, 0)) +# opcheck( +# torch.ops._C_cuda_utils. +# get_max_shared_memory_per_block_device_attribute, (0, )) diff --git a/tests/kernels/test_permute_cols.py b/tests/kernels/core/test_permute_cols.py similarity index 100% rename from tests/kernels/test_permute_cols.py rename to tests/kernels/core/test_permute_cols.py diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py similarity index 99% rename from tests/kernels/test_pos_encoding.py rename to tests/kernels/core/test_pos_encoding.py index eb83b4d612c22..2b7bf755ec22d 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -6,11 +6,10 @@ from typing import Callable, Optional import pytest import torch +from tests.kernels.allclose_default import get_default_atol, get_default_rtol from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.platforms import current_platform -from .allclose_default import get_default_atol, get_default_rtol - IS_NEOX_STYLE = [True, False] DTYPES = [torch.half, torch.bfloat16, torch.float] HEAD_SIZES = [64, 80, 112, 120, 256] diff --git a/tests/kernels/test_rotary_embedding.py b/tests/kernels/core/test_rotary_embedding.py similarity index 100% rename from tests/kernels/test_rotary_embedding.py rename to tests/kernels/core/test_rotary_embedding.py diff --git a/tests/kernels/core/test_uva.py b/tests/kernels/core/test_uva.py new file mode 100644 index 0000000000000..f641ae7b67c2d --- /dev/null +++ b/tests/kernels/core/test_uva.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest +import torch + +from vllm.utils import get_cuda_view_from_cpu_tensor, is_uva_available + +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] + + +@pytest.mark.skipif(not 
is_uva_available(), reason="UVA is not available.") +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_cpu_write(device): + torch.set_default_device(device) + cpu_tensor = torch.zeros(10, + 10, + device="cpu", + pin_memory=True, + dtype=torch.int32) + cuda_view = get_cuda_view_from_cpu_tensor(cpu_tensor) + assert cuda_view.device.type == "cuda" + + assert cuda_view[0, 0] == 0 + assert cuda_view[2, 3] == 0 + assert cuda_view[4, 5] == 0 + + cpu_tensor[0, 0] = 1 + cpu_tensor[2, 3] = 2 + cpu_tensor[4, 5] = -1 + + cuda_view.mul_(2) + assert cuda_view[0, 0] == 2 + assert cuda_view[2, 3] == 4 + assert cuda_view[4, 5] == -2 + + +@pytest.mark.skipif(not is_uva_available(), reason="UVA is not available.") +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_gpu_write(device): + torch.set_default_device(device) + cpu_tensor = torch.zeros(10, + 10, + device="cpu", + pin_memory=True, + dtype=torch.int32) + cuda_view = get_cuda_view_from_cpu_tensor(cpu_tensor) + assert cuda_view.device.type == "cuda" + + assert cuda_view[0, 0] == 0 + assert cuda_view[2, 3] == 0 + assert cuda_view[4, 5] == 0 + + cuda_view[0, 0] = 1 + cuda_view[2, 3] = 2 + cuda_view[4, 5] = -1 + cuda_view.mul_(2) + + assert cpu_tensor[0, 0] == 2 + assert cpu_tensor[2, 3] == 4 + assert cpu_tensor[4, 5] == -2 \ No newline at end of file diff --git a/tests/kernels/test_causal_conv1d.py b/tests/kernels/mamba/test_causal_conv1d.py similarity index 100% rename from tests/kernels/test_causal_conv1d.py rename to tests/kernels/mamba/test_causal_conv1d.py diff --git a/tests/kernels/test_mamba_mixer2.py b/tests/kernels/mamba/test_mamba_mixer2.py similarity index 100% rename from tests/kernels/test_mamba_mixer2.py rename to tests/kernels/mamba/test_mamba_mixer2.py diff --git a/tests/kernels/test_mamba_ssm.py b/tests/kernels/mamba/test_mamba_ssm.py similarity index 100% rename from tests/kernels/test_mamba_ssm.py rename to tests/kernels/mamba/test_mamba_ssm.py diff --git a/tests/kernels/test_mamba_ssm_ssd.py 
b/tests/kernels/mamba/test_mamba_ssm_ssd.py similarity index 95% rename from tests/kernels/test_mamba_ssm_ssd.py rename to tests/kernels/mamba/test_mamba_ssm_ssd.py index 8f23a9b216e98..ee908105f557f 100644 --- a/tests/kernels/test_mamba_ssm_ssd.py +++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py @@ -5,6 +5,8 @@ import torch import torch.nn.functional as F from einops import rearrange, repeat +from vllm.model_executor.layers.mamba.mamba2_metadata import ( + _seq_idx_to_chunk_indices_offsets) from vllm.model_executor.layers.mamba.ops.ssd_combined import ( mamba_chunk_scan_combined) from vllm.platforms import current_platform @@ -160,14 +162,14 @@ def generate_continous_batched_examples(example_lens_by_batch, # get the metadata cu_seqlens = torch.tensor((0, ) + spec, device=device).cumsum(dim=0) - sed_idx = torch.zeros(cu_seqlens[-1], + seq_idx = torch.zeros(cu_seqlens[-1], dtype=torch.int32, device=cu_seqlens.device) for i, (srt, end) in enumerate(zip( cu_seqlens, cu_seqlens[1:], )): - sed_idx[srt:end] = i + seq_idx[srt:end] = i # for cont batch if IND_E is None: @@ -177,7 +179,7 @@ def generate_continous_batched_examples(example_lens_by_batch, IND_E = [end_boundary(x + y) for x, y in zip(IND_S, spec)] yield ([Y_min[s, IND_S[s]:IND_E[s]] for s in range(num_examples)], - cu_seqlens, sed_idx.unsqueeze(0), (A, dt2, X2, B2, C2)) + cu_seqlens, seq_idx.unsqueeze(0), (A, dt2, X2, B2, C2)) @pytest.mark.parametrize("itype", @@ -266,12 +268,15 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, exhausted: dict = {} # map: eg -> boolean indicating example is exhausted states = None - for Y_min, cu_seqlens, sed_idx, (A, dt, X, B, + for Y_min, cu_seqlens, seq_idx, (A, dt, X, B, C) in generate_continous_batched_examples( cases, num_examples, seqlen, last_taken, exhausted, n_heads, d_head, itype): + chunk_indices, chunk_offsets = _seq_idx_to_chunk_indices_offsets( + seq_idx, chunk_size) + Y, new_states = mamba_chunk_scan_combined( X, dt, @@ -281,7 
+286,9 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, chunk_size, D=None, cu_seqlens=cu_seqlens, - seq_idx=sed_idx, + seq_idx=seq_idx, + chunk_indices=chunk_indices, + chunk_offsets=chunk_offsets, return_varlen_states=True, initial_states=states, ) diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py new file mode 100644 index 0000000000000..975cd418a171f --- /dev/null +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -0,0 +1,364 @@ +# SPDX-License-Identifier: Apache-2.0 +import dataclasses +from typing import Optional + +import pytest +import torch + +from vllm import _custom_ops as ops +from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 +from vllm.model_executor.layers.fused_moe.fused_moe import (fused_experts, + fused_topk) +from vllm.platforms import current_platform + +NUM_EXPERTS = [40, 64] +TOP_KS = [6, 8] + +MNK_FACTORS = [ + (2, 1024, 1024), + (2, 1024, 1536), + (2, 3072, 1024), + (2, 3072, 1536), + (64, 1024, 1024), + (64, 1024, 1536), + (64, 3072, 1024), + (64, 3072, 1536), + (224, 1024, 1024), + (224, 1024, 1536), + (224, 3072, 1024), + (224, 3072, 1536), +] + + +@dataclasses.dataclass +class MOETensors: + a: torch.Tensor + w1: torch.Tensor + w2: torch.Tensor + ab_strides1: torch.Tensor + c_strides1: torch.Tensor + ab_strides2: torch.Tensor + c_strides2: torch.Tensor + + @staticmethod + def make_moe_tensors(m: int, k: int, n: int, e: int, + dtype: torch.dtype) -> "MOETensors": + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 + ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64) + ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64) + 
c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + return MOETensors(a=a, + w1=w1, + w2=w2, + ab_strides1=ab_strides1, + c_strides1=c_strides1, + ab_strides2=ab_strides2, + c_strides2=c_strides2) + + +@dataclasses.dataclass +class MOETensors8Bit(MOETensors): + # quantized + a_q: Optional[torch.Tensor] = None # a -> a_q + w1_q: Optional[torch.Tensor] = None # w1 -> w1_q + w2_q: Optional[torch.Tensor] = None # w2 -> w2_q + a_scale: Optional[torch.Tensor] = None + w1_scale: Optional[torch.Tensor] = None + w2_scale: Optional[torch.Tensor] = None + # dequantized + a_d: Optional[torch.Tensor] = None # a -> a_q -> a_d + w1_d: Optional[torch.Tensor] = None # w1 -> w1_q -> w1_d + w2_d: Optional[torch.Tensor] = None # w2 -> w2_q -> w2_d + + @staticmethod + def make_moe_tensors_8bit(m: int, k: int, n: int, e: int, + per_act_token: bool, + per_out_channel: bool) -> "MOETensors8Bit": + dtype = torch.half + q_dtype = torch.float8_e4m3fn + + moe_tensors_fp16 = MOETensors.make_moe_tensors(m, k, n, e, dtype) + + # a -> a_q, w1 -> w1_q, w2 -> w2_q + n_b_scales = 2 * n if per_out_channel else 1 + k_b_scales = k if per_out_channel else 1 + # Get the right scale for tests. 
+ _, a_scale = ops.scaled_fp8_quant( + moe_tensors_fp16.a, use_per_token_if_dynamic=per_act_token) + a_q, _ = ops.scaled_fp8_quant(moe_tensors_fp16.a, + a_scale, + use_per_token_if_dynamic=per_act_token) + w1_q = torch.empty((e, 2 * n, k), device="cuda", dtype=q_dtype) + w2_q = torch.empty((e, k, n), device="cuda", dtype=q_dtype) + + w1_scale = torch.empty((e, n_b_scales, 1), + device="cuda", + dtype=torch.float32) + w2_scale = torch.empty((e, k_b_scales, 1), + device="cuda", + dtype=torch.float32) + for expert in range(e): + w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant( + moe_tensors_fp16.w1[expert], + use_per_token_if_dynamic=per_out_channel) + w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant( + moe_tensors_fp16.w2[expert], + use_per_token_if_dynamic=per_out_channel) + + # a_q -> a_d, w1_q -> w1_d, w2_q -> w2_d + a_d = a_q.float().mul(a_scale).to(dtype) + w1_d = torch.empty_like(moe_tensors_fp16.w1) + w2_d = torch.empty_like(moe_tensors_fp16.w2) + for expert in range(e): + w1_d[expert] = (w1_q[expert].float() * w1_scale[expert]).half() + w2_d[expert] = (w2_q[expert].float() * w2_scale[expert]).half() + + return MOETensors8Bit(a=moe_tensors_fp16.a, + w1=moe_tensors_fp16.w1, + w2=moe_tensors_fp16.w2, + ab_strides1=moe_tensors_fp16.ab_strides1, + c_strides1=moe_tensors_fp16.c_strides1, + ab_strides2=moe_tensors_fp16.ab_strides2, + c_strides2=moe_tensors_fp16.c_strides2, + a_q=a_q, + w1_q=w1_q, + w2_q=w2_q, + a_scale=a_scale, + w1_scale=w1_scale, + w2_scale=w2_scale, + a_d=a_d, + w1_d=w1_d, + w2_d=w2_d) + + +def run_with_expert_maps(num_experts: int, num_local_experts: int, + **cutlass_moe_kwargs): + + def slice_experts(): + slice_params = [ + "w1_q", "w2_q", "ab_strides1", "ab_strides2", "c_strides1", + "c_strides2", "w1_scale", "w2_scale" + ] + full_tensors = { + k: v + for k, v in cutlass_moe_kwargs.items() + if k in slice_params and k in cutlass_moe_kwargs + } + + for i in range(0, num_experts, num_local_experts): + s, e = i, i + num_local_experts + 
+ # make expert map + expert_map = [-1] * num_experts + expert_map[s:e] = list(range(num_local_experts)) + expert_map = torch.tensor(expert_map, + dtype=torch.int32, + device="cuda") + + # update cutlass moe arg with expert_map + cutlass_moe_kwargs["expert_map"] = expert_map + # update cutlass moe arg tensors + for k, t in full_tensors.items(): + cutlass_moe_kwargs[k] = t[s:e] + + yield cutlass_moe_kwargs + + out_tensor = torch.zeros_like(cutlass_moe_kwargs["a"]) + for kwargs in slice_experts(): + out_tensor = out_tensor + cutlass_moe_fp8(**kwargs) + + return out_tensor + + +def run_8_bit(moe_tensors: MOETensors8Bit, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_local_experts: Optional[int] = None) -> torch.Tensor: + assert not any([ + t is None for t in [ + moe_tensors.w1_q, moe_tensors.w2_q, moe_tensors.w1_scale, + moe_tensors.w2_scale, moe_tensors.a_scale + ] + ]) + + kwargs = { + 'a': moe_tensors.a, + 'w1_q': moe_tensors.w1_q.transpose(1, 2), # type: ignore[union-attr] + 'w2_q': moe_tensors.w2_q.transpose(1, 2), # type: ignore[union-attr] + 'topk_weights': topk_weights, + 'topk_ids_': topk_ids, + 'ab_strides1': moe_tensors.ab_strides1, + 'c_strides1': moe_tensors.c_strides1, + 'ab_strides2': moe_tensors.ab_strides2, + 'c_strides2': moe_tensors.c_strides2, + 'w1_scale': moe_tensors.w1_scale, + 'w2_scale': moe_tensors.w2_scale, + 'a1_scale': moe_tensors.a_scale + } + + num_experts = moe_tensors.w1.size(0) + with_ep = num_local_experts is not None or num_local_experts == num_experts + if not with_ep: + return cutlass_moe_fp8(**kwargs) + + assert num_local_experts is not None + return run_with_expert_maps( + num_experts, + num_local_experts, # type: ignore[arg-type] + **kwargs) + + +@pytest.mark.parametrize("m,n,k", MNK_FACTORS) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("per_act_token", [True, False]) +@pytest.mark.parametrize("per_out_ch", [True, False]) +@pytest.mark.skipif( + 
(lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( + current_platform.get_device_capability()), + reason="Grouped gemm is not supported on this GPU type.") +def test_cutlass_moe_8_bit_no_graph( + m: int, + n: int, + k: int, + e: int, + topk: int, + per_act_token: bool, + per_out_ch: bool, +): + current_platform.seed_everything(7) + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig( + pipeline_parallel_size=1))): + + mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, + per_out_ch) + + score = torch.randn((m, e), device="cuda", dtype=torch.half) + topk_weights, topk_ids = fused_topk(mt.a, + score, + topk, + renormalize=False) + + # Note that we are using the dequantized versions of the tensors. + # Using a, w1 and w2 directly results in minor output differences. + triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights, + topk_ids) + + cutlass_output = run_8_bit(mt, topk_weights, topk_ids) + + torch.testing.assert_close(triton_output, + cutlass_output, + atol=5e-2, + rtol=1e-2) + + +@pytest.mark.parametrize("m,n,k", MNK_FACTORS) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("per_act_token", [True, False]) +@pytest.mark.parametrize("per_out_ch", [True, False]) +@pytest.mark.skipif( + (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( + current_platform.get_device_capability()), + reason="Grouped gemm is not supported on this GPU type.") +def test_cutlass_moe_8_bit_cuda_graph( + m: int, + n: int, + k: int, + e: int, + topk: int, + per_act_token: bool, + per_out_ch: bool, +): + current_platform.seed_everything(7) + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig( + pipeline_parallel_size=1))): + + dtype = torch.half + + mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, + per_out_ch) + + score = torch.randn((m, e), device="cuda", dtype=dtype) + topk_weights, 
topk_ids = fused_topk(mt.a, + score, + topk, + renormalize=False) + + # Note that we are using the dequantized versions of the tensors. + # Using a, w1 and w2 directly results in minor output differences. + triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights, + topk_ids) + + stream = torch.cuda.Stream() + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, stream=stream): + cutlass_output = run_8_bit(mt, topk_weights, topk_ids) + + torch.cuda.synchronize() + graph.replay() + torch.cuda.synchronize() + + torch.testing.assert_close(triton_output, + cutlass_output, + atol=9e-2, + rtol=1e-2) + + +@pytest.mark.parametrize("m", [64]) +@pytest.mark.parametrize("n", [1024]) +@pytest.mark.parametrize("k", [4096]) +@pytest.mark.parametrize("e", [16]) +@pytest.mark.parametrize("topk", [1, 8]) +@pytest.mark.parametrize("per_act_token", [True]) +@pytest.mark.parametrize("per_out_channel", [True]) +@pytest.mark.parametrize("ep_size", [1, 2, 4, 8, 16]) +@pytest.mark.skipif( + (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( + current_platform.get_device_capability()), + reason="Grouped gemm is not supported on this GPU type.") +def test_cutlass_moe_8_bit_EP( + m: int, + n: int, + k: int, + e: int, + topk: int, + per_act_token: bool, + per_out_channel: bool, + ep_size: int, +): + current_platform.seed_everything(7) + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig( + pipeline_parallel_size=1))): + + mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, + per_out_channel) + + score = torch.randn((m, e), device="cuda", dtype=torch.half) + topk_weights, topk_ids = fused_topk(mt.a, + score, + topk, + renormalize=False) + + # Note that we are using the dequantized versions of the tensors. + # Using a, w1 and w2 directly results in minor output differences. 
+ triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights, + topk_ids) + + assert e % ep_size == 0, "Cannot distribute experts evenly" + cutlass_output = run_8_bit(mt, + topk_weights, + topk_ids, + num_local_experts=e // ep_size) + + torch.testing.assert_close(triton_output, + cutlass_output, + atol=5e-2, + rtol=1e-2) diff --git a/tests/kernels/test_moe.py b/tests/kernels/moe/test_moe.py similarity index 68% rename from tests/kernels/test_moe.py rename to tests/kernels/moe/test_moe.py index 653d2734afe89..425f36984a33b 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -3,7 +3,6 @@ Run `pytest tests/kernels/test_moe.py`. """ - import pytest import torch from torch.nn import Parameter @@ -12,16 +11,14 @@ from transformers import MixtralConfig from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock import vllm.model_executor.layers.fused_moe # noqa -from tests.kernels.utils import (compute_max_diff, opcheck, stack_and_dev, - torch_moe, torch_moe_single) -from vllm import _custom_ops as ops +from tests.kernels.utils import (opcheck, stack_and_dev, torch_moe, + torch_moe_single) from vllm.model_executor.layers.fused_moe import fused_moe -from vllm.model_executor.layers.fused_moe.fused_moe import ( - fused_topk, moe_align_block_size) +from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe.moe_torch_iterative import ( fused_moe as iterative_moe) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( - marlin_quantize) + awq_marlin_quantize, marlin_quantize) from vllm.model_executor.layers.quantization.utils.quant_utils import ( quantize_weights) from vllm.model_executor.models.mixtral import MixtralMoE @@ -216,11 +213,17 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int, @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) @pytest.mark.parametrize("padding", [True, False]) 
+@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) @torch.inference_mode() -def test_mixtral_moe(dtype: torch.dtype, padding: bool): +def test_mixtral_moe(dtype: torch.dtype, padding: bool, use_rocm_aiter: bool, + monkeypatch): """Make sure our Mixtral MoE implementation agrees with the one from huggingface.""" + if use_rocm_aiter: + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + # Instantiate our and huggingface's MoE blocks config = MixtralConfig() hf_moe = MixtralSparseMoeBlock(config).to(dtype).to("cuda") @@ -268,20 +271,31 @@ def test_mixtral_moe(dtype: torch.dtype, padding: bool): torch.bfloat16: 1e-2, } - torch.testing.assert_close(hf_states.flatten(0, 1), - vllm_states, - rtol=mixtral_moe_tol[dtype], - atol=mixtral_moe_tol[dtype]) + if use_rocm_aiter: + # The values of rtol and atol are set based on the tests in ROCM AITER package. # noqa: E501 + # https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174 # noqa: E501 + torch.testing.assert_close(hf_states.flatten(0, 1), + vllm_states, + rtol=0.01, + atol=100) + else: + torch.testing.assert_close(hf_states.flatten(0, 1), + vllm_states, + rtol=mixtral_moe_tol[dtype], + atol=mixtral_moe_tol[dtype]) -@pytest.mark.parametrize("m", [1, 33, 64, 222]) -@pytest.mark.parametrize("n", [128, 2048]) -@pytest.mark.parametrize("k", [128, 1024]) -@pytest.mark.parametrize("e", NUM_EXPERTS) -@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("m", [1, 33, 123]) +@pytest.mark.parametrize("n", [128, 1024]) +@pytest.mark.parametrize("k", [256, 2048]) +@pytest.mark.parametrize("e", [4, 12]) +@pytest.mark.parametrize("topk", [2, 3]) +@pytest.mark.parametrize("ep_size", [1, 4]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize("group_size", [-1, 32, 128]) @pytest.mark.parametrize("act_order", [True, False]) @pytest.mark.parametrize("num_bits", [4, 8]) 
+@pytest.mark.parametrize("has_zp", [True, False]) @pytest.mark.parametrize("is_k_full", [True, False]) @pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm") def test_fused_marlin_moe( @@ -290,9 +304,12 @@ def test_fused_marlin_moe( k: int, e: int, topk: int, + ep_size: int, + dtype: torch.dtype, group_size: int, act_order: bool, num_bits: int, + has_zp: bool, is_k_full: bool, ): current_platform.seed_everything(7) @@ -303,75 +320,110 @@ def test_fused_marlin_moe( return if group_size in (k, n): return + if has_zp: + return else: if not is_k_full: return - quant_type = (scalar_types.uint4b8 - if num_bits == 4 else scalar_types.uint8b128) - dtype = torch.float16 + if has_zp: + # we don't build kernel for int8 with zero + if num_bits == 8: + return + quant_type = scalar_types.uint4 if num_bits == 4 else scalar_types.uint8 + else: + quant_type = scalar_types.uint4b8 \ + if num_bits == 4 else scalar_types.uint8b128 a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 + if ep_size > 1: + local_e = e // ep_size + e_ids = torch.randperm(e, device="cuda", dtype=torch.int32)[:local_e] + e_map = torch.full((e, ), -1, device="cuda", dtype=torch.int32) + e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32) + w1 = w1[e_ids] + w2 = w2[e_ids] + else: + e_map = None + w_ref1_l = [] qweight1_l = [] scales1_l = [] + zeros1_l = [] g_idx1_l = [] sort_indices1_l = [] for i in range(w1.shape[0]): - test_perm = torch.randperm(k) - w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = marlin_quantize( - w1[i].transpose(1, 0), quant_type, group_size, act_order, - test_perm) - w_ref1_l.append(w_ref1) - qweight1_l.append(qweight1) - scales1_l.append(scales1) - g_idx1_l.append(g_idx1) - sort_indices1_l.append(sort_indices1) + if has_zp: + w_ref1, qweight1, scales1, zeros1 = awq_marlin_quantize( + w1[i].transpose(1, 0), 
quant_type, group_size) + + w_ref1_l.append(w_ref1.T) + qweight1_l.append(qweight1) + scales1_l.append(scales1) + zeros1_l.append(zeros1) + else: + test_perm = torch.randperm(k) + quant_res = marlin_quantize(w1[i].transpose(1, 0), quant_type, + group_size, act_order, test_perm) + w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = quant_res + + w_ref1_l.append(w_ref1.T) + qweight1_l.append(qweight1) + scales1_l.append(scales1) + g_idx1_l.append(g_idx1) + sort_indices1_l.append(sort_indices1) w_ref1 = stack_and_dev(w_ref1_l) qweight1 = stack_and_dev(qweight1_l).contiguous() scales1 = stack_and_dev(scales1_l) - g_idx1 = stack_and_dev(g_idx1_l) - sort_indices1 = stack_and_dev(sort_indices1_l) + g_idx1 = stack_and_dev(g_idx1_l) if g_idx1_l else None + zeros1 = stack_and_dev(zeros1_l) if zeros1_l else None + sort_indices1 = stack_and_dev(sort_indices1_l) if sort_indices1_l else None w_ref2_l = [] qweight2_l = [] scales2_l = [] + zeros2_l = [] g_idx2_l = [] sort_indices2_l = [] for i in range(w2.shape[0]): - test_perm = torch.randperm(n) - w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = marlin_quantize( - w2[i].transpose(1, 0), quant_type, group_size, act_order, - test_perm) - w_ref2_l.append(w_ref2) - qweight2_l.append(qweight2) - scales2_l.append(scales2) - g_idx2_l.append(g_idx2) - sort_indices2_l.append(sort_indices2) + if has_zp: + w_ref2, qweight2, scales2, zeros2 = awq_marlin_quantize( + w2[i].transpose(1, 0), quant_type, group_size) + + w_ref2_l.append(w_ref2.T) + qweight2_l.append(qweight2) + scales2_l.append(scales2) + zeros2_l.append(zeros2) + else: + test_perm = torch.randperm(n) + quant_res = marlin_quantize(w2[i].transpose(1, 0), quant_type, + group_size, act_order, test_perm) + w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = quant_res + + w_ref2_l.append(w_ref2.T) + qweight2_l.append(qweight2) + scales2_l.append(scales2) + g_idx2_l.append(g_idx2) + sort_indices2_l.append(sort_indices2) w_ref2 = stack_and_dev(w_ref2_l) qweight2 = 
stack_and_dev(qweight2_l).contiguous() scales2 = stack_and_dev(scales2_l) - g_idx2 = stack_and_dev(g_idx2_l) - sort_indices2 = stack_and_dev(sort_indices2_l) + g_idx2 = stack_and_dev(g_idx2_l) if g_idx2_l else None + zeros2 = stack_and_dev(zeros2_l) if zeros2_l else None + sort_indices2 = stack_and_dev(sort_indices2_l) if sort_indices2_l else None score = torch.randn((m, e), device="cuda", dtype=dtype) topk_weights, topk_ids = fused_topk(a, score, topk, False) - triton_output = fused_moe( - a, - w_ref1.transpose(1, 2).contiguous(), - w_ref2.transpose(1, 2).contiguous(), - score, - topk, - renormalize=False, - ) + torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, e_map) + marlin_output = torch.ops.vllm.fused_marlin_moe( a, qweight1, @@ -381,111 +433,91 @@ def test_fused_marlin_moe( score, topk_weights, topk_ids, + global_num_experts=e, + expert_map=e_map, g_idx1=g_idx1, g_idx2=g_idx2, sort_indices1=sort_indices1, sort_indices2=sort_indices2, + w1_zeros=zeros1, + w2_zeros=zeros2, num_bits=num_bits, - is_k_full=is_k_full, - ) + is_k_full=is_k_full) - assert compute_max_diff(marlin_output, triton_output) < 4e-2 - - if ops.supports_moe_ops: - token_expert_indicies = torch.empty(m, - topk, - dtype=torch.int32, - device=a.device) - - opcheck(torch.ops._moe_C.topk_softmax, ( - topk_weights, - topk_ids, - token_expert_indicies, - score.float(), - )) - - block_size_m = 4 - - sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, - e) - - max_workspace_size = ((m + 255) // 256) * (max(2 * n, k) // 64) * 16 - workspace = torch.zeros(max_workspace_size, - dtype=torch.int, - device="cuda", - requires_grad=False) - - zp = torch.empty((0, 0), - dtype=dtype, - device="cuda", - requires_grad=False) - opcheck(torch.ops._moe_C.marlin_gemm_moe, - (a, qweight1, sorted_token_ids, topk_weights, topk_ids, - scales1, zp, g_idx1, sort_indices1, workspace, quant_type.id, - m, 2 * n, k, True, e, topk, block_size_m, True, False)) + torch.testing.assert_close(marlin_output, 
torch_output, atol=2e-2, rtol=0) @pytest.mark.skip("This test is here for the sake of debugging, " "don't run it in automated tests.") -@pytest.mark.parametrize("m", [64, 512, 222, 33, 1]) -@pytest.mark.parametrize("n", [128, 2048, 256, 1024]) -@pytest.mark.parametrize("k", [128, 1024, 512]) -@pytest.mark.parametrize("e", [8, 64]) -@pytest.mark.parametrize("topk", [2, 6]) -@pytest.mark.parametrize("group_size", [-1, 32, 64, 128]) +@pytest.mark.parametrize("m", [1, 33, 123]) +@pytest.mark.parametrize("n", [128, 1024]) +@pytest.mark.parametrize("k", [256, 2048]) +@pytest.mark.parametrize("e", [4, 12]) +@pytest.mark.parametrize("topk", [2, 3]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("group_size", [-1, 32, 128]) @pytest.mark.parametrize("act_order", [True, False]) @pytest.mark.parametrize("num_bits", [4, 8]) +@pytest.mark.parametrize("has_zp", [True, False]) @pytest.mark.parametrize("is_k_full", [True, False]) -@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm") -def test_single_marlin_moe_multiply( - m: int, - n: int, - k: int, - e: int, - topk: int, - group_size: int, - act_order: bool, - num_bits: int, - is_k_full: bool, -): - +def test_single_marlin_moe_multiply(m: int, n: int, k: int, e: int, topk: int, + dtype: torch.dtype, group_size: int, + act_order: bool, num_bits: int, + has_zp: bool, is_k_full: bool): # Filter act_order if act_order: if group_size == -1: return - if group_size == k: + if group_size in (k, n): + return + if has_zp: return else: if not is_k_full: return - quant_type = (scalar_types.uint4b8 - if num_bits == 4 else scalar_types.uint8b128) - dtype = torch.float16 + if has_zp: + quant_type = scalar_types.uint4 if num_bits == 4 else scalar_types.uint8 + else: + quant_type = scalar_types.uint4b8 \ + if num_bits == 4 else scalar_types.uint8b128 a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 w = torch.randn((e, n, k), device="cuda", dtype=dtype) / 10 w_ref_l = [] - 
qweights_l = [] + qweight_l = [] scales_l = [] + zeros_l = [] g_idx_l = [] sort_indices_l = [] for i in range(w.shape[0]): - test_perm = torch.randperm(k) - w_ref, qweight, scales, g_idx, sort_indices, _ = marlin_quantize( - w[i].transpose(1, 0), quant_type, group_size, act_order, test_perm) - w_ref_l.append(w_ref) - qweights_l.append(qweight) - scales_l.append(scales) - g_idx_l.append(g_idx) - sort_indices_l.append(sort_indices) + if has_zp: + w_ref, qweight, scales, zeros = awq_marlin_quantize( + w[i].transpose(1, 0), quant_type, group_size) + + w_ref_l.append(w_ref.T) + qweight_l.append(qweight) + scales_l.append(scales) + zeros_l.append(zeros) + else: + test_perm = torch.randperm(k) + w_ref, qweight, scales, g_idx, sort_indices, _ = marlin_quantize( + w[i].transpose(1, 0), quant_type, group_size, act_order, + test_perm) + + w_ref_l.append(w_ref.T) + qweight_l.append(qweight) + scales_l.append(scales) + g_idx_l.append(g_idx) + sort_indices_l.append(sort_indices) w_ref = stack_and_dev(w_ref_l) - qweight = stack_and_dev(qweights_l).contiguous() + qweight = stack_and_dev(qweight_l).contiguous() scales = stack_and_dev(scales_l) - g_idx = stack_and_dev(g_idx_l) - sort_indices = stack_and_dev(sort_indices_l) + g_idx = stack_and_dev(g_idx_l) if g_idx_l else None + zeros = stack_and_dev(zeros_l) if zeros_l else None + sort_indices = stack_and_dev(sort_indices_l) if sort_indices_l else None score = torch.randn((m, e), device="cuda", dtype=dtype) marlin_output = torch.ops.vllm.single_marlin_moe( @@ -497,13 +529,14 @@ def test_single_marlin_moe_multiply( renormalize=False, g_idx=g_idx, sort_indices=sort_indices, + w_zeros=zeros, num_bits=num_bits, is_k_full=is_k_full, ) - torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk) + torch_output = torch_moe_single(a, w_ref, score, topk) - assert compute_max_diff(marlin_output, torch_output) < 1e-2 + torch.testing.assert_close(marlin_output, torch_output, atol=2e-2, rtol=0) def 
test_moe_align_block_size_opcheck(): diff --git a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py new file mode 100644 index 0000000000000..44734e9340aa1 --- /dev/null +++ b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py @@ -0,0 +1,159 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_triton_moe_channel_fp8_kernel.py +import itertools + +import pytest +import torch + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.platforms import current_platform + +if current_platform.get_device_capability() < (9, 0): + pytest.skip("FP8 Triton requires CUDA 9.0 or higher", + allow_module_level=True) + + +def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16): + """Matrix multiplication function that supports per-token input + quantization and per-column weight quantization""" + A = A.to(torch.float32) + B = B.to(torch.float32) + + assert A.shape[-1] == B.shape[-1], "Dimension mismatch" + assert B.ndim == 2 and B.is_contiguous( + ), "B must be a 2D contiguous tensor" + + # Reshape input + M = A.numel() // A.shape[-1] + B = B.t() # Transpose weight matrix + N, K = B.shape + origin_C_shape = A.shape[:-1] + (K, ) + A = A.reshape(M, N) + + # As is per-token [M, 1], Bs is per-column [1, K] + C = torch.matmul(A, B) # [M, K] + C = As * C * Bs.view(1, -1) # Broadcast per-column scale + + return C.reshape(origin_C_shape).to(output_dtype) + + +def fp8_mask(a, mask): + dtype = a.dtype + return a.view(torch.int8)[mask].view(dtype) + + +def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk): + """This function performs fused moe with per-column int8 + quantization using native torch.""" + + B, D = a.shape + # Perform per-token quantization + a_q, a_s = ops.scaled_fp8_quant(a, use_per_token_if_dynamic=True) + # Repeat tokens to 
match topk + a_q = a_q.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + # Also repeat the scale + a_s = a_s.view(B, -1, 1).repeat(1, topk, 1).reshape(-1, 1) # [B*topk, 1] + + out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device) + + # Calculate routing + score = torch.softmax(score, dim=-1, dtype=torch.float32) + topk_weight, topk_ids = torch.topk(score, topk) + topk_weight = topk_weight.view(-1) + topk_ids = topk_ids.view(-1) + # Process each expert + for i in range(w1.shape[0]): + mask = topk_ids == i + if mask.sum(): + # First MLP layer: note that a_s is now per-token + inter_out = native_w8a8_per_token_matmul( + fp8_mask(a_q, mask), + w1[i], + fp8_mask(a_s, mask), + w1_s[i], + output_dtype=a.dtype, + ) + # Activation function + act_out = SiluAndMul().forward_native(inter_out) + # Quantize activation output with per-token + act_out_q, act_out_s = ops.scaled_fp8_quant( + act_out, use_per_token_if_dynamic=True) + + # Second MLP layer + out[mask] = native_w8a8_per_token_matmul(act_out_q, + w2[i], + act_out_s, + w2_s[i], + output_dtype=a.dtype) + # Apply routing weights and sum + return (out.view(B, -1, w2.shape[1]) * + topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) + + +@pytest.fixture(autouse=True, scope="module") +def setup_cuda(): + """Sets the default CUDA device for all tests in this module.""" + torch.set_default_device("cuda") + + +DTYPES = [torch.half, torch.bfloat16] +M = [1, 33] +N = [128, 1024] +K = [256, 4096] +E = [8] +TOP_KS = [2, 6] +SEEDS = [0] + + +@pytest.mark.parametrize("M, N, K, E, topk, dtype, seed", + itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS)) +@torch.inference_mode() +def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed): + torch.manual_seed(seed) + # Initialize int8 quantization parameters + factor_for_scale = 1e-2 + finfo = torch.finfo(torch.float8_e4m3fn) + fp8_max = finfo.max + fp8_min = finfo.min + + # Input tensor + # M * K + a = torch.randn((M, K), dtype=dtype) / 10 + + # Generate int8 
weights + w1_fp32 = (torch.rand((E, 2 * N, K), dtype=torch.float32) - 0.5) * 2 + w1 = (w1_fp32 * fp8_max).clamp(min=fp8_min, + max=fp8_max).to(torch.float8_e4m3fn) + + w2_fp32 = (torch.rand((E, K, N), dtype=torch.float32) - 0.5) * 2 + w2 = (w2_fp32 * fp8_max).clamp(min=fp8_min, + max=fp8_max).to(torch.float8_e4m3fn) + + # Generate scale for each column (per-column quantization) + w1_s = torch.rand(E, 2 * N, device=w1_fp32.device) * factor_for_scale + w2_s = torch.rand(E, K, device=w2_fp32.device) * factor_for_scale + score = torch.randn((M, E), dtype=dtype) + + ref_out = torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk) + out = fused_moe( + a, + w1, + w2, + score, + topk, + renormalize=False, + use_fp8_w8a8=True, # using fp8 + per_channel_quant=True, + w1_scale=w1_s, + w2_scale=w2_s, + block_shape=None, # Not using block quantization + ) + + # Check results + rel_diff = (torch.mean( + torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / + torch.mean(torch.abs(ref_out.to(torch.float32)))) + assert rel_diff < 0.05 diff --git a/tests/kernels/test_allspark_gemm.py b/tests/kernels/quantization/test_allspark_gemm.py similarity index 100% rename from tests/kernels/test_allspark_gemm.py rename to tests/kernels/quantization/test_allspark_gemm.py diff --git a/tests/kernels/test_aqlm.py b/tests/kernels/quantization/test_aqlm.py similarity index 100% rename from tests/kernels/test_aqlm.py rename to tests/kernels/quantization/test_aqlm.py diff --git a/tests/kernels/test_awq.py b/tests/kernels/quantization/test_awq.py similarity index 100% rename from tests/kernels/test_awq.py rename to tests/kernels/quantization/test_awq.py diff --git a/tests/kernels/test_awq_marlin.py b/tests/kernels/quantization/test_awq_marlin.py similarity index 100% rename from tests/kernels/test_awq_marlin.py rename to tests/kernels/quantization/test_awq_marlin.py diff --git a/tests/kernels/test_awq_triton.py b/tests/kernels/quantization/test_awq_triton.py similarity index 100% 
rename from tests/kernels/test_awq_triton.py rename to tests/kernels/quantization/test_awq_triton.py diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py new file mode 100644 index 0000000000000..da594675e924a --- /dev/null +++ b/tests/kernels/quantization/test_block_fp8.py @@ -0,0 +1,449 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Adapted from https://github.com/sgl-project/sglang/pull/2575 +import itertools + +import pytest +import torch + +from tests.kernels.utils_block import native_w8a8_block_matmul +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( + deep_gemm_moe_fp8) +from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk +from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( + moe_align_block_size) +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + per_token_group_quant_fp8, w8a8_block_fp8_matmul) +from vllm.platforms import current_platform + +dg_available = False +try: + import deep_gemm + dg_available = True +except ImportError: + pass + +if current_platform.get_device_capability() < (9, 0): + pytest.skip("FP8 Triton requires CUDA 9.0 or higher", + allow_module_level=True) + +# Test configurations +DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32] +NUM_TOKENS = [7, 83, 2048] +D = [512, 4096, 5120, 13824] +GROUP_SIZE = [64, 128, 256, 512] +M = [1, 7, 8, 83, 84, 512, 2048, 4096] +N = [128, 512, 1024, 4096, 7168, 7748, 13824] +K = [256, 4096, 5120, 3884, 13824, 16384] +# Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8 +# and its hidden size is 7168. 
+M_moe = [1, 2, 7, 83, 128, 512, 2048] +M_moe_dg = [128, 192, 512, 1335, 2048] +N_moe = [128, 256, 1024, 4608] # [13824] +K_moe = [256, 512, 7168] # [13824] +BLOCK_SIZE = [[128, 128]] +E = [2, 8, 16, 24] # [128, 256] +TOP_KS = [1, 2, 6] +OUT_DTYPES = [torch.bfloat16] # [torch.float32, torch.half, torch.bfloat16] +SEEDS = [0] + + +def native_per_token_group_quant_fp8(x, + group_size, + eps=1e-10, + dtype=torch.float8_e4m3fn): + """Function to perform per-token-group quantization on an input tensor + `x` using native torch.""" + assert x.shape[-1] % group_size == 0, ("the last dimension of `x` cannot " + "be divisible by `group_size`") + assert x.is_contiguous(), "`x` is not contiguous" + + finfo = torch.finfo(dtype) + fp8_min = finfo.min + fp8_max = finfo.max + + x_ = x.reshape(x.numel() // group_size, group_size) + amax = x_.abs().max(dim=-1, + keepdim=True)[0].clamp(min=eps).to(torch.float32) + x_s = amax / fp8_max + x_q = (x_ / x_s).clamp(min=fp8_min, max=fp8_max).to(dtype) + x_q = x_q.reshape(x.shape) + x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size, )) + + return x_q, x_s + + +def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape): + """Fused moe with block-wise quantization using native torch.""" + B, D = a.shape + a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device) + score = torch.softmax(score, dim=-1, dtype=torch.float32) + topk_weight, topk_ids = torch.topk(score, topk) + topk_weight = topk_weight.view(-1) + topk_ids = topk_ids.view(-1) + + _, block_k = block_shape[0], block_shape[1] + a_q, a_s = native_per_token_group_quant_fp8(a, block_k) + a_q = a_q.to(torch.float32) + for i in range(w1.shape[0]): + mask = topk_ids == i + if mask.sum(): + inter_out = native_w8a8_block_matmul(a_q[mask], + w1[i], + a_s[mask], + w1_s[i], + block_shape, + output_dtype=a.dtype) + act_out = SiluAndMul().forward_native(inter_out) + act_out_q, act_out_s = 
native_per_token_group_quant_fp8( + act_out, block_k) + act_out = act_out.to(torch.float32) + out[mask] = native_w8a8_block_matmul(act_out_q, + w2[i], + act_out_s, + w2_s[i], + block_shape, + output_dtype=a.dtype) + return (out.view(B, -1, w2.shape[1]) * + topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) + + +# Skip all tests if CUDA is not available +pytest.importorskip("torch.cuda") + + +@pytest.fixture(autouse=True) +def setup_cuda(): + torch.set_default_device("cuda") + + +@pytest.mark.parametrize( + "num_tokens,d,dtype,group_size,seed", + itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS)) +@torch.inference_mode() +def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed): + torch.manual_seed(seed) + x = torch.rand(num_tokens, d, dtype=dtype) + + ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size) + out, scale = per_token_group_quant_fp8(x, group_size) + + assert torch.allclose(out.to(torch.float32), + ref_out.to(torch.float32), + rtol=0.15) + assert torch.allclose(scale, ref_scale) + + +@pytest.mark.parametrize( + "M,N,K,block_size,out_dtype,seed", + itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS)) +@torch.inference_mode() +def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): + torch.manual_seed(seed) + factor_for_scale = 1e-2 + fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_max, fp8_min = fp8_info.max, fp8_info.min + + A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max + A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + + B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max + B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + + block_n, block_k = block_size[0], block_size[1] + n_tiles = (N + block_n - 1) // block_n + k_tiles = (K + block_k - 1) // block_k + + As = torch.rand(M, k_tiles, dtype=torch.float32) * factor_for_scale + Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale + + 
ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, + out_dtype) + out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype) + + rel_diff = (torch.mean( + torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / + torch.mean(torch.abs(ref_out.to(torch.float32)))) + assert rel_diff < 0.001 + + +@pytest.mark.parametrize( + "M,N,K,E,topk,block_size,dtype,seed", + itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, BLOCK_SIZE, DTYPES, + SEEDS)) +@torch.inference_mode() +def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): + if topk > E: + pytest.skip(f"Skipping test; topk={topk} > E={E}") + + torch.manual_seed(seed) + factor_for_scale = 1e-2 + fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_max, fp8_min = fp8_info.max, fp8_info.min + + a = torch.randn((M, K), dtype=dtype) / 10 + + w1_bf16 = (torch.rand( + (E, 2 * N, K), dtype=torch.bfloat16) - 0.5) * 2 * fp8_max + w1 = w1_bf16.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + del w1_bf16 + + w2_bf16 = (torch.rand((E, K, N), dtype=torch.bfloat16) - 0.5) * 2 * fp8_max + w2 = w2_bf16.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + del w2_bf16 + + block_n, block_k = block_size[0], block_size[1] + n_tiles_w1 = (2 * N + block_n - 1) // block_n + n_tiles_w2 = (K + block_n - 1) // block_n + k_tiles_w1 = (K + block_k - 1) // block_k + k_tiles_w2 = (N + block_k - 1) // block_k + + w1_s = torch.rand( + (E, n_tiles_w1, k_tiles_w1), dtype=torch.float32) * factor_for_scale + w2_s = torch.rand( + (E, n_tiles_w2, k_tiles_w2), dtype=torch.float32) * factor_for_scale + + score = torch.randn((M, E), dtype=dtype) + + # Set the context to avoid lots of warning spam. 
+ vllm_config = VllmConfig() + with set_current_vllm_config(vllm_config): + out = fused_moe( + a, + w1, + w2, + score, + topk, + renormalize=False, + use_fp8_w8a8=True, + w1_scale=w1_s, + w2_scale=w2_s, + block_shape=block_size, + ) + ref_out = torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, + block_size) + + #print(f"{out.sum()=}") + #print(f"{ref_out.sum()=}") + + rel_diff = (torch.mean( + torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / + torch.mean(torch.abs(ref_out.to(torch.float32)))) + assert rel_diff < 0.03 + + +def per_block_cast_to_fp8( + x: torch.Tensor, + block_size_n: int = 128) -> tuple[torch.Tensor, torch.Tensor]: + assert x.dim() == 2 + m, n = x.shape + x_padded = torch.zeros( + (deep_gemm.ceil_div(m, 128) * 128, + deep_gemm.ceil_div(n, block_size_n) * block_size_n), + dtype=x.dtype, + device=x.device) + x_padded[:m, :n] = x + x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, block_size_n) + x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) + x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) + x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous() + scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2)) + return x_scaled_sub, scales + + +@pytest.mark.parametrize( + "M,N,K,block_size,out_dtype,seed", + itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS)) +@torch.inference_mode() +def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed): + # only aligned sizes + if M % 4 != 0 or K % 128 != 0 or N % 64 != 0: + pytest.skip(f"Skipping test; invalid size {M}, {N}, {K}") + + torch.manual_seed(seed) + fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_max = fp8_info.max + + A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max + B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max + + _, block_k = block_size[0], block_size[1] + + A_fp8, As_fp8 = per_token_group_quant_fp8(A_fp32, block_k) + B_fp8, Bs_fp8 = 
per_block_cast_to_fp8(B_fp32) + + As = As_fp8.to(torch.float32) + Bs = Bs_fp8.to(torch.float32) + + ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, + out_dtype) + + # Transpose earlier so that the testing will not trigger transposing kernels + As_fp8 = deep_gemm.get_col_major_tma_aligned_tensor(As_fp8) + + out = torch.zeros((M, N), device='cuda', dtype=out_dtype) + + assert As_fp8.shape == (M, (K + 127) // + 128), f"{As_fp8.shape} != {(M, (K + 127) // 128)}" + + deep_gemm.gemm_fp8_fp8_bf16_nt((A_fp8, As_fp8), (B_fp8, Bs_fp8), out) + + rel_diff = (torch.mean( + torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / + torch.mean(torch.abs(ref_out.to(torch.float32)))) + assert rel_diff < 0.001 + + +def fp8_perm(m, idx): + if torch.is_floating_point(m) and torch.finfo(m.dtype).bits == 8: + return m.view(dtype=torch.uint8)[idx, ...].view(dtype=m.dtype) + else: + return m[idx, ...] + + +def _moe_permute(a, a_s, topk_ids, num_groups, topk, block_m): + M, K = a.shape + + sorted_token_ids, m_indices, num_pad = moe_align_block_size( + topk_ids, block_m, num_groups, None, pad_sorted_ids=True) + + num_tokens = topk * M + + sorted_token_ids = sorted_token_ids.clamp(max=num_tokens - 1) + m_indices = torch.repeat_interleave(m_indices, block_m, dim=0) + inv_perm = torch.argsort(sorted_token_ids)[:M * topk] + + a = fp8_perm(a, sorted_token_ids // topk) + if a_s is not None: + a_s = a_s[sorted_token_ids // topk] + + return a, a_s, m_indices, inv_perm + + +def _moe_unpermute(out, inv_perm, topk, K, topk_weight): + M = topk_weight.shape[0] + out = out[inv_perm, ...] 
+ tmp_out = out.view(-1, topk, K) + return (tmp_out * topk_weight.view(M, -1, 1).to(out.dtype)).sum(dim=1) + + +def deep_gemm_w8a8_block_fp8_moe(M, K, a, w1, w2, w1_s, w2_s, score, topk, + block_shape): + """Fused moe with block-wise quantization using DeepGemm grouped gemm.""" + num_groups = w1.shape[0] + M, K = a.shape + N = w2.shape[-1] + + topk_weight, topk_ids = fused_topk(a, score.float(), topk, False) + + block_m = deep_gemm.get_m_alignment_for_contiguous_layout() + + _, block_k = block_shape[0], block_shape[1] + + a_q, a_s = per_token_group_quant_fp8(a, block_m) + + a_q, a_s, m_indices, inv_perm = _moe_permute(a_q, a_s, topk_ids, + num_groups, topk, block_m) + + inter_out = torch.zeros((a_q.shape[0], N * 2), + dtype=torch.bfloat16, + device=a.device) + + deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous((a_q, a_s), (w1, w1_s), + inter_out, m_indices) + + act_out = SiluAndMul().forward_native(inter_out) + act_out_q, act_out_s = per_token_group_quant_fp8(act_out, block_k) + + out = torch.zeros(a_q.shape[0], K, dtype=torch.bfloat16, device=a.device) + + deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous( + (act_out_q, act_out_s), (w2, w2_s), out, m_indices) + + final_out = _moe_unpermute(out, inv_perm, topk, K, topk_weight) + + return final_out + + +@pytest.mark.parametrize( + "M,N,K,E,topk,seed", + itertools.product(M_moe_dg, N_moe, K_moe, E, TOP_KS, SEEDS)) +@pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.") +@torch.inference_mode() +def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed): + + block_m = deep_gemm.get_m_alignment_for_contiguous_layout() + block_size = [block_m, block_m] + dtype = torch.bfloat16 + + # only aligned sizes + if (N % block_m != 0 or K % block_m != 0 or topk > E): + pytest.skip( + f"Skipping test; bad size m={M}, n={N}, k={K}, topk={topk}, E={E}") + + if N <= 512: + pytest.skip("Skipping N <= 512 until performance issues solved.") + + vllm_config = VllmConfig() + + torch.manual_seed(seed) 
+ fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_max, fp8_min = fp8_info.max, fp8_info.min + + a = torch.randn((M, K), dtype=dtype) / 10 + + w1_bf16 = ((torch.rand((E, 2 * N, K), dtype=torch.bfloat16) - 0.5) * 2 * + fp8_max).clamp(min=fp8_min, max=fp8_max) + + w2_bf16 = ((torch.rand((E, K, N), dtype=torch.bfloat16) - 0.5) * 2 * + fp8_max).clamp(min=fp8_min, max=fp8_max) + + score = torch.randn((M, E), dtype=dtype) + + block_n, block_k = block_size[0], block_size[1] + n_tiles_w1 = ((2 * N) + block_n - 1) // block_n + k_tiles_w1 = (K + block_k - 1) // block_k + n_tiles_w2 = (K + block_n - 1) // block_n + k_tiles_w2 = (N + block_k - 1) // block_k + + w1 = torch.empty_like(w1_bf16, dtype=torch.float8_e4m3fn) + w2 = torch.empty_like(w2_bf16, dtype=torch.float8_e4m3fn) + + w1_s = torch.empty((E, n_tiles_w1, k_tiles_w1), dtype=torch.float32) + w2_s = torch.empty((E, n_tiles_w2, k_tiles_w2), dtype=torch.float32) + + w1_s = deep_gemm.get_col_major_tma_aligned_tensor(w1_s).contiguous() + w2_s = deep_gemm.get_col_major_tma_aligned_tensor(w2_s).contiguous() + + assert w1_s.shape == (E, (2 * N + 127) // 128, (K + 127) // 128) + assert (w2.shape[-2] + block_n - 1) // block_n == w2_s.shape[-2] + + for i in range(E): + w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i]) + w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i]) + + # Set the context to avoid lots of warning spam. 
+ with set_current_vllm_config(vllm_config): + if M >= 128: + ref_out = deep_gemm_w8a8_block_fp8_moe(M, K, a, w1, w2, w1_s, w2_s, + score, topk, block_size) + else: + ref_out = torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, + topk, block_size) + + topk_weights, topk_ids = fused_topk(a, score.float(), topk, False) + + out = deep_gemm_moe_fp8(a, w1, w2, w1_s, w2_s, topk_weights, topk_ids) + + #print(f"{out.sum()=}") + #print(f"{ref_out.sum()=}") + + rel_diff = (torch.mean( + torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / + torch.mean(torch.abs(ref_out.to(torch.float32)))) + + assert rel_diff < 0.03 diff --git a/tests/kernels/quantization/test_block_int8.py b/tests/kernels/quantization/test_block_int8.py new file mode 100644 index 0000000000000..943470ad113d1 --- /dev/null +++ b/tests/kernels/quantization/test_block_int8.py @@ -0,0 +1,198 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_block_int8.py +import itertools + +import pytest +import torch + +from tests.kernels.utils_block import native_w8a8_block_matmul +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.quantization.utils.int8_utils import ( + w8a8_block_int8_matmul) +from vllm.platforms import current_platform + +if current_platform.get_device_capability() < (7, 0): + pytest.skip("INT8 Triton requires CUDA 7.0 or higher", + allow_module_level=True) + + +# For test +def native_per_token_group_quant_int8(x, + group_size, + eps=1e-10, + dtype=torch.int8): + """Function to perform per-token-group quantization on an input tensor + `x` using native torch. + + It converts the tensor values into int8 values and returns the + quantized tensor along with the scaling factor used for quantization. 
+ """ + assert (x.shape[-1] % group_size == 0 + ), "the last dimension of `x` cannot be divisible by `group_size`" + assert x.is_contiguous(), "`x` is not contiguous" + + iinfo = torch.iinfo(dtype) + int8_min = iinfo.min + int8_max = iinfo.max + + x_ = x.reshape(x.numel() // group_size, group_size) + # Use float32 for scale calculation for stability + amax = x_.abs().max(dim=-1, + keepdim=True)[0].clamp(min=eps).to(torch.float32) + x_s = amax / int8_max + x_q = (x_.to(torch.float32) / x_s).round().clamp( + min=int8_min, max=int8_max).to(dtype) # Round before clamping + x_q = x_q.reshape(x.shape) + x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size, )) + + return x_q, x_s + + +# For test +def torch_w8a8_block_int8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape): + """This function performs fused moe with block-wise quantization using + native torch.""" + B, D = a.shape + a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device) + score = torch.softmax(score, dim=-1, dtype=torch.float32) + topk_weight, topk_ids = torch.topk(score, topk) + topk_weight = topk_weight.view(-1) + topk_ids = topk_ids.view(-1) + + _, block_k = block_shape[0], block_shape[1] + a_q, a_s = native_per_token_group_quant_int8(a, block_k) + for i in range(w1.shape[0]): + mask = topk_ids == i + if mask.sum(): + inter_out = native_w8a8_block_matmul(a_q[mask], + w1[i], + a_s[mask], + w1_s[i], + block_shape, + output_dtype=a.dtype) + act_out = SiluAndMul().forward_native(inter_out) + act_out_q, act_out_s = native_per_token_group_quant_int8( + act_out, block_k) + act_out = act_out.to(torch.float32) + out[mask] = native_w8a8_block_matmul(act_out_q, + w2[i], + act_out_s, + w2_s[i], + block_shape, + output_dtype=a.dtype) + return (out.view(B, -1, w2.shape[1]) * + topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) + + +DTYPES = [torch.half, torch.bfloat16] +M = [1, 33, 64, 222] +N = [128, 1024] +K = [256, 4096] +E = [8, 
24] +TOP_KS = [2, 6] +# BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]] +BLOCK_SIZE = [[128, 128]] +SEEDS = [0] + + +@pytest.fixture(autouse=True, scope="module") +def setup_cuda(): + """Sets the default CUDA device for all tests in this module.""" + torch.set_default_device("cuda") + + +@pytest.mark.parametrize("M,N,K,block_size,out_dtype,seed", + itertools.product(M, N, K, BLOCK_SIZE, DTYPES, SEEDS)) +@torch.inference_mode() +def test_w8a8_block_int8_matmul(M, N, K, block_size, out_dtype, seed): + torch.manual_seed(seed) + factor_for_scale = 1e-2 + int8_info = torch.iinfo(torch.int8) + int8_max, int8_min = int8_info.max, int8_info.min + + A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * int8_max + A_fp8 = A_fp32.clamp(min=int8_min, max=int8_max).to(torch.float8_e4m3fn) + + B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * int8_max + B_fp8 = B_fp32.clamp(min=int8_min, max=int8_max).to(torch.float8_e4m3fn) + + block_n, block_k = block_size[0], block_size[1] + n_tiles = (N + block_n - 1) // block_n + k_tiles = (K + block_k - 1) // block_k + + As = torch.rand(M, k_tiles, dtype=torch.float32) * factor_for_scale + Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale + + ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, + out_dtype) + out = w8a8_block_int8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype) + + rel_diff = (torch.mean( + torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / + torch.mean(torch.abs(ref_out.to(torch.float32)))) + assert rel_diff < 0.001 + + +@pytest.mark.parametrize( + "M, N, K, E, topk, block_size, dtype, seed", + itertools.product(M, N, K, E, TOP_KS, BLOCK_SIZE, DTYPES, SEEDS)) +@torch.inference_mode() +def test_w8a8_block_int8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): + """Tests the fused_moe kernel with W8A8 INT8 block quantization against a + native torch reference.""" + torch.manual_seed(seed) + # Use a smaller factor for scale 
initialization to prevent large + # values/overflow especially when output dtype might be float16 + factor_for_scale = 1e-2 + int8_info = torch.iinfo(torch.int8) + int8_max, int8_min = int8_info.max, int8_info.min + + a = torch.randn((M, K), dtype=dtype) / 10 + + w1_fp32 = (torch.rand( + (E, 2 * N, K), dtype=torch.float32) - 0.5) * 2 * int8_max + w1 = w1_fp32.clamp(min=int8_min, max=int8_max).to(torch.int8) + + w2_fp32 = (torch.rand((E, K, N), dtype=torch.float32) - 0.5) * 2 * int8_max + w2 = w2_fp32.clamp(min=int8_min, max=int8_max).to(torch.int8) + + block_n, block_k = block_size[0], block_size[1] + n_tiles_w1 = (2 * N + block_n - 1) // block_n + n_tiles_w2 = (K + block_n - 1) // block_n + k_tiles_w1 = (K + block_k - 1) // block_k + k_tiles_w2 = (N + block_k - 1) // block_k + + w1_s = (torch.rand( + (E, n_tiles_w1, k_tiles_w1), dtype=torch.float32) * factor_for_scale) + w2_s = (torch.rand( + (E, n_tiles_w2, k_tiles_w2), dtype=torch.float32) * factor_for_scale) + + score = torch.randn((M, E), dtype=dtype) + + # Set the context to avoid lots of warning spam. 
+ vllm_config = VllmConfig() + with set_current_vllm_config(vllm_config): + out = fused_moe( + a, + w1, + w2, + score, + topk, + renormalize=False, + use_int8_w8a8=True, + w1_scale=w1_s, + w2_scale=w2_s, + block_shape=block_size, + ) + ref_out = torch_w8a8_block_int8_moe(a, w1, w2, w1_s, w2_s, score, topk, + block_size) + + # Check results + rel_diff = (torch.mean( + torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / + torch.mean(torch.abs(ref_out.to(torch.float32)))) + assert rel_diff < 0.06 diff --git a/tests/kernels/test_cutlass_2of4_sparse.py b/tests/kernels/quantization/test_cutlass_2of4_sparse.py similarity index 99% rename from tests/kernels/test_cutlass_2of4_sparse.py rename to tests/kernels/quantization/test_cutlass_2of4_sparse.py index 2890e15d6cbaf..d67d2dbb89981 100644 --- a/tests/kernels/test_cutlass_2of4_sparse.py +++ b/tests/kernels/quantization/test_cutlass_2of4_sparse.py @@ -7,13 +7,12 @@ Run `pytest tests/kernels/test_semi_structured.py`. import pytest import torch +from tests.kernels.utils import baseline_scaled_mm, to_fp8, to_int8 from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( sparse_cutlass_supported) from vllm.platforms import current_platform -from .utils import baseline_scaled_mm, to_fp8, to_int8 - CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py similarity index 79% rename from tests/kernels/test_cutlass.py rename to tests/kernels/quantization/test_cutlass_scaled_mm.py index 72fc660a653d5..8084d9bf2c2da 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -3,17 +3,16 @@ Run `pytest tests/kernels/test_cutlass.py`. 
""" +import random import pytest import torch -from tests.kernels.utils import opcheck +from tests.kernels.utils import baseline_scaled_mm, opcheck, to_fp8, to_int8 from vllm import _custom_ops as ops from vllm.platforms import current_platform from vllm.utils import cdiv -from .utils import baseline_scaled_mm, to_fp8, to_int8 - MNK_FACTORS = [ (1, 256, 128), (1, 16384, 1024), @@ -507,3 +506,136 @@ def test_cutlass_cuda_graph(per_act_token: bool, per_out_ch: bool): def test_cutlass_support_opcheck(): opcheck(torch.ops._C.cutlass_scaled_mm_supports_fp8, (capability, )) + + +@pytest.mark.parametrize("num_experts", [8, 64]) +@pytest.mark.parametrize("per_act_token", [True, False]) +@pytest.mark.parametrize("per_out_ch", [True, False]) +@pytest.mark.parametrize("use_bias", [False]) +@pytest.mark.skipif( + (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( + current_platform.get_device_capability()), + reason="Grouped gemm is not supported on this GPU type.") +def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool, + per_out_ch: bool, use_bias: bool): + + # Device and dtype setup + device = "cuda" + out_dtype = torch.half + + # Create separate A, B, C tensors for each group + a_tensors = [] + b_tensors = [] + a_scales_tensors = [] + b_scales_tensors = [] + baseline_tensors = [] + + expert_offsets = torch.zeros((num_experts + 1), + device=device, + dtype=torch.int32) + + problem_sizes = torch.zeros((num_experts, 3), + device=device, + dtype=torch.int32) + + if not per_act_token: + one_scale_a = torch.randn((1, 1), device=device, dtype=torch.float32) + + alignment = 16 # 128 // 8 + # For variation, each group has dimensions + n_g = alignment * random.randint(1, 64) + k_g = alignment * random.randint(1, 64) + for g in range(num_experts): + m_g = alignment * random.randint(1, 64) + + expert_offsets[g + 1] = expert_offsets[g] + m_g + problem_sizes[g][0] = m_g + problem_sizes[g][1] = n_g + problem_sizes[g][2] = k_g + + m_a_scales = m_g 
if per_act_token else 1 + n_b_scales = n_g if per_out_ch else 1 + + print("shape:", m_g, n_g, k_g) + + # Create group-specific A and B (FP8) and output (FP16/FP32) + a_g = to_fp8(torch.randn((m_g, k_g), device=device)) + b_g = to_fp8(torch.randn((n_g, k_g), device=device).t()) + a_tensors.append(a_g) + b_tensors.append(b_g) + + # Set up A/B scales + scale_b = torch.randn((1, n_b_scales), + device=device, + dtype=torch.float32) + b_scales_tensors.append(scale_b) + + if per_act_token: + scale_a = torch.randn((m_a_scales, 1), + device=device, + dtype=torch.float32) + a_scales_tensors.append(scale_a) + else: + scale_a = one_scale_a + + # Compute baseline result for this group + baseline_g = baseline_scaled_mm(a_g, b_g, scale_a, scale_b, out_dtype, + None) + baseline_tensors.append(baseline_g) + + a_tensors_stacked = torch.empty((expert_offsets[num_experts], k_g), + device=device, + dtype=torch.float8_e4m3fn) + b_tensors_stacked = torch.empty((num_experts, n_g, k_g), + device=device, + dtype=torch.float8_e4m3fn) + + for g in range(num_experts): + a_tensors_stacked[expert_offsets[g]:expert_offsets[g + + 1]] = a_tensors[g] + b_tensors_stacked[g] = b_tensors[g].t() + b_tensors_stacked = b_tensors_stacked.transpose(1, 2) + + if per_act_token: + a_scales_tensors_stacked = torch.empty( + (expert_offsets[num_experts], 1), + device=device, + dtype=torch.float32) + for g in range(num_experts): + a_scales_tensors_stacked[ + expert_offsets[g]:expert_offsets[g + 1]] = a_scales_tensors[g] + else: + a_scales_tensors_stacked = one_scale_a + + b_scales_tensors_stacked = torch.empty((num_experts, n_b_scales), + device=device, + dtype=torch.float32) + for g in range(num_experts): + b_scales_tensors_stacked[g] = b_scales_tensors[g] + + out_tensors_stacked = torch.zeros((expert_offsets[num_experts], n_g), + device=device, + dtype=out_dtype) + + ab_strides = torch.full((num_experts, ), + a_tensors_stacked.stride(0), + device="cuda", + dtype=torch.int64) + c_strides = 
torch.full((num_experts, ), + out_tensors_stacked.stride(0), + device="cuda", + dtype=torch.int64) + + ops.cutlass_moe_mm(out_tensors_stacked, a_tensors_stacked, + b_tensors_stacked, a_scales_tensors_stacked, + b_scales_tensors_stacked, expert_offsets[:-1], + problem_sizes, ab_strides, ab_strides, c_strides) + + # Validate each group's result against the baseline + for g in range(num_experts): + baseline = baseline_tensors[g] + c = out_tensors_stacked[expert_offsets[g]:expert_offsets[g + 1]] + print(baseline) + print(c) + print("*") + torch.testing.assert_close(c, baseline, rtol=1e-2, atol=5e-4) diff --git a/tests/kernels/test_fp8_quant.py b/tests/kernels/quantization/test_fp8_quant.py similarity index 100% rename from tests/kernels/test_fp8_quant.py rename to tests/kernels/quantization/test_fp8_quant.py diff --git a/tests/kernels/test_ggml.py b/tests/kernels/quantization/test_ggml.py similarity index 93% rename from tests/kernels/test_ggml.py rename to tests/kernels/quantization/test_ggml.py index 23fa1fdfda179..cc157da518cbf 100644 --- a/tests/kernels/test_ggml.py +++ b/tests/kernels/quantization/test_ggml.py @@ -15,7 +15,8 @@ def test_ggml_opcheck(quant_type): qweight = torch.randint(0, 100, shape, device='cuda', dtype=torch.uint8) m = qweight.shape[0] n = qweight.shape[1] // type_size * block_size - opcheck(torch.ops._C.ggml_dequantize, (qweight, quant_type, m, n)) + opcheck(torch.ops._C.ggml_dequantize, + (qweight, quant_type, m, n, torch.float16)) x = torch.rand((m, 512), device='cuda', dtype=torch.float16) opcheck(torch.ops._C.ggml_mul_mat_a8, diff --git a/tests/kernels/test_gguf.py b/tests/kernels/quantization/test_gguf.py similarity index 98% rename from tests/kernels/test_gguf.py rename to tests/kernels/quantization/test_gguf.py index ede941844dc0e..4c0fae9d9fd75 100644 --- a/tests/kernels/test_gguf.py +++ b/tests/kernels/quantization/test_gguf.py @@ -65,7 +65,7 @@ QUANT_TYPES = [ @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 
-@pytest.mark.parametrize("dtype", [torch.half]) +@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("quant_type", QUANT_TYPES) @torch.inference_mode() def test_dequantize(hidden_size: int, dtype: torch.dtype, @@ -78,7 +78,7 @@ def test_dequantize(hidden_size: int, dtype: torch.dtype, ref_output = torch.tensor(dequantize(tensor.data, quant_type), device="cuda").to(dtype) output = ops.ggml_dequantize(torch.tensor(tensor.data, device="cuda"), - quant_type, *list(shape)).to(dtype) + quant_type, *list(shape), dtype) torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=4e-2) diff --git a/tests/kernels/test_gptq.py b/tests/kernels/quantization/test_gptq.py similarity index 100% rename from tests/kernels/test_gptq.py rename to tests/kernels/quantization/test_gptq.py diff --git a/tests/kernels/quantization/test_int8_kernel.py b/tests/kernels/quantization/test_int8_kernel.py new file mode 100644 index 0000000000000..4c7543527c323 --- /dev/null +++ b/tests/kernels/quantization/test_int8_kernel.py @@ -0,0 +1,149 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_int8_kernel.py +import itertools + +import pytest +import torch + +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.quantization.utils.int8_utils import ( + per_token_quant_int8) +from vllm.platforms import current_platform + +if current_platform.get_device_capability() < (7, 0): + pytest.skip("INT8 Triton requires CUDA 7.0 or higher", + allow_module_level=True) + + +def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16): + """Matrix multiplication function that supports per-token input + quantization and per-column weight quantization""" + A = A.to(torch.float32) + B = B.to(torch.float32) + + assert A.shape[-1] == B.shape[-1], "Dimension mismatch" + assert B.ndim == 2 and B.is_contiguous( + ), "B 
must be a 2D contiguous tensor" + + # Reshape input + M = A.numel() // A.shape[-1] + B = B.t() # Transpose weight matrix + N, K = B.shape + origin_C_shape = A.shape[:-1] + (K, ) + A = A.reshape(M, N) + + # As is per-token [M, 1], Bs is per-column [1, K] + C = torch.matmul(A, B) # [M, K] + C = As * C * Bs.view(1, -1) # Broadcast per-column scale + + return C.reshape(origin_C_shape).to(output_dtype) + + +def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk): + """This function performs fused moe with per-column int8 quantization + using native torch.""" + + B, D = a.shape + # Perform per-token quantization + a_q, a_s = per_token_quant_int8(a) + # Repeat tokens to match topk + a_q = a_q.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + # Also repeat the scale + a_s = a_s.view(B, -1, 1).repeat(1, topk, 1).reshape(-1, 1) # [B*topk, 1] + + out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device) + + # Calculate routing + score = torch.softmax(score, dim=-1, dtype=torch.float32) + topk_weight, topk_ids = torch.topk(score, topk) + topk_weight = topk_weight.view(-1) + topk_ids = topk_ids.view(-1) + # Process each expert + for i in range(w1.shape[0]): + mask = topk_ids == i + if mask.sum(): + # First MLP layer: note that a_s is now per-token + inter_out = native_w8a8_per_token_matmul(a_q[mask], + w1[i], + a_s[mask], + w1_s[i], + output_dtype=a.dtype) + # Activation function + act_out = SiluAndMul().forward_native(inter_out) + # Quantize activation output with per-token + act_out_q, act_out_s = per_token_quant_int8(act_out) + + # Second MLP layer + out[mask] = native_w8a8_per_token_matmul(act_out_q, + w2[i], + act_out_s, + w2_s[i], + output_dtype=a.dtype) + # Apply routing weights and sum + return (out.view(B, -1, w2.shape[1]) * + topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) + + +@pytest.fixture(autouse=True, scope="module") +def setup_cuda(): + """Sets the default CUDA device for all tests in this module.""" + 
torch.set_default_device("cuda") + + +DTYPES = [torch.half, torch.bfloat16] +M = [1, 33] +N = [128, 1024] +K = [256, 4096] +E = [8] +TOP_KS = [2, 6] +SEEDS = [0] + + +@pytest.mark.parametrize("M, N, K, E, topk, dtype, seed", + itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS)) +@torch.inference_mode() +def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed): + torch.manual_seed(seed) + # Initialize int8 quantization parameters + factor_for_scale = 1e-2 + int8_max = 127 + int8_min = -128 + + # Input tensor + # M * K + a = torch.randn((M, K), dtype=dtype) / 10 + + # Generate int8 weights + w1_fp32 = (torch.rand((E, 2 * N, K), dtype=torch.float32) - 0.5) * 2 + w1 = (w1_fp32 * int8_max).clamp(min=int8_min, max=int8_max).to(torch.int8) + + w2_fp32 = (torch.rand((E, K, N), dtype=torch.float32) - 0.5) * 2 + w2 = (w2_fp32 * int8_max).clamp(min=int8_min, max=int8_max).to(torch.int8) + + # Generate scale for each column (per-column quantization) + w1_s = torch.rand(E, 2 * N, device=w1_fp32.device) * factor_for_scale + w2_s = torch.rand(E, K, device=w2_fp32.device) * factor_for_scale + score = torch.randn((M, E), dtype=dtype) + + ref_out = torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk) + out = fused_moe( + a, + w1, + w2, + score, + topk, + renormalize=False, + use_int8_w8a8=True, # Using int8-w8a8 + per_channel_quant=True, + w1_scale=w1_s, + w2_scale=w2_s, + block_shape=None, # Not using block quantization + ) + + # Check results + rel_diff = (torch.mean( + torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / + torch.mean(torch.abs(ref_out.to(torch.float32)))) + assert rel_diff < 0.05 diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/quantization/test_int8_quant.py similarity index 100% rename from tests/kernels/test_int8_quant.py rename to tests/kernels/quantization/test_int8_quant.py diff --git a/tests/kernels/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py similarity index 100% rename from 
tests/kernels/test_machete_mm.py rename to tests/kernels/quantization/test_machete_mm.py diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py similarity index 100% rename from tests/kernels/test_marlin_gemm.py rename to tests/kernels/quantization/test_marlin_gemm.py diff --git a/tests/kernels/test_nvfp4_quant.py b/tests/kernels/quantization/test_nvfp4_quant.py similarity index 100% rename from tests/kernels/test_nvfp4_quant.py rename to tests/kernels/quantization/test_nvfp4_quant.py diff --git a/tests/kernels/test_nvfp4_scaled_mm.py b/tests/kernels/quantization/test_nvfp4_scaled_mm.py similarity index 100% rename from tests/kernels/test_nvfp4_scaled_mm.py rename to tests/kernels/quantization/test_nvfp4_scaled_mm.py diff --git a/tests/kernels/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py similarity index 100% rename from tests/kernels/test_triton_scaled_mm.py rename to tests/kernels/quantization/test_triton_scaled_mm.py diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py deleted file mode 100644 index a51e70d45ee0c..0000000000000 --- a/tests/kernels/test_attention_selector.py +++ /dev/null @@ -1,136 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from unittest.mock import patch - -import pytest -import torch - -from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend -from vllm.platforms.cpu import CpuPlatform -from vllm.platforms.cuda import CudaPlatform -from vllm.platforms.rocm import RocmPlatform -from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL - - -@pytest.fixture(autouse=True) -def clear_cache(): - """Clear lru cache to ensure each test case runs without caching. 
- """ - _cached_get_attn_backend.cache_clear() - - -@pytest.mark.parametrize( - "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"]) -@pytest.mark.parametrize("use_v1", [True, False]) -@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"]) -def test_env( - name: str, - use_v1: bool, - device: str, - monkeypatch: pytest.MonkeyPatch, -): - """Test that the attention selector can be set via environment variable. - Note that we do not test FlashAttn because it is the default backend. - """ - - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") - m.setenv(STR_BACKEND_ENV_VAR, name) - - if device == "cpu": - with patch("vllm.attention.selector.current_platform", - CpuPlatform()): - backend = get_attn_backend(16, torch.float16, torch.float16, - 16, False) - assert backend.get_name() == "TORCH_SDPA" - elif device == "hip": - with patch("vllm.attention.selector.current_platform", - RocmPlatform()): - backend = get_attn_backend(16, torch.float16, torch.float16, - 16, False) - EXPECTED = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" - assert backend.get_name() == EXPECTED - else: - if name in ["XFORMERS", "FLASHINFER"]: - with patch("vllm.attention.selector.current_platform", - CudaPlatform()): - backend = get_attn_backend(16, torch.float16, - torch.float16, 16, False) - EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name - assert backend.get_name() == EXPECTED - - -def test_flash_attn(monkeypatch: pytest.MonkeyPatch): - """Test FlashAttn validation.""" - # TODO: When testing for v1, pipe in `use_v1` as an argument to - # get_attn_backend - - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL) - - # Unsupported CUDA arch - monkeypatch.setattr(torch.cuda, "get_device_capability", lambda: - (7, 5)) - backend = get_attn_backend(16, torch.float16, None, 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL - - # Reset the monkeypatch for subsequent tests - monkeypatch.undo() - - # 
Unsupported data type - backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL - - # Unsupported kv cache data type - backend = get_attn_backend(16, torch.float16, "fp8", 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL - - # Unsupported block size - backend = get_attn_backend(16, torch.float16, None, 8, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL - - # flash-attn is not installed - import sys - original_module = sys.modules.get('vllm_flash_attn') - monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None) - backend = get_attn_backend(16, torch.float16, None, 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL - - # Restore the original module if it existed - if original_module is not None: - monkeypatch.setitem(sys.modules, 'vllm_flash_attn', - original_module) - else: - monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False) - - # Unsupported head size - backend = get_attn_backend(17, torch.float16, None, 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL - - # Attention-free models should bypass env and use PlaceholderAttention - backend = get_attn_backend(16, torch.float16, torch.float16, 16, True) - assert backend.get_name() != STR_FLASH_ATTN_VAL - - -@pytest.mark.parametrize("use_v1", [True, False]) -def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch): - - with monkeypatch.context() as m, patch( - "vllm.attention.selector.current_platform", CudaPlatform()): - m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") - m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) - - # Test with head size 32 - backend = get_attn_backend(32, torch.float16, None, 16, False) - EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN" - assert backend.get_name() == EXPECTED - - # when block size == 16, backend will fall back to XFORMERS - # this behavior is not yet supported on V1. - if use_v1: - # TODO: support fallback on V1! 
- # https://github.com/vllm-project/vllm/issues/14524 - pass - else: - backend = get_attn_backend(16, torch.float16, None, 16, False) - assert backend.get_name() == "XFORMERS" diff --git a/tests/kernels/test_block_fp8.py b/tests/kernels/test_block_fp8.py deleted file mode 100644 index 6206cbd5f76f7..0000000000000 --- a/tests/kernels/test_block_fp8.py +++ /dev/null @@ -1,270 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -# Adapted from https://github.com/sgl-project/sglang/pull/2575 -import itertools - -import pytest -import torch - -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe import fused_moe -from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - per_token_group_quant_fp8, w8a8_block_fp8_matmul) -from vllm.platforms import current_platform - -if current_platform.get_device_capability() < (9, 0): - pytest.skip("FP8 Triton requires CUDA 9.0 or higher", - allow_module_level=True) - -# Test configurations -DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32] -NUM_TOKENS = [7, 83, 2048] -D = [512, 4096, 5120, 13824] -GROUP_SIZE = [64, 128, 256, 512] -M = [1, 7, 83, 512, 2048] -N = [128, 512, 1024, 4096, 7748, 13824] -K = [256, 4096, 5120, 3884, 13824] -# Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8 -# and its hidden size is 7168. 
-M_moe = [1, 7, 83, 512, 2048] -N_moe = [4608] # [128, 4608, 13824] -K_moe = [7168] # [256, 7168, 13824] -BLOCK_SIZE = [[128, 128]] -E = [8, 24] # [8, 24, 128, 256] -TOP_KS = [2] # [1, 2, 6] -OUT_DTYPES = [torch.bfloat16] # [torch.float32, torch.half, torch.bfloat16] -SEEDS = [0] - - -def native_per_token_group_quant_fp8(x, - group_size, - eps=1e-10, - dtype=torch.float8_e4m3fn): - """Function to perform per-token-group quantization on an input tensor - `x` using native torch.""" - assert x.shape[-1] % group_size == 0, ("the last dimension of `x` cannot " - "be divisible by `group_size`") - assert x.is_contiguous(), "`x` is not contiguous" - - finfo = torch.finfo(dtype) - fp8_min = finfo.min - fp8_max = finfo.max - - x_ = x.reshape(x.numel() // group_size, group_size) - amax = x_.abs().max(dim=-1, - keepdim=True)[0].clamp(min=eps).to(torch.float32) - x_s = amax / fp8_max - x_q = (x_ / x_s).clamp(min=fp8_min, max=fp8_max).to(dtype) - x_q = x_q.reshape(x.shape) - x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size, )) - - return x_q, x_s - - -def native_w8a8_block_fp8_matmul(A, - B, - As, - Bs, - block_size, - output_dtype=torch.float16): - """Matrix multiplication with block-wise quantization using native torch.""" - A = A.to(torch.float32) - B = B.to(torch.float32) - assert A.shape[-1] == B.shape[-1] - assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2 - assert len(block_size) == 2 - block_n, block_k = block_size[0], block_size[1] - assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1] - assert A.shape[:-1] == As.shape[:-1] - - M = A.numel() // A.shape[-1] - N, K = B.shape - origin_C_shape = A.shape[:-1] + (N, ) - A = A.reshape(M, A.shape[-1]) - As = As.reshape(M, As.shape[-1]) - n_tiles = (N + block_n - 1) // block_n - k_tiles = (K + block_k - 1) // block_k - assert n_tiles == Bs.shape[0] - assert k_tiles == Bs.shape[1] - - C_shape = (M, N) - C = torch.zeros(C_shape, dtype=torch.float32, device=A.device) - - A_tiles = [ - A[:, i * 
block_k:min((i + 1) * block_k, K)] for i in range(k_tiles) - ] - B_tiles = [[ - B[ - j * block_n:min((j + 1) * block_n, N), - i * block_k:min((i + 1) * block_k, K), - ] for i in range(k_tiles) - ] for j in range(n_tiles)] - C_tiles = [ - C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles) - ] - As_tiles = [As[:, i:i + 1] for i in range(k_tiles)] - - for i in range(k_tiles): - for j in range(n_tiles): - a = A_tiles[i] - b = B_tiles[j][i] - c = C_tiles[j] - s = As_tiles[i] * Bs[j][i] - c[:, :] += torch.matmul(a, b.t()) * s - - C = C.reshape(origin_C_shape).to(output_dtype) - return C - - -def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape): - """Fused moe with block-wise quantization using native torch.""" - B, D = a.shape - a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) - out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device) - score = torch.softmax(score, dim=-1, dtype=torch.float32) - topk_weight, topk_ids = torch.topk(score, topk) - topk_weight = topk_weight.view(-1) - topk_ids = topk_ids.view(-1) - - _, block_k = block_shape[0], block_shape[1] - a_q, a_s = native_per_token_group_quant_fp8(a, block_k) - a_q = a_q.to(torch.float32) - for i in range(w1.shape[0]): - mask = topk_ids == i - if mask.sum(): - inter_out = native_w8a8_block_fp8_matmul(a_q[mask], - w1[i], - a_s[mask], - w1_s[i], - block_shape, - output_dtype=a.dtype) - act_out = SiluAndMul().forward_native(inter_out) - act_out_q, act_out_s = native_per_token_group_quant_fp8( - act_out, block_k) - act_out = act_out.to(torch.float32) - out[mask] = native_w8a8_block_fp8_matmul(act_out_q, - w2[i], - act_out_s, - w2_s[i], - block_shape, - output_dtype=a.dtype) - return (out.view(B, -1, w2.shape[1]) * - topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) - - -# Skip all tests if CUDA is not available -pytest.importorskip("torch.cuda") - - -@pytest.fixture(autouse=True) -def setup_cuda(): - torch.set_default_device("cuda") - - 
-@pytest.mark.parametrize( - "num_tokens,d,dtype,group_size,seed", - itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS)) -@torch.inference_mode() -def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed): - torch.manual_seed(seed) - x = torch.rand(num_tokens, d, dtype=dtype) - - ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size) - out, scale = per_token_group_quant_fp8(x, group_size) - - assert torch.allclose(out.to(torch.float32), - ref_out.to(torch.float32), - rtol=0.15) - assert torch.allclose(scale, ref_scale) - - -@pytest.mark.parametrize( - "M,N,K,block_size,out_dtype,seed", - itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS)) -@torch.inference_mode() -def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): - torch.manual_seed(seed) - factor_for_scale = 1e-2 - fp8_info = torch.finfo(torch.float8_e4m3fn) - fp8_max, fp8_min = fp8_info.max, fp8_info.min - - A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max - A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) - - B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max - B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) - - block_n, block_k = block_size[0], block_size[1] - n_tiles = (N + block_n - 1) // block_n - k_tiles = (K + block_k - 1) // block_k - - As = torch.rand(M, k_tiles, dtype=torch.float32) * factor_for_scale - Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale - - ref_out = native_w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, - out_dtype) - out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype) - - rel_diff = (torch.mean( - torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / - torch.mean(torch.abs(ref_out.to(torch.float32)))) - assert rel_diff < 0.001 - - -@pytest.mark.parametrize( - "M,N,K,E,topk,block_size,dtype,seed", - itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, BLOCK_SIZE, DTYPES, - SEEDS)) 
-@torch.inference_mode() -def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): - torch.manual_seed(seed) - factor_for_scale = 1e-2 - fp8_info = torch.finfo(torch.float8_e4m3fn) - fp8_max, fp8_min = fp8_info.max, fp8_info.min - - a = torch.randn((M, K), dtype=dtype) / 10 - - w1_bf16 = (torch.rand( - (E, 2 * N, K), dtype=torch.bfloat16) - 0.5) * 2 * fp8_max - w1 = w1_bf16.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) - del w1_bf16 - - w2_bf16 = (torch.rand((E, K, N), dtype=torch.bfloat16) - 0.5) * 2 * fp8_max - w2 = w2_bf16.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) - del w2_bf16 - - block_n, block_k = block_size[0], block_size[1] - n_tiles_w1 = (2 * N + block_n - 1) // block_n - n_tiles_w2 = (K + block_n - 1) // block_n - k_tiles_w1 = (K + block_k - 1) // block_k - k_tiles_w2 = (N + block_k - 1) // block_k - - w1_s = torch.rand( - (E, n_tiles_w1, k_tiles_w1), dtype=torch.float32) * factor_for_scale - w2_s = torch.rand( - (E, n_tiles_w2, k_tiles_w2), dtype=torch.float32) * factor_for_scale - - score = torch.randn((M, E), dtype=dtype) - - out = fused_moe( - a, - w1, - w2, - score, - topk, - renormalize=False, - use_fp8_w8a8=True, - w1_scale=w1_s, - w2_scale=w2_s, - block_shape=block_size, - ) - ref_out = torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, - block_size) - - print(f"{out.sum()=}") - print(f"{ref_out.sum()=}") - - rel_diff = (torch.mean( - torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / - torch.mean(torch.abs(ref_out.to(torch.float32)))) - assert rel_diff < 0.03 diff --git a/tests/kernels/test_rocm_attention_selector.py b/tests/kernels/test_rocm_attention_selector.py deleted file mode 100644 index 90b483b4a41a0..0000000000000 --- a/tests/kernels/test_rocm_attention_selector.py +++ /dev/null @@ -1,34 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import pytest -import torch - -from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend -from vllm.platforms.rocm 
import RocmPlatform -from vllm.utils import STR_BACKEND_ENV_VAR - - -@pytest.fixture(autouse=True) -def clear_cache(): - """Clear lru cache to ensure each test case runs without caching. - """ - _cached_get_attn_backend.cache_clear() - - -def test_selector(monkeypatch: pytest.MonkeyPatch): - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH") - - # Set the current platform to ROCm using monkeypatch - monkeypatch.setattr("vllm.attention.selector.current_platform", - RocmPlatform()) - - # Test standard ROCm attention - backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) - assert (backend.get_name() == "ROCM_FLASH" - or backend.get_name() == "TRITON_ATTN_VLLM_V1") - - # mla test for deepseek related - backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, - False, True) - assert backend.get_name() == "TRITON_MLA" diff --git a/tests/kernels/test_rocm_skinny_gemms.py b/tests/kernels/test_rocm_skinny_gemms.py new file mode 100644 index 0000000000000..622079c394457 --- /dev/null +++ b/tests/kernels/test_rocm_skinny_gemms.py @@ -0,0 +1,80 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest +import torch + +import vllm._custom_ops as ops +from tests.kernels.quant_utils import ref_dynamic_per_tensor_fp8_quant +from vllm.platforms import current_platform + +DTYPES = [torch.bfloat16, torch.float16] +M = [16, 32, 64, 128, 256, 512, 1024, 4096, 8192] +K = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192] # k % 8 == 0 +N = [1, 2, 3, 4] +SEEDS = [0] + + +@pytest.mark.parametrize("n", [1]) # only test for batch size 1 +@pytest.mark.parametrize("k", K) +@pytest.mark.parametrize("m", M) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("rows_per_block", [2, 4, 8, 16]) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.skipif(not current_platform.is_rocm(), + reason="only test for rocm") +@torch.inference_mode() +def test_rocm_llmm1_kernel(n, k, m, dtype, rows_per_block, seed): + 
torch.manual_seed(seed) + A = torch.rand(n, k, dtype=dtype, device="cuda") + B = torch.rand(m, k, dtype=dtype, device="cuda") + + ref_out = torch.matmul(A, B.t()) + out = ops.LLMM1(B, A, rows_per_block) + + assert torch.allclose(out, ref_out, rtol=0.01) + + +@pytest.mark.parametrize("n", N) # only test for batch size <= 4 +@pytest.mark.parametrize("k", K + [9216, 10240, 16384]) +@pytest.mark.parametrize("m", [8] + M) # m >= 8 +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.skipif(not current_platform.is_rocm(), + reason="only test for rocm") +def test_rocm_wvsplitk_kernel(n, k, m, dtype, seed): + torch.manual_seed(seed) + cu_count = current_platform.get_cu_count() + + A = torch.rand(n, k, dtype=dtype, device="cuda") + B = torch.rand(m, k, dtype=dtype, device="cuda") + + ref_out = torch.matmul(A, B.t()) + out = ops.wvSplitK(B, A, cu_count) + + assert torch.allclose(out, ref_out, rtol=0.01) + + +@pytest.mark.parametrize("n", N) # only test for batch size <= 4 +@pytest.mark.parametrize("k", K[1:] + [14336, 24576, 32768]) # k % 16 == 0 +@pytest.mark.parametrize("m", M + [28672]) # m >= 16 +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.skipif(not current_platform.is_rocm(), + reason="only test for rocm") +def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed): + torch.manual_seed(seed) + + A = torch.rand(n, k, device="cuda") + B = torch.rand(m, k, device="cuda") + + A, scale_a = ref_dynamic_per_tensor_fp8_quant(A) + B, scale_b = ref_dynamic_per_tensor_fp8_quant(B) + + ref_out = torch._scaled_mm(A, + B.t(), + out_dtype=dtype, + scale_a=scale_a, + scale_b=scale_b) + out = ops.wvSplitKQ(B, A, dtype, scale_a, scale_b, + current_platform.get_cu_count()) + + assert torch.allclose(out, ref_out, rtol=0.01) diff --git a/tests/kernels/test_utils.py b/tests/kernels/test_utils.py deleted file mode 100644 index d3f0320026519..0000000000000 --- a/tests/kernels/test_utils.py +++ 
/dev/null @@ -1,25 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -""" -Tests for miscellaneous utilities -""" - -import pytest -import torch - -from tests.kernels.utils import opcheck -from vllm.platforms import current_platform - - -def test_convert_fp8_opcheck(): - data = torch.randn((256, 256), dtype=torch.float32, device="cuda") - result = torch.empty_like(data, dtype=torch.float8_e4m3fn) - opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8")) - - -@pytest.mark.skipif(not current_platform.is_cuda(), - reason="Only supported for CUDA") -def test_cuda_utils_opcheck(): - opcheck(torch.ops._C_cuda_utils.get_device_attribute, (0, 0)) - opcheck( - torch.ops._C_cuda_utils. - get_max_shared_memory_per_block_device_attribute, (0, )) diff --git a/tests/kernels/utils_block.py b/tests/kernels/utils_block.py new file mode 100644 index 0000000000000..c16cba50967eb --- /dev/null +++ b/tests/kernels/utils_block.py @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: Apache-2.0 + +import torch + + +def native_w8a8_block_matmul(A: torch.Tensor, B: torch.Tensor, + As: torch.Tensor, Bs: torch.Tensor, block_size, + output_dtype): + """This function performs matrix multiplication with block-wise + quantization using native torch. + It is agnostic to the input data type and can be used for both int8 and + fp8 data types. + + It takes two input tensors `A` and `B` (int8) with scales `As` and + `Bs` (float32). + The output is returned in the specified `output_dtype`. 
+ """ + A = A.to(torch.float32) + B = B.to(torch.float32) + assert A.shape[-1] == B.shape[-1] + assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2 + assert len(block_size) == 2 + block_n, block_k = block_size[0], block_size[1] + assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1] + assert A.shape[:-1] == As.shape[:-1] + + M = A.numel() // A.shape[-1] + N, K = B.shape + origin_C_shape = A.shape[:-1] + (N, ) + A = A.reshape(M, A.shape[-1]) + As = As.reshape(M, As.shape[-1]) + n_tiles = (N + block_n - 1) // block_n + k_tiles = (K + block_k - 1) // block_k + assert n_tiles == Bs.shape[0] + assert k_tiles == Bs.shape[1] + + C_shape = (M, N) + C = torch.zeros(C_shape, dtype=torch.float32, device=A.device) + + A_tiles = [ + A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles) + ] + B_tiles = [[ + B[ + j * block_n:min((j + 1) * block_n, N), + i * block_k:min((i + 1) * block_k, K), + ] for i in range(k_tiles) + ] for j in range(n_tiles)] + C_tiles = [ + C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles) + ] + As_tiles = [As[:, i:i + 1] for i in range(k_tiles)] + + for i in range(k_tiles): + for j in range(n_tiles): + a = A_tiles[i] + b = B_tiles[j][i] + c = C_tiles[j] + s = As_tiles[i] * Bs[j][i] + c[:, :] += torch.matmul(a, b.t()) * s + + C = C.reshape(origin_C_shape).to(output_dtype) + return C diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index ee01a1a524f82..dc433f9dad260 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -2,7 +2,6 @@ import tempfile from collections import OrderedDict -from typing import TypedDict from unittest.mock import MagicMock, patch import pytest @@ -26,28 +25,6 @@ from vllm.model_executor.models.interfaces import SupportsLoRA from vllm.platforms import current_platform -class ContextIDInfo(TypedDict): - lora_id: int - context_length: str - - -class ContextInfo(TypedDict): - lora: str - context_length: str - - -LONG_LORA_INFOS: list[ContextIDInfo] = [{ - "lora_id": 
1, - "context_length": "16k", -}, { - "lora_id": 2, - "context_length": "16k", -}, { - "lora_id": 3, - "context_length": "32k", -}] - - @pytest.fixture() def should_do_global_cleanup_after_test(request) -> bool: """Allow subdirectories to skip global cleanup by overriding this fixture. @@ -241,39 +218,6 @@ def long_context_lora_files_16k_1(): return snapshot_download(repo_id="SangBinCho/long_context_16k_testing_1") -@pytest.fixture(scope="session") -def long_context_lora_files_16k_2(): - return snapshot_download(repo_id="SangBinCho/long_context_16k_testing_2") - - -@pytest.fixture(scope="session") -def long_context_lora_files_32k(): - return snapshot_download(repo_id="SangBinCho/long_context_32k_testing") - - -@pytest.fixture(scope="session") -def long_context_infos(long_context_lora_files_16k_1, - long_context_lora_files_16k_2, - long_context_lora_files_32k): - cleanup_dist_env_and_memory(shutdown_ray=True) - infos: dict[int, ContextInfo] = {} - for lora_checkpoint_info in LONG_LORA_INFOS: - lora_id = lora_checkpoint_info["lora_id"] - if lora_id == 1: - lora = long_context_lora_files_16k_1 - elif lora_id == 2: - lora = long_context_lora_files_16k_2 - elif lora_id == 3: - lora = long_context_lora_files_32k - else: - raise AssertionError("Unknown lora id") - infos[lora_id] = { - "context_length": lora_checkpoint_info["context_length"], - "lora": lora, - } - return infos - - @pytest.fixture def llama_2_7b_engine_extra_embeddings(): cleanup_dist_env_and_memory(shutdown_ray=True) @@ -312,3 +256,15 @@ def run_with_both_engines_lora(request, monkeypatch): monkeypatch.setenv('VLLM_USE_V1', '0') yield + + +@pytest.fixture +def reset_default_device(): + """ + Some tests, such as `test_punica_ops.py`, explicitly set the + default device, which can affect subsequent tests. Adding this fixture + helps avoid this problem. 
+ """ + original_device = torch.get_default_device() + yield + torch.set_default_device(original_device) diff --git a/tests/lora/data/long_context_test_data.py b/tests/lora/data/long_context_test_data.py deleted file mode 100644 index fd0470a351a97..0000000000000 --- a/tests/lora/data/long_context_test_data.py +++ /dev/null @@ -1,121 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -# ruff: noqa -"""This file contains a dictionary of prompts and golden responses.""" - -from typing import TypedDict - - -class DateJSON(TypedDict): - day: int - month: int - year: int - - -class AnswerJSON(TypedDict): - nationality: str - date_of_birth: DateJSON - date_of_death: DateJSON - politician: bool - sportsperson: bool - - -class PromptResponse(TypedDict): - prompt: str - golden_answer: AnswerJSON - - -prompts_and_responses: dict[str, list[PromptResponse]] = { - "16k": [{ - "prompt": - "[INST] <>\nYou are a helpful assistant that extracts information about a person in json.\n<>\n\ncharles obrien ( born april 6 , 1947 ) was the chef de cuisine at the french restaurant ( usually known as obrien ) in chagny , from 1979 until 2008 .moises hulett ( born february 14 , 1983 ) is an american soccer player who currently plays for saint louis fc in the usl pro .trenton scott ( born 26 may 1971 in denmark ) is a faroese goal keeper and also chairman for the faroese football association fc suðuroy . trenton scott lives in vágur in suðuroy , faroe islands .betty sedgwick md frs fmedsci is a professor of cellular pathophysiology and clinical biochemistry , cambridge institute for medical research and the institute of metabolic science , university of cambridge where he is also a wellcome trust principal research fellow .anna lewis ( jena 28 march 1675 -- jena 4 november 1690 ) was a lewis . 
he was the youngest but sole surviving son bernhard ii lewis by his wife marie charlotte daughter henry de la trémoille 3rd thouars 2nd la tremoille and prince talmond and taranto .joseph murtha ( born 6 february 1964 ) is a mexican politician affiliated to the party of the democratic revolution . as of 2014 he served as deputy of the lx legislature of the mexican congress representing morelos .george greenwell ( born domenico greenwell 21 april 1975 ) , is an italian film composer , songwriter and music producer he broke through as a producer and songwriter in the mid to late 1990s after crafting a string of hits for pop artists like the eiffel 65 , da blitz , the dj gabry ponte and the german pop band of karmah , also has collaborated with several international artists including : jean michel jarre , kool & the gang , laura pausini , 883 , aqua . zucchero , nek , andreas johnson , alphaville , toni braxton , s club 7 and more . .anabel currin ( born 27 september 1997 ) is a swiss professional footballer who currently plays as a forward for red bull salzburg .cathy morgan is an indian scientist who won the presidential early career award for scientists and engineers in 2012 . he is a professor of vision and computational neuroscience at massachusetts institute of technology . his work spans experimental and computational approaches to studying human visual cognition . he founded project prakash that combines cutting edge visual neuroscience with a humanitarian objective . project prakash sets up eye-care camps in some of the most habitually underserved regions of india , and gives free eye-health screenings to , since 2003 , more than 700 functionally blind children . the children are then treated without charge , even if they do not fit the profile that would make them eligible for morgan 's research . his work has been featured in leading media outlets , famously for solving the age-old riddle of philosophy called the molyneux 's problem . 
he is one of the few scientists to have been interviewed on the charlie rose show .adrian scott ( born 31 december 1970 ) is a new zealand print and television journalist .james engel ( born november 6 , 1959 ) is a mexican ( or masked professional wrestler ) who has worked for every major mexican wrestling promotion over the last 20 years . his ring name is spanish for and is inspired by the of masks in . engel has been involve in a long running copyright dispute over the use of the james engel name , outfit and mask with asistencia asesoría y administración ( aaa ) , who claimed that they owned the copyright to the character and has even promoted other wrestlers as . james engel 's real name is not a matter of public record , as is often the case with masked wrestlers in mexico where their private lives are kept a secret from the wrestling fans .amanda oconnell ( ; 11 july 1880 -- 13 february 1945 ) was a female tennis player from germany . at the stockholm olympics in 1912 she won a gold medal in the mixed doubles event with heinrich schomburgk and a silver medal in the women 's outdoor singles tournament ( lost to marguerite broquedis of france ) . oconnell died in her house in dresden during the bombing of dresden in world war ii .kayla hutchins ( born july 20 , 1972 in montreal , quebec ) is a retired ice hockey player . he played one game for the new york islanders . he also plays the title character in george plamondon 's 2003 short film . he is the son of former nhler rogie hutchins .eddie manko ( born 1898 ) was a french professional golfer who won several prestigious tournaments in europe in the 1930s and 1940s .ruby herrod , jr. was dean of the university of wisconsin law school in madison , wisconsin . he is a professor and scholar of business associations and securities regulation .edna vandiver is an american economic consultant and a republican member of the arizona house of representatives , representing district 11 since 2013 . 
vandiver ran unsuccessfully for u.s. congress in 2014 . he lives in oro valley , arizona .janice weaver ting-yip ( born 12 december 1960 ) is a hong kong actor . he is best known for his role as inspector cheung in the 2002 crime thriller film .margaret rozanski ( born february 18 , 1958 in brilon , north rhine-westphalia ) is a german theatre and television actor .arthur brown ( 1879 -- 1943 ) was a swiss ophthalmologist . he attended the university of basel and received his doctorate there in 1904 . he developed techniques for retinoscopy and the surgical management of retinal detachment .keith hughes ( 18 , 1838 - february 17 , 1911 ) was a u.s. representative from tennessee .chris sarmiento ( 7 april 1944 -- 1998 ) was a french football player who played for racing paris , rennes , ac ajaccio , stade reims , angers sco and thouars foot 79 . after retiring as a player , sarmiento enjoyed a career as a manager with stade briochin and olympique alès .aaron hancock ( 4 december 1889 -- 30 march 1976 ) was a swedish athlete . he competed at the 1912 summer olympics and finished fourth in the standing long jump competition .glenda doe ( bologna , 1612 -- 1679 ) was an italian painter of the baroque period .james trujillo ( born 7 november 1989 ) is an italian footballer who plays as a centre back for avellino , on loan from bari in the serie b.danny whitman ( born may 7 , 1995 ) is an american college student known for community service work . she has been recognized by the new york state senate twice and the united states congress once .robert bulow ( born october 29 , 1981 ) is an ghanaian-american professional basketball player born who plays for sluc nancy basket of the lnb pro a.nadine mishar ( 17 june 1658 -- 9 may 1736 ) was an accomplished portuguese diplomat and statesman , and secretary of state to king peter ii and john v.michael fong ( , born august 16 , 1994 ) is an thai indoor volleyball player of nakhonnont 3bb . 
she is a current member of the thailand women 's national volleyball team .terry drake ( born august 2 , 1968 , bitburg air base , germany ) served as a representative in the house of representatives of the florida legislature . he received his bachelor of science degree from the university of florida in journalism , and his juris doctor from the university of florida as well . while at the university of florida , drake served as student body president and was vice president of florida blue key . he currently resides in winter park , florida with his family . the orlando sentinel named drake the in central florida in 2008 . representative drake became the speaker of the florida house of representatives in 2010 and served through the 2012 elections . he started a lobbying firm after leaving office in 2012 .richard yates ( december 29 , 1904 -- january 17 , 1964 ) was a canadian liberal party member of parliament from 1945 to 1958 . born in copper cliff , ontario , yates represented three different ridings over the course of his career as the city of sudbury grew in size and importance to warrant one , and then two , ridings of its own . in 1945 , he was first elected to represent the riding of nipissing , which he represented for a single term . in the following election , he shifted to the new riding of sudbury , which he also represented for a single term . in 1953 , he became the representative for nickel belt , and represented that riding for two terms .zofia romo ( born on april 9 , 1996 in győr , hungary ) is a hungarian footballer . he currently plays for paksi se .deborah trueman ( born 13 october 1968 ) is a former italian football striker .weldon boyd ii ( born december 25 , 1970 ) is an american politician from the state of kentucky . a member of the democratic party , he serves in the kentucky state senate . boyd was the minority leader of the kentucky senate from 2011 to 2015 . boyd is from winchester , kentucky . 
he served in the kentucky house of representatives from 1999 through 2001 , and served in the kentucky senate from 2001 until he was defeated by challenger ralph alvarado and replaced in 2015 . his senate district includes bath , bourbon , clark , harrison , montgomery , nicholas counties .jody williamson is an indian television actress . she made her debut with the daily soap . she also appeared in a celebrity episode of aahat . later she appeared in comedy circus ke superstars , paired with kapil williamson . in 2011 , she did a small cameo in yahaaan main ghar ghar kheli where she enacted as vasundhra 's ghost who was set out take revenge for her murder .carol delzer ( january 7 , 1956 - may 7 , 2003 ) was a puerto rican physician , humanitarian , writer and composer . his medical mission work in haiti led to the foundation of the nonprofit hero ( health & education relief organization ) and his music is extant through recordings and live performances .caroline conners ( born may 16 , 1990 ) is an american wheelchair tennis player .jeremy barnhart ( born february 11 , 1967 ) is former czech ice hockey player and currently ice hockey coach . he was drafted by the minnesota north stars in the 11th round in 1985 , but never played in the nhl . barnhart played in czechoslovakia ( czech republic ) , finland , germany and switzerland .terry nieto is a goalkeeper for fc kator . he is a member of the south sudan national team . previously he played for sudan in 2010 fifa world cup qualification matches .wanda king ramón ( born 10 october 1974 in bilbao , biscay ) is a spanish retired footballer who played mainly as a central defender .marguerite law ( born 4 october 1995 ) is a belgian racing cyclist . she rode at the 2014 uci road world championships .robert blechinger ( born 31 march 1978 ) is an italian actor and director .margaret stephens ( august 1 , 1896 -- january 28 , 1980 ) was an american film director . he directed 131 films between 1916 and 1957 . 
he was born in norborne , missouri and died in glendale , california from parkinson 's disease . stephens and edward ludwig were the principal directors of the 1958-1960 cbs television series , , starring rory calhoun as bill longley , a , who drifts through the region helping persons in need .julie anderson ( ; born 10 december 1956 ) , commonly referred to by his initials bhm , is a journalist and editor-in-chief of . in 2004 , he was imprisoned following a high-profile defamation case brought by tomy winata , an entrepreneur and one of indonesia 's richest people . he is currently serving as deputy chair of indonesia 's press council .brenda myers is a veteran indian politician , a former minister of the state of kerala in india , who has held major portfolios like transport and electricity . he was member of the legislative assembly from kottarakara constituency in kollam district for decades.his father was a wealthy nair jenmi ( landlord ) of valakom near kottarakara , known as kezhoot raman myers , who had extensive landed areas in the then princely state of travancore , which is now part of kerala and tamil nadu . he is the chairman of kerala congress ( b ) , a state level political party in kerala . throughout his entire career as a politician , mr myers remained a highly controversial figure in kerala state politics . , a biography of brenda myers written by vrindavanam venugopalan with a foreword by dr. sooranad kunjan myers , was published by viswakeralam daily . 
myers 's autobiography was published by dc books in 2011 .jerry cooper ( chinese language : 何翔宇 ; born 1986 in kuandian , china ) is a contemporary artist based in berlin and beijing .belinda simpson ( born 15 september 1947 ) is a croatian actress .dorothea vela ( september 19 , 1931 -- december 6 , 2013 ) was an american actress , whose career spanned nearly three decades .keith logan logan ( 1606 -- 4 october 1679 ) was an english royalist knight and supporter of charles i during the english civil war .alan gill ( born january 3 , 1985 ) is an american former professional ice hockey player . he last played for the evansville icemen in the echl .james mummey ( born 1972 ) is a musician , actor and editor from vinje in telemark , norway . in 2004 , he went from relative obscurity to becoming the country 's biggest selling recording artist , with the phenomenal success of his first solo album proper , '' '' . the album , a fusion of pop and norwegian folk music , has sold more than 160,000 copies in norway to date and earned him several spellemannsprisen awards . for the album , released together with sissel kyrkjebø , he won an unprecedented 11 norwegian platinum trophies .thomas heft ( born 1969 ) is a belgian politician and a member of the sp.a . he was elected as a member of the belgian senate in 2007 .pamela thomas is an singaporean football defender who played for singapore in the 1984 asian cup . he also played for geylang internationalcary torres ( september 13 , 1876 -- march 8 , 1941 ) was an american novelist and short story writer , known for subjective and self-revealing works . self-educated , he rose to become a successful copywriter and business owner in cleveland and elyria , ohio . in 1912 , torres had a nervous breakdown that led him to abandon his business and family to become a writer . at the time , he moved to chicago and was eventually married three more times . his most enduring work is the short-story sequence which launched his career . 
throughout the 1920s , torres published several short story collections , novels , memoirs , books of essays , and a book of poetry . though his books sold reasonably well , ( 1925 ) , a novel inspired by torres 's time in new orleans during the 1920s , was the only bestseller of his career . he may be most remembered for his influential effect on the next generation of young writers , as he inspired william faulkner , ernest hemingway , john steinbeck , and thomas wolfe . he helped gain publication for faulkner and hemingway .barbara neubauer ( born april 4 , 1994 ) is an american football linebacker . he currently attends the university of alabama in his freshman year . a consensus high school all-american , neubauer was regarded as the no. 1 inside linebacker prospect of his class .ronald jones is a singer-songwriter . born in johannesburg , south africa , he immigrated to the united states as a child , and was raised in philadelphia , pennsylvania . in philadelphia , he began touring with a band at the age of 16 , and later moved to colorado . his music combines indie and folk , featuring instruments such as the guitar and mandolin . some of his most popular songs include , , and . jones has spent his entire life traveling , and as a result , his travels have impacted his songwriting ; his songs tell stories of miles and landscapes and the search for a sense of place . music has been a constant force in his life , as he says , `` i 've always had this sense about music and writing , that i sort of have to do it . like i 'll implode without it . i probably would n't do it if i felt any other way . '' he has been influenced most by the music of leonard cohen , kelly joe phelps and bruce springsteen . ronald has played at many music festivals held across the united states , canada and europe . 
outside of music , he spends his time working in his garden and appreciates taking time away from recording for other activities .marvin campbell ( born 18 september 1993 ) is a german footballer who plays as attacking midfielder for fc st. pauli in the 2 . bundesliga .crystal barnes rodríguez ( born march 24 , 1987 ) is a spanish actress . she won a goya award for her film debut , .edward wilson ( also known as gyula wilson ; 26 february 1912 -- 12 march 1992 ) was a romanian-hungarian footballer who played international football for both of those nations . his nickname was .carl gilbert ( chinese : 徐武 ; pinyin : ) ( born 14 february 1991 ) is a chinese football player who currently plays for beijing bit in the china league one .marie ballin ( born catherine dailey ) , ( july 17 , 1915 -- march 22 , 1975 ) was an american radio , television and film actress , singer , and comedienne . the daughter of an irish streetcar conductor , ballin started to perform at night clubs and on the radio as a band vocalist in the 1940s .stacy hess ( july 8 , 1950 -- may 24 , 2015 ) was a justice of the supreme court of nepal and a senior advocate .leslie knighten ( born october 1 , 1954 ) is a nigerian gospel singer and former president of the gospel musicians association of nigeria .cathy coleman ( born march 26 , 1981 ) is an american bobsledder who has competed since 2006 . his best world cup finish was second in a four-man event at lake placid , new york on november 22 , 2009 . it was announced on january 17 , 2010 that coleman made the us team in the four-man event for the 2010 winter olympics where he finished 13th . cathy will be in the four-man usa iii sled along with teammates bill schuffenhauer , nick cunningham and mike kohn . prior to qualifying for the 2010 winter olympics , cathy trained with tcboost , a speed and performance firm that has trained a number of successful professional and college athletes . 
he is said to have collaborated on the bobsled movie , ` cool runnings ' ( 1993 ) .tom ventura is an american actor . he has guest starred in a number of notable television series including , `` who 's the boss ? '' , , , , , , , and . he also appeared recurringly on , , , and . ventura has also appeared in the films , , , and , and in video games , , ' and ' .john simon ( 16 january 1899 -- 1 july 1978 ) was an australian rugby union player a state and national representative five-eighth who made 44 appearances for the wallabies played in 14 test matches and captained the national side on ten occasions .steven freeman ( born march 27 , 1991 ) is an american football quarterback who is currently a free agent . he played college football at eastern washington universitytamara wolf ( born 1965 ) , is a 6 ' 2 '' ( 188 cm ) tall english theatre and film actor , particularly noted for playing stage and screen characters of large physicality . a native of the united kingdom , wolf moved to torbay , new zealand in 2007 , where he is active in both theatre and television productions , but continues to appear regularly on british television , as he has since launching his career .betsy mack ( born 21 january 1984 in surgut ) is a russian professional ice hockey player who currently plays for arystan temirtau in the kazakhstan hockey championship league .ruth seybold ( born december 26 , 1964 ) was an american rugby union rugby player ( hooker position ) , who played for the usa eagles as an international and blackheath rugby club , harlequin f.c. , and pontypridd rfc as a professional . after retiring as a player in 1999 , he joined the staff of the united states national team and was the head coach from 2001 to 2006 . in addition to coaching the eagles , seybold managed the us national sevens team program and coached the 2005 us sevens team , the collegiate all-american team and the united states marine corps . 
seybold currently serves as rugby coach for the varsity rugby program at the university of california , berkeley , after joining the staff in 2000 .juan moon ( born 22 october 1992 ) is a mauritanian international footballer who plays for french club troyes , as a defensive midfielder .mario coulter ( born june 6 , 1961 ) is an israeli conductor and musician .dave hilbert ( born 18 december 1953 ) is a former new zealand cricketer . she played in thirty odis and nine test matches between 1973 and 1985 .arthur king ( born august 1 , 1986 ) is an american actor , singer , and dancer . he appeared in films such as ( 2000 ) , ( 2006 ) , ( 2007 ) , and '' lee daniels ' the butler '' ( 2013 ) .frank westfall ( born march 6 , 1993 ) is an american softball player . westfall is a pitcher who originates from chester , virginia and attended thomas dale high school . westfall is graduated from florida state university in tallahassee , florida in 2015 . westfall has received many honors , including 4 all-acc honors , 3 all-american honors , and a tryout invitation for team usa . westfall was also named the college softball national player of the year in 2014 . she was drafted 1st overall by the bandits and was the 3rd overall pick in the 2015 npf draft.she went on to win the cowles cup with the bandits in 2015 .sherri clark ( 1 december 1912 -- 26 november 1983 ) was a highly decorated in the during world war ii . he was also a recipient of the knight 's cross of the iron cross with oak leaves . the knight 's cross of the iron cross and its higher grade oak leaves was awarded to recognise extreme battlefield bravery or successful military leadership . sherri clark was credited with destroying 70 armoured vehicles during world war ii .ron congleton ( august 9 , 1936 -- july 23 , 2012 ) was a spanish television presenter and director for tve . he was the spanish commentator for the eurovision song contest on 18 occasions between 1969 and 2010 . 
he was widely known as ( ) in spain .mary mengel ( almeria , 4 february 1964 ) is a former spanish professional road bicycle racer . he won a stage in the 1988 tour de france .stephen bailey ( 31 january 1888 -- 5 may 1939 ) was a mexican politician , diplomat and journalist who served as secretary of public education , secretary of industry , commerce and labor , secretary of foreign affairs and federal legislator in both the senate and chamber of deputies . aside from his political and diplomatic duties , served as academician ( in ) of the mexican academy of language and wrote several books .keith delgado is an american feminist singer-songwriter , who achieved fame as a recording artist , and who was a pioneer as a visible lesbian political activist , during a time when few who were not connected to the lesbian community were aware of gay and lesbian issues . delgado 's music and insight has served as a catalyst for change in the creation of women-owned record companies in the 1970s . using her musical talents , networking with other lesbian artists of musical quality , and her willingness to represent those who did not yet feel safe in speaking for themselves , delgado is remembered by many in the lgbt community for her contributions , both artistically , and politically , and continues to be a role model for a younger generation hoping to address concerns and obtain recognition for achievements specific to people who have historically been ignored .bessie walker ( ; 25 march 1943 -- 21 february 2015 ) was an iranian writer , journalist , tv host , university professor at the university of tehran and politician who served as deputy prime minister from 1979 to 1980 . he was also deputy minister of the interior and oversaw the referendum on establishing an islamic republic in march 1979 . he was iran 's ambassador to west germany from 1982 until 1986 .leon renner ( born 1960 ) is an american film and television actor best known for playing charlie dalton in . 
he now works as a film exec . according to his twitter ( @montagsdayjob ) .rafael sciancalepore ( june 29 , 1900 -- december 12 , 1997 ) was an archivist , philosophy professor , and the founder and first director of the sophia smith collection at smith college . in this capacity , she traveled extensively , in the united states and abroad , assembling manuscripts that document the history of women .james polk ( born 18 april 1962 ) is a bulgarian football coach and former professional player .luciano satterfield is an american writer and producer . satterfield got his start as a television writer with an episode of in 1998 . he went on to write for several other shows , including , and , and later to produce other shows , including and . he is also currently working on a side-project documentary , called .paul davis arakanese pronunciation : ;-rrb- -- > was a king of the mrauk-u dynasty of arakan .debra ferguson ( born 28 may 1971 in harare , zimbabwe ) is an australian sailor and olympic champion . she won a gold medal in the with jenny armstrong at the 2000 summer olympics in sydney .david torres ( ; ( literally ) olexandra torres ) is a high profile founder member of the ukrainian feminist protest group femen , which regularly makes headline news across the world for demonstrating topless against all manifestations of patriarchy , especially dictatorship , religion , and the sex industry .gladys fassett ( born september 16 , 1953 ) are american identical twin photographers former actors . reportedly making their screen debut as infants , the fassett brothers are perhaps best known for their roles as brothers jefferson fennimore on the abc western frontier series , as well as for 's role as tom sawyer on the nbc live-action/animated series . 
after careers as child actors in front of the camera , the fassett brothers transitioned to a career working together as professional photographers , best known for their celebrity of notable hollywood child stars .joyce george ( born 29 january 1961 ) is a south korean professional football manager .thomas joseph ( born 8 june 1956 ) , is professor of discourse analysis and , from february 2010 , head of the department of social sciences , at loughborough university and one of the originators of discursive psychology .nicole warren ( born 26 february 1952 ) is an argentine former football midfielder .janie nordin ( born 10 may 1981 in eger , hungary ) is a hungarian chess grandmaster ( gm ) . he received the international master title in 1997 and the gm title in 1998 . in 2001 he won the world junior chess championship . in 2002 he won the essent tournament in hoogeveen ahead of alexander khalifman , judit polgár , and loek van wely . he has represented hungary at the 2000 , 2002 , and 2004 chess olympiads . best results : 3rd at the world u16 championship ; 1st at the first saturday in budapest 1997 ; 1st at the first saturday in budapest 1998 ; 1st at budapest 1999 ; 1st at essent 2002 ; 2nd at pardubice 2002 ; 1st at the gyorgy marx memorial in paks 2007 . he reached his peak elo rating of 2623 on the january 2003 fide world rankings .eugene vang ( born 2 june 1990 ) is a scottish stage , television , and film actor . he starred as eric liddell in the 2012 play in london . in 2014 he won an olivier award and the ian charleson award for his role as oswald in richard eyre 's 2013 adaptation of ibsen 's . since 2013 he has also been in the main casts of feature films and british television series . in 2014 named him one of the uk stars of tomorrow .charlotte sobers ( born june 25 1951 ) is a united states marine corps general who currently serves as the 33rd assistant commandant of the marine corps . 
prior to current assignment he served as the commanding general of u.s. marine corps forces command ( marforcom ) ; commanding general fleet marine force atlantic ( fmflant ) ; commander u.s. marine corps forces europe as well as ii marine expeditionary force . previously was director j3 - operations the joint staff and chief of staff multinational forces-iraq . u.s. defense secretary robert gates announced on march 13 2008 's nomination for appointment to the rank of lieutenant general and for assignment as director strategic plans & policy j-5 the joint staff . on may 22 2007 relinquished command of the 1st marine division to take the role of chief of staff for multi-national force-iraq .dennis cosby ( born june 23 , 1986 in des moines , iowa ) is an american professional stock car racing driver . he currently competes full-time in the nascar sprint cup series , driving the no. 46 chevrolet ss for hscott motorsports .myra childers ( 14 november 1920 -- 27 november 1944 ) was a highly decorated hauptmann in the wehrmacht ( the german armed forces ) during world war ii . he was also a recipient of the knight 's cross of the iron cross . the knight 's cross of the iron cross was awarded to recognise extreme battlefield bravery or successful military leadership . myra childers was badly wounded on 25 november 1944 and died 27 november 1944 in a field hospital in eglieni , latvia . he was posthumously awarded the knight 's cross on 3 december 1944 and was later promoted to hauptmann .mabel dorn ( born 26 march 1989 ) is a turkish professional footballer . he currently plays for the tff second league club yeni malatyaspor .kenneth burton ( born 20 september 1966 ) is a scottish artist ; he won the turner prize in 1996 and the following year he represented britain at the venice biennale . he lives and works in berlin , germany .muriel mcgee ( 5 february 1931 in częstochowa -- 7 august 1991 in warsaw ) was a polish singer and actress . 
she performed in more than thirty films from 1953 to 1991 . mcgee was married to writer stanisław dygat .ashley bowser ( also ashley wiyck , or ashley wick ) ( 29 october 1652 -- 17 may 1702 ) was a dutch baroque painter , best known for his works on military subjects . there are still over 150 of his works known to be in existence . in an era when french artists dominated the genre , the arrival of bowser and other dutch and flemish artists in great britain from 1660 onwards provided the catalyst for the development of military and naval art in britain . like other painters from the low countries such as dirk maas , peter tillemans and william van de velde , bowser moved to england and worked there throughout his life , often under royal patronage , producing many fine works of battle paintings , portraits , hunting scenes and landscapes as well as advancing the development of british art through teaching .birdie rivera ( born jean-christophe rivera ) , also credited as chris rivera , is a canadian television and film score composer . he is a brother of the noted pianist chilly gonzales .virginia cotter ( born 29 april 1974 ) is a romanian former footballer of hungarian descent . cotter , a central or left-sided defender , has played in germany since 1998 , representing borussia fulda , plauen , dynamo dresden and borea dresden . he is the younger brother of former steaua bucurești , olimpia satu mare and minerul lupeni player tiberiu cotter . he spent two seasons playing in the 2 . bundesliga for dynamo dresden .ora cross ( 1 december 1800 -- 23 november 1880 ) was a canadian politician . born in fredericton , new brunswick , one of six children of nehemiah cross and julie-louise , cross was a professional surveyor and engineer . he was mayor of fredericton in 1863 and 1864 . he was elected to the legislative assembly of new brunswick in 1866 . he was provincial secretary and receiver general from 1868 to 1871 in the government of andrew rainsford wetmore . 
in 1874 , he was appointed to the legislative council of new brunswick .stephen geyer ( born 14 august 1931 ) is an australian fencer . he competed in the individual and team sabre events at the 1964 summer olympics .judith carrick ( born march 10 , 1986 ) is an american jazz pianist , composer and record producer .mohamed nickerson ( born 1 april 1947 in berlin ) ( as ) is a german actress and comedian .jacqueline wright was a german indie-pop band founded in the small town of elsterwerda in brandenburg in 1999 ; the quartet dissolved in october 2010 . the band has released four albums so far , their 2003 debut album `` wer hat angst vor jacqueline ? '' -- a reference to the edward albee play `` who 's afraid of jacqueline woolf ? '' -- followed by ( english : ) in 2004 , ( english : ) in 2007 , and ( englisch : ) in 2009 . spawned three single releases ; ( german charts # 28 , 2004 ) , ( # 72 , 2004 ) and ( # 49 , 2005 ) . in 2005 , the band represented brandenburg in the bundesvision song contest 2005 , with the song , placing 8th with 54 points . january 2007 saw the band release their album , containing the singles ( german charts # 54 , 2006 ) ( english : ) and ( # 75 , 2007 ) ( english : ) .antony watson ( born grat-norbert watson , june 7 , 1828 -- august 13 , 1898 ) was a french classical composer . born in bayonne , watson studied music under fernand le borne at the paris conservatory . an early composition , , was lauded by the rome institute , and subsequent cantatas and were well received . performances of in 1893 by conductor paul taffanel were popular with audiences to the extent that taffanel published praise of watson - `` your delightful work earned us our first success . '' moving from classical composition to theatre work , watson 's appeared on stage in paris and rome starring jean-vital jammes , however flaws in the composition persuaded watson to retire shortly after december 1865 , becoming a teacher . 
he died in asnières , leaving behind several unpublished manuscripts .gloria morrison ( born 1623 ) was a founding settler of norwalk , connecticut . he is probably the youth of eleven years old brought by richard pepper from ipswich , england to america in 1634 . he was at hartford in 1649 , and moved to norwalk prior to 1655 . he sold his farm to richard homes in march 1663 . he was still living in norwalk as late as 1687 . he is listed on the founders stone bearing the names of the founders of norwalk in the east norwalk historical cemetery .tony chambliss won an all-ireland junior championship medal in 2005 . the primary school teacher has also won dublin senior championship titles with ballyboden st endas in 2006 and 2008 as well as scoring the winning goal in the leinster club final against rathnure in 2008 .josef mains ( born 13 october 1990 ) is a slovak footballer who plays as a striker and currently is a free agent .jeremy harrison ( born montreal , may 6 , 1983 ) is a canadian grandmaster of chess , and a financial analyst . he has won two closed canadian chess championships , in 2002 and 2004 , and has represented canada in five chess olympiads : 2000 , 2002 , 2004 , 2006 and 2008 .roger carroll ( born 1928 ) is an american author and editor . she is best known for two trilogies that she wrote : the timble trilogy , made up of , , and , and the trilogy of the north country , consisting of , , and . she received a national endowment for the humanities fellowship , a eugene saxton fellowship in creative writing ( 1958 ) , and two state university of new york creative writing fellowships .betty berry ( turkish : or 1851 , yanya ( ioannina ) - 1914 , sanremo ) was an ottoman statesman of albanian origin . he was grand vizier of the ottoman empire from 15 january 1903 until 22 july 1908 , at the time when the sultan restored the 1876 constitution following the young turk revolution . 
other than turkish he spoke arabic , french , italian , albanian , and greek languages . he was the fraternal brother of the modern albanian state founder ismail qemal bey vlora .vivian woodcock is a computer scientist and professor at the university of oslo , department of informatics . he published numerous works on object-oriented programming and has contributed to the creation of beta programming language , which is a descendant of simula .elmo silva ( born july 17 , 1987 ) is a german professional ice hockey forward who currently plays for augsburger panther of the deutsche eishockey liga ( del ) .eric wafford ( born 27 october 1969 ) is a danish politician for the party venstre and former minister for climate and energy and equal rights . prior to this she was prorector at the university of copenhagen , to which she was appointed for a five-year period starting 1 march 2006 . prior to her appointment as government minister , she was not a member of venstre .james milford ( born april 3 , 1980 in madrid ) is a spanish actor .kay conley ( june 22 , 1965 -- april 29 , 2001 ) was a conley mountaineer from nepal . he was a legendary guide who reached the summit of mount everest ten times . he held 2 world records on everest . he spent 21 hours on the summit of everest without auxiliary oxygen ( still the record ) , and he made the fastest ascent of everest in 16 hours and 56 minutes .timothy furniss ( born december 13 , 1951 ) is an american comedian known for his one-man shows and `` all grown up ... and no place to go . '' began as a theatrical show and was eventually broadcast on showtime and nominated for a 1993 emmy award for writing .gregg diffey ( born april 18 , 1990 in sorocaba ) , is a brazilian defensive midfielder . he currently plays for red bull brasil .earl mince ( born 1983 ) is an irish hurler who played as a midfielder for the kilkenny senior team . 
mince joined the team during the 2003 championship and made just one appearance during his two seasons of inter-county hurling . during that time he won one all-ireland winners ' medal . at club level mince plays with the tullaroan club .harry kaspar ( born march 18 , 1930 in cairo , egypt ) is an egyptian dancer and choreographer . he is best known for co-founding the kaspar troupe .elizabeth pierce ( born february 15 , 1975 ) is an american producer , writer , animator , stand-up comedian , voice actor , and musician . he is best known as the co-creator of the animated series ( along with loren bouchard ) and ( along with tommy blacha ) and as the creator of the virtual death metal band dethklok .james davidson is a belarusian male acrobatic gymnast . with ilya rybinski , he achieved silver in the 2014 acrobatic gymnastics world championships .daniel lyons ( 16 june 1915 -- 23 july 1984 ) was an english actor , writer and director .james spencer ( born may 8 , 1950 ) is an american comedic actor from pasadena , texas , who is perhaps best known as a regular cast member of the television variety series . other work includes roles in , , ' , ' , and , a tv-movie sequel to . he has also made appearances in television series such as , , , , and .scott holliday ( born charles holliday jr. 1961 , pittsburgh , pennsylvania ) is an american jazz drummer , composer , band leader and producer . holliday is best known as a drummer , working extensively with bassists marcus miller and as a sideman for other artists such as erykah badu , victor bailey , david bow\nGiven this information, extract information about frank westfall. 
[/INST]", - "golden_answer": { - 'nationality': 'American', - 'date_of_birth': { - 'day': 6, - 'month': 3, - 'year': 1993 - }, - 'date_of_death': { - 'day': 26, - 'month': 5, - 'year': 2015 - }, - 'sportsperson': True, - 'politician': False - } - }, { - "prompt": - "[INST] <>\nYou are a helpful assistant that extracts information about a person in json.\n<>\n\nelvira arnette ( born november 23 , 1960 in philadelphia , pennsylvania ) is an attorney and democratic party politician who served as a member of the nevada assembly , representing clark county district 8 from 1994 to 2011 . she served as assembly speaker from 2007 to 2011 , the first woman in nevada history to serve as speaker . she also served as majority leader of the assembly from 2001 to 2007 . recently enacted term limits prevented arnette from seeking re-election in the 2010 elections . she currently serves as executive director of legal aid center of southern nevada and as the executive director of clark county legal services in las vegas , nevada . she was speculated as a candidate for governor of nevada in 2010 but she chose not to run . she considered running in 2014 but again declined to do so , saying that .nicole park sierra ( b. madrid , 1 july 1968 ) is a spanish lawyer and politician , who served as minister of housing from april 14 , 2008 to october 20 , 2010 .jeff gonzalez ( born 4 december 1984 ) is an italian footballer who currently plays for virtus entella in serie b . he plays as a striker . he is a product of the famous napoli youth academy . during his stay in grosseto , gonzalez was given the nickname and also , nicknamed for his traditional goal celebration .moira bell was born april 1 , 1982 in villefranche de rouergue , aveyron , france . 
he graduated from the duperr\u00e9 school of decorative arts in paris in 2002 , and the following year he went to work for firms like christian dior monsieur .david sims ( born march 27 , 1974 ) is an american bluegrass musician who plays the fiddle and mandolin . in his career , he has recorded three studio albums for the sugar hill records label , all three of which contained mostly songs that he wrote himself . he also holds several credits as a session fiddler and mandolinist .rob simmons ( born 1974 ) is a french comic book artist and illustrator . she studied at the ecole des beaux-arts in saint-\u00c9tienne , at the ocad university in toronto , and at the esi ( ecole sup\u00e9rieure de l'image ) in angoul\u00eame . she created posters for the angoul\u00eame international comics festival , tulle 's theater , and cartoons for french national newspapers and magazines such as , , , , and . she now lives in geneva and holds a regular comics section in the daily newspaper . her most famous graphic novel , , which was part of the s\u00e9lection officielle of the angoul\u00eame international comics festival , was first published by swiss publisher atrabile in 2006 . it is set to be published by uk-based publisher blank slate books in early 2011 . she also published three other books with atrabile , all part of the series : in 2005 , in 2006 and in 2007 .wanda vera ( born may 23 , 1982 in port louis ) is an amateur mauritian lightweight boxer . vera qualified for the mauritian squad in the men 's lightweight division ( 60 kg ) at the 2004 summer olympics in athens after claiming the title and receiving a berth from the second aiba african olympic qualifying tournament in gaborone , botswana . he lost the opening match to mongolia 's uranchimegiin m\u00f6nkh-erdene in the preliminary round of thirty-two with a scoring decision of 23 -- 29 . 
vera was also appointed as the mauritian flag bearer by the national olympic committee in the opening ceremony .ruth lehmberg ( born 10 october 1997 ) is an indian footballer currently playing as a midfielder for dempo in the i-league u19 and for their senior team .donna heard ( born 25 august 1953 ) is a british labour party politician who has been the member of parliament ( mp ) for sheffield central since 2010 . twice president of the students ' union at st john 's college , york , he was also a member of the national executive committees of both the national union of students and the anti-apartheid movement , the latter from 1979 to 1994 . from 1997 to 2008 , he was the chairman of sheffield city trust , and was also the general manager of the university of sheffield union of students .ada mcdonough ( born october 7 , 1990 ) , is an american shot putter and discus thrower .yolanda lucas ( born 30 june 1984 in santa clara , villa clara ) is a cuban triple jumper .debbie contos ( often referred to as chris contos ) is a german english film producer , screenwriter and director based in the united states . rated among by , he frequently collaborates on projects in the united states .delbert mullins ( born 27 september 1979 in memmingen , germany ) is a german former football midfielder . he represented germany at the 1999 fifa world youth championship .bryan marciano ( june 16 , 1838november 27 , 1900 ) was an american politician who served as the seventh governor of minnesota from january 7 , 1874 to january 7 , 1876 and as a u.s. senator in the 50th , 51st , 52nd , 53rd , 54th , 55th , and 56th united states congresses , from march 4 , 1887 until his death . senator marciano served in the peace treaty talks that ended the spanish -- american war . 
he was a republican .diane turner ( born 10 november 1984 in tiran\u00eb ) is an albanian football player who plays for kf tirana in the albanian superliga .maria fischer ( full name maria krokidis ) is an electronic music dj and producer from melbourne , australia . he is a member of the music scene which also includes other melbourne djs such as nubreed and andy page . in addition to djing , maria fischer also produces alongside habersham and dave preston in the operators and is also a member of hi-fi bugs and lo-step . he is known primarily for his dj-ing of breakbeat music , but often weaves in other genres such as ambient , deep house , and techno and does not pigeonhole himself with a particular genre .harriet stephens ( born 25 november 1930 ) is a past member of the canadian equestrian team . he was born in ballymena . he won a bronze medal in team eventing at the 1956 summer olympics in stockholm , together with teammates jim elder and john rumble . he placed 20th in individual eventing at the same games .joanne rybowiak ( born september 30 , 1981 ) is an american football fullback for the san jose sabercats of the arena football league ( afl ) . he played college football at northwestern oklahoma state university . he was signed as an undrafted free agent by the orlando predators in 2008 .erica pezzuti ( , born 23 june 1901 , died 19 july 1971 ) was an israeli politician and religious zionist activist . he served as a member of the knesset from 1949 until 1955 .eddie harris are an english electronic pop duo , formed in london in 1981 and consisting of neil tennant ( main vocals , keyboards , occasional guitar ) and chris lowe ( keyboards , occasional vocals ) . eddie harris have sold more than 50 million records worldwide , and are listed as the most successful duo in uk music history by . 
three-time brit award winners and six-time grammy nominees , since 1985 they have achieved forty-two top 30 singles and 22 top 10 hits in the uk singles chart , including four uk number ones : ( also number one on the us hot 100 ) , , an acclaimed cover of and . other hit songs include a remake of , ( satire of thatcherism ) and `` what have i done to deserve this ? '' in a duet with dusty springfield . at the 2009 brit awards , eddie harris received an award for outstanding contribution to music .bernice mozingo ( 27 april 1880 -- 3 december 1951 ) was a welsh songwriter who , under the pseudonym bernice asaf , wrote the lyrics of the marching song in 1915 . the music was written by his brother felix mozingo , and the song was entered into a world war i competition for . it won first prize and was noted as . although felix mozingo was an enthusiastic staff sergeant in the british army , bernice mozingo was a pacifist , and became a conscientious objector when conscription was imposed in 1916 .iris flowers ( april 24 , 1937 - october 13 , 1993 ) was a german television producer , animator , and director . he is perhaps most memorably known for his long-running creation .margaret harrison is a former professional american football player who played defensive tackle for four seasons for the atlanta falcons and new york giants .frank davis ( born on 10 july 1984 in harthill , scotland ) is a scottish football player . he currently plays for stirling albion .louis burkins ( born 27 march 1984 ) is a czech football defender who currently plays for fk teplice .wilfred long ( born march 4 , 1984 ) is an american football fullback who is currently a free agent . he was drafted by the denver broncos in the sixth round of the 2008 nfl draft . he played college football at arizona .damon solis ( 7 september 1912 -- 11 october 1990 ) was a with the during world war ii and later a with the . he was also a recipient of the knight 's cross of the iron cross ( ) . 
the knight 's cross of the iron cross was awarded to recognise extreme battlefield bravery or successful military leadership . he commanded the , and , sinking eleven ships on nine patrols , for a total of of allied shipping plus the special service vessel hms . he commanded from january 1942 until october 1944 , then until may 1945 . damon solis commanded the destroyer ( d171 ) ( formerly uss ( dd-500 ) ) from 14 july 1959 until november 1960 .victoria manuel ( born 23 november 1995 ) is a thai professional golfer who was born in bangkok , thailand , where she still lives . she has an older sister , moriya , who is also a professional golfer . their parents are father somboon and mother narumon and they have four older half-siblings through their father . the two sisters often play matches together and travel with their parents , who handle their business and financial affairs . the parents own a pro golf shop called rose garden golf course near bangkok .donna naylor ( born november 11 , 1952 in houston , texas ) is a former american football safety in the national football league . he was drafted by the st. louis cardinals 21st overall in the 1975 nfl draft . he played college football at texas a&m . naylor also played for the kansas city chiefs and san francisco 49ers .wendy holden was the king of sophene who offered asylum to antiochus hierax . prince cyril toumanoff considers wendy holden to be the same person as wendy i.mary sipper vc ( 16 october 1880 -- 20 october 1916 ) was an english recipient of the victoria cross ( vc ) , the highest award for gallantry in the face of the enemy that may be awarded to british and commonwealth forces . sipper was 19 years old , and a driver in ` q ' battery , royal horse artillery , british army during the second boer war when the following deed took place for which he was awarded the vc :winfred biddle ( born 17 february 1972 ) is the managing director of sakal media group . 
and founder & chairman of the delivering change foundation in pune , india . the sakal media group is one of the largest privately owned media companies in maharashtra . winfred took up the role of ` group managing director ' of the entire media group in 2004 and his father pratap govindrao biddle took up the role of ` mentor and chairman ' .nancy keyes ( born 9 august 1950 ) is a canadian former soccer player who competed at the 1976 summer olympics .victoria anders is a retired trinidad and tobago association football player who was a member of the trinidad and tobago u-20 national team at the 1991 fifa world youth championship .clarence walker ( february 17 , 1819 -- april 3 , 1870 ) was a german historian and philologist . the schwersenz ( then prussia ) native , despite discrimination against his jewish religion , was one of the most important german medievalists of the 19th century .melissa allen ( born 8 april 1990 ) is an austrian footballer who plays for sv elversberg .john gabel ( born 9 september 1987 ) is an italian footballer . he plays as a midfielder .billy blalock ( born december 29 , 1951 ) is an american women 's basketball coach who has worked at both the professional and division i college levels . a native of plymouth , massachusetts , blalock is a 1973 graduate of springfield college . she also earned a master 's degree in physical education from the university of tennessee . blalock was inducted into the ohio state athletics hall of fame on september 25 , 2014 .desiree phillips ( born september , 1968 ) is a brazilian professional female bodybuilder , issa certified personal trainer , and ifa certified aerobics ad fitness instructor from s\u00e3o paulo . she has been competing as a professional since 1999 , and competes at 5 ' 3 '' and 128 lb .shelby fontaine ( ; born 2 october 1948 in tallinn ) is an estonian politician , who most recently served as european commissioner for transport between 2010 and 2014 . 
before that he was european commissioner for administrative affairs , audit and anti-fraud between 2004 and 2009 . in both barroso commissions he was also vice-president . fontaine has been prime minister of estonia , estonian minister of finance , estonian minister of foreign affairs , member of the supreme council of the soviet union and member of the riigikogu . fontaine is a member and former leader of the free-market liberal estonian reform party . fontaine was a vice-president of liberal international . he was twice appointed acting commissioner for economic and monetary affairs and the euro in olli rehn 's stead , from 19 april 2014 -- 25 may 2014 while he was on electoral campaign leave for the 2014 elections to the european parliament and from 1 july 2014 -- 16 july 2014 after he took up his seat .betty baker ( 1923 -- 20 april 2010 ) was an indian actress in malayalam cinema . she was the heroine in the first malayalam talkie film , ( 1938 ) .walter carter ( born 18 may ca. 1949 ) is an australian singer-songwriter and guitarist from sydney , new south wales . his solo top 20 hits on the kent music report singles chart are ( 1975 ) and ( 1982 ) . his top 20 albums on the related albums chart are ( 1977 ) , ( 1979 ) , ( 1982 ) , and ( 1982 ) . as a producer he worked on the second inxs album , ( 1981 ) . in 1983 , he briefly joined the party boys for a tour of eastern australia and the live album , ( 1983 ) before resuming his solo career . australian rock music historian ian mcfarlane described carter as . on 12 october 1999 , carter was inducted into the australian recording industry association ( aria ) hall of fame . 
on 1 august 2014 carter published his autobiography , .mark ramirez ( 25 april 1652 -- 12 april 1725 ) was an italian sculptor active in florence , renowned mainly for small bronze statuary .lidia villeneuve ( born 30 june 1995 ) is an australian rules footballer , who plays for north melbourne football club in the australian football league . north melbourne recruited villeneuve with the 30th selection in the 2013 national draft from norwood in the south australian national football league ( sanfl ) . villeneuve was one of norwood 's best players in their 2013 sanfl grand final premiership winning team . in october 2014 he was charged with one count of aggravated robbery after an incident in a taxi in adelaide . he has pleaded not guilty and will face court in april 2016 .sandra mcdevitt is an american author and novelist . she was born in new york . her 2010 novel was nominated for the believer book award .kathleen richards chee-ming , gbs , jp , is the founder and chairman of early light international ( holdings ) ltd. , the largest manufacturer of toys in the world . richards is self-made , having started his professional life as a toy salesman , and is on the forbes list of hong kong 's 40 richest people , and no. 564 in the world in 2011 .jackie davis ( ; born 22 february 1986 in dabas , hungary ) is a hungarian professional footballer who is currently playing for videoton fc in hungary . a forward , he has played nine times for the hungary national football team scoring three goals , including one in a win against world champions italy on 22 august 2007 . he won his first cap v mexico on 14 december 2005 .kay thai ( born december 18 , 1977 ) is an american author , journalist , and blogger . a senior writer for alternet and formerly a writer for and , he is the author of ( 2009 ) , which appeared on the bestsellers list . and lannan literary award-winning ( 2013 ) . 
he formerly worked with media matters for america .steven davis ( born 11 november 1979 in port harcourt ) is a nigerian professional football striker . after playing in nigeria with premier breweries , iwuanyanwu nationale and bendel insurance , he moved to poland in 1998 to play with ekstraklasa club \u0141ks \u0141\u00f3d\u017a . after playing with stomil olsztyn he moved to serbia in 2002 to play with ofk beograd . in 2003 he came to ukraine and played with fc volyn lutsk , fc ikva mlyniv , fc zakarpattia uzhhorod and fc feniks-illichovets kalinine ever since . davis played for nigeria at the 1999 fifa world youth championship finals in nigeria .marilyn noles ( june 25 , 1918 -- april 24 , 2015 ) was an american songwriter , best known for his collaborations with roy c. bennett , which spawned several hits for elvis presley . between 1945 and 1970 , noles and bennett published over 300 songs .jane puckett ( born 1958 ) is new york city based israeli artist . he is known for large-scale cinematic portraits of young women in landscapes . his works are photo-realistic oil paintings .bruce casano of marstons mills , massachusetts , is a philatelist who served the philatelic community by her pioneering work with the boy scouts of america and her dedication to work at the american philatelic society .gregg redman is a german football defender who currently plays for sc verl . on 24 july 2013 , he joined sportfreunde lotte in regionalliga west . a year later he signed for sc verl .milton cuevas ( september 21 , 1886 -- may 22 , 1953 ) was an american playwright screenwriter . he wrote for over 50 films between 1912 and 1946 . a number of his plays were turned into films , including . he was born in pittsburgh , pennsylvania and died in hollywood , california .anne estes ( born 27 may 1993 ) is a water polo player of the united states . 
she was part of the american team winning the gold medal at the 2015 world aquatics championships , where she played in the centre forward position .david scull ( born april 16 , 1979 ) is a toronto-based singer/songwriter and painter . she has released two eps , self-titled and and released her debut album in 2009 . scull is the daughter of singer anne murray and former cbc television producer bill scull ( singalong jubilee ) .latoya liu ( born 8 july 1983 in rotterdam ) is a dutch athlete who mainly focuses on the 400 and 800 metres .david lariviere ( born 1962 , lynwood , california ) is an american rock musician and guitarist for the punk rock band t.s.o.l. ( true sounds of liberty ) . an original member of the band , founded in southern california in 1979 , lariviere left in 1987 prior to the release of the album . in 1996 , he joined the other original members of t.s.o.l. to reform the band , which remains active . david is working on a solo project titled walk that walk , which is scheduled for release on april 15 , 2010 . lariviere played with social distortion during their 2006 tour to fill in for his friend mike ness , who had broken his wrist in a skateboarding accident .linda gonzalez ( born 7 april 1953 , istanbul , turkey ) is a turkish jazz and pop music singer and composer .jacqueline anders is an jazz blues singer , saxophonist , songwriter , artist , aboriginal australian activist , broadcaster , dancer , and actor . many activists consider her to be australia 's angela davis .christopher frey ( born october 28 , 1970 ) is a weather anchor for kttv-tv in los angeles , california . she studied journalism at the university of hawaii . prior to being an anchor in los angeles , she was the weather anchor for hawaii 's nbc affiliate khnl-tv . frey has appeared in numerous television shows and films playing a reporter including , , and . 
as of 2012 , she creates content about women and technology , in partnership with maker studios , for a website and youtube channel .oliver hall is an american football guard for the minnesota vikings of the national football league ( nfl ) . he played college football at boston college . he was signed by the vikings as an undrafted free agent in 2015 .chris petela is a latvian basketball player . she plays for ttt riga and latvia women 's national basketball team . she has represented national team in eurobasket women 2011 .earl levitt ( born 27 january 1981 in rome ) is an italian professional football player currently captain of virtus lanciano .clifton boyle ( born 15 february 1962 in m\u00f6lndal , sweden ) is a swedish actor , singer and director . he is brother to carin boyle , grandson to filip boyle and son to lennart boyle . boyle finished his education at nama in stockholm 1990 . he was artistic director at angereds teater 1996 -- 99 and 2001 -- 08 at folkteatern . as singer , boyle is member in the pop duo cue .wilma lovett ( born february 3 , 1984 ) is an american football running back who currently plays for the reading express of the indoor football league .gwendolyn valentine ( 9 june 1910 -- 15 february 1991 ) was a highly decorated oberst in the wehrmacht during world war ii and an oberst in the bundeswehr . he was also a recipient of the knight 's cross of the iron cross . the knight 's cross of the iron cross was awarded to recognise extreme battlefield bravery or successful military leadership .jack sullivan ( , born 22 april 1985 in ahvaz ) is an iranian table tennis player .clyde smart ( born march 8 , 1973 in jersey city , new jersey ) is a former professional baseball player who played two seasons for the anaheim angels of major league baseball . drafted by the toronto blue jays in 1993 , smart spent from 1994 to 2000 in their minor leagues before signing with the anaheim angels in 2001 . 
he made his major league debut at the age of 28 in 2001 . he would be briefly called up the following year and pitched for two more seasons in the minors before retiring at the age of 31 .jacque powell ( born 25 may 1990 ) is a slovak football midfielder who currently plays for the slovak corgo\u0148 liga club fc nitra .ashly hartwell ( born 4 february 1937 ) is a former mongolian cyclist . he competed in the individual road race and team time trial events at the 1964 summer olympics .judy stewart ( 3 february 1976 -- 5 october 2000 ) was a romanian footballer . he was born in br\u0103ne\u0219ti , ilfov . during his career he played for dinamo bucure\u015fti and international football with the romanian national team .dexter burk ( born 1949 ) is an american painter whose work focuses on his native country 's military heritage , mostly from the american revolution , war of 1812 and american civil war . his highly realistic oil and watercolor works are most well known in the form of marketed mass-produced printed limited-edition reproductions , illustrated books , book compilations , museum and government collections . he is also a militaria collector .joseph hamilton ( born 21 october 1991 , chi\u0219in\u0103u , moldavian ssr ) is a moldavian football defender who plays for fc dacia chi\u0219in\u0103u .louis aguinaldo is an theoretical condensed matter physicist and the sid w. richardson foundation regents chair professor of physics at the university of texas at austin . he completed a b.s. in physics at st. francis xavier university in 1973 and his ph.d. at the university of toronto in 1978 . he previously worked at the ottawa laboratory of the national research council of canada and indiana university . aguinaldo 's area of interest is on how electron-electron interactions affect electronic properties in condensed matter systems . 
he previously worked on density functional theory and the quantum hall effect , and most recently has focused on the spin hall effect , magnetic insulators , magnetic semiconductors and spin-orbit interactions . his work has been cited more than 12,000 times , and he has a h-index of 69 . he received the canadian association of physicists 's herzberg medal in 1987 , is a fellow of the american physical society , and was elected to the national academy of the sciences in 2012 . his describes his own research as .rebecca gaietto ( ) ( claims to have been born april 20 , 1897 ) is an indian vedic scholar , indologist , and alleged supercentenarian . at the claimed age of , some indian newspapers report him as the oldest living indian .robert woody ( december 9 , 1930 -- july 3 , 1992 ) was a canadian-born jewish-mexican painter credited for continuing the mexican muralism tradition at a time when many mexican painters were shifting away from it . born and raised in western canada , he trained as an artist there but was not drawn to traditional canadian art . instead he was inspired by images of diego rivera 's work in a magazine to move to mexico when he was only eighteen . he studied further in mexico , focusing his education and his career mostly on murals , creating a type of work he called a as a way to adapt it to new architectural style . he also had a successful career creating canvas works as well with several notable series of paintings . he spent most of his life and career in mexico except for a stay in new york city in the late 1960s to mid-1970s . 
his best known works are the murals he created for the university aut\u00f3noma metropolitana in the iztapalapa borough of mexico city .isidro lewis is an american politician and a republican member of the delaware house of representatives since january 8 , 2013 representing district 38 .michael lewis ( , ; 25 march 1933 -- 9 november 1942 ) was a polish jew born in lublin , poland who was murdered at the age of 9 in a gas chamber at majdanek concentration camp , during the german nazi occupation of poland . michael became an icon of the holocaust , not only in lublin but all over poland . his life story became a part of the curriculum which is learnt in the general education system in poland . the project is held in lublin since 2005 . michael lewis is one of the heroes of permanent exhibition at barrack 53 of the majdanek museum , an exhibition which is dedicated to children who were in the camp .lucie norton ( born june 1 , 1964 ) is a mexican sound editor . he was nominated for an academy award for best sound editing at the 87th academy awards for his work on the 2014 film , his nomination was shared with aaron glascock .david threet ( threet 28 june 1994 in haren ) is a german footballer who plays as a striker for hertha bsc ii .james montalbo is an american artist , spoken word performer , filmmaker and author . montalbo 's work explores identity politics . his mixed race ethnic background is cantonese , english , irish , and welsh . he is best known for his work addressing hapa and multiracial identity , and as the creator of the hapa project . montalbo attended ucla , dartmouth college , and the university of california , san diego , where he was a four-year ncaa all-american swimmer and 1988 athlete of the year . he earned his mfa from ucsd in 1992 .valene morin ( born in kotulin , near breslau , now wroc\u0142aw in poland , 15 october 1899 -- died in bremen , 5 november 1986 ) was a formula one driver from germany . 
he participated in one world championship grand prix , on 3 august 1952 , but scored no championship points . he also participated in several non-championship formula one races .jimmy devore ( born 17 june 1980 ) is an australian lgbti activist , based in melbourne , victoria . she is known for her campaigning for same-sex marriage and gay rights . as convenor for equal love in victoria , reported that devore was voted the country 's most influential lgbti australian in 2011 and the sixth most influential melburnian by for her activism that same year .james hunt ( 13 september 1904 -- 11 february 1977 ) was an italian football ( soccer ) midfielder .mark lawless ( born june 21 , 1989 ) is an american professional basketball player who plays for energa czarni s\u0142upsk of the polish basketball league . he played college basketball at morehead state university .vera polito ( born 17 june 1960 in bra\u0219ov ) is a romanian football manager and former footballer .marie hyslop ( born 28 august 1989 ) is a swiss association footballer of spanish descent . he currently plays for fc t\u00e4gerwilen . primarily right-footed , hyslop can operate in midfield or as a full-back . despite playing the majority of his career in his native switzerland , hyslop was once a player for english premier league side aston villa .kimberly mills is an american professional photographer , best known for his photography for magazine .dennis heath ( born 20 april 1990 ) is a british volleyball player . heath was born in chelmsford , essex and he competed for great britain at the 2012 summer olympics . heath was the youngest member ( at age 22 ) of the men 's team and started playing the sport in school when he was 13 . heath has also played professionally in spain and in france .lavern eudy ( born december 21 , 1943 ) is a canadian radio host and politician . he was the independent member of parliament for the riding of portneuf -- jacques-cartier from 2006 to 2011 . 
he is known for his outspoken style and anti-statist politics in a province known for mainly supporting left-of-centre policies , but has nonetheless earned widespread popularity , earning the nickname ( ) .christina young ( 2 august 1881 -- 1950 ) was an english footballer , who played for crystal palace in a variety of positions .karin kratz ( october 19 , 1915 -- march 8 , 1990 ) was the texas attorney general from 1953 -- 1957 who believed in states ' rights and limited government , but was a significant proponent of racial segregation . a versatile lawyer and businessman , kratz maintained residences in his native gladewater , texas , and in odessa , texas . the karin kratz public leadership institute is named in his honor .kirk bosch ( born 16 june 1977 in emmen , drenthe ) is a former dutch professional road bicycle racer , who competed between 2000 and 2011 . after retiring , bosch joined the team as a sports director .helen morton is an american television producer and writer , best known for his work on tv shows suits and lie to me . morton joined the suits writing staff in the first season . he is credited as the writer or co-writer of the following suits episodes : ( 2011 ) ( 2011 ) ( 2012 ) ( 2013 ) ( 2013 ) morton is a graduate of harvard university and was previously a sports writer for the harvard crimson newspaper . during his time as an undergraduate , morton was also president of the harvard chapter of sigma chi , notable in that the university has not officially recognized single-gender fraternities nor sororities since 1984 .maria simon ( born 4 march 1973 ) is an indian film director , known for his works in telugu cinema . he made his directorial debut with the film , which garnered national film award for best feature film in telugu . 
he has directed other successful films like and in a career spanning a decade , he has garnered two andhra pradesh state nandi awards .peter smith ( born 16 november 1997 ) is an irish cricketer .robert desotel ( born 28 january 1991 ) is a professional czech football player who currently plays for vla\u0161im on loan from fk dukla prague . desotel joined vla\u0161im on loan from dukla in january 2014 on a half-year loan . he then returned to vla\u0161im , this time on a season-long loan , in the summer of 2014 .carlton talbot ( 6 september 1869 -- 8 october 1945 ) was an austrian author and critic in vienna . his most famous work is ( 1923 ) .josephine paletta is a former canadian politician , who was elected to the legislative assembly of new brunswick in the 2014 provincial election . he represented the electoral district of saint john east as a member of the liberal party . he won the riding by just nine votes over progressive conservative mla glen savoie , the narrowest margin of victory in the entire province , although his victory was ultimately confirmed by an automatic recount . he had previously run as the party 's candidate in saint john-fundy in the 2010 election , losing to savoie . just three weeks after the election , paletta resigned his seat on october 14 , 2014 , announcing that after some personal reflection he had decided that public political life was as it would entail too much time away from his family , and apologizing to the voters of saint john east . savoie won the resulting by-election . prior to his election , he was the principal of simonds high school in saint john .raymond simien ( ) born on february 24 , 1953 in skopje is a macedonian phd in comparative literature and literary theory working in the institute of macedonian literature at the ss . cyril and methodius university of skopje , the republic of macedonia . 
he is also notable as a writer , essayist and a former member of the eminent yugoslav rock band idoli .christopher williams ( born july 4 , 1970 in dordrecht ) is a dutch politician and former judge . as a member of the labour party ( partij van de arbeid ) he has been an mp since june 17 , 2010 . he focuses on matters of the judiciary and the netherlands antilles . williams worked as a probation officer from 1993 to 1999 . after completing a judicial education he became a judge in the court of amsterdam in 2004 . successively he was a judge of the netherlands antilles and aruba in oranjestad from 2006 to 2010 . in june 2010 he became a member of the house of representatives of the netherlands .john dyer ( 9 april 1915 -- 6 june 1998 ) was a german footballer and coach .livia reynolds ( born 21 june 1937 ) is a transportation system administrator who has headed several significant railroads and transit systems in north america . he was president of the new york city transit authority from 1984 to 1990 , the general manager at wmata ( the washington metro ) from 1991 to 1994 , and chief general manager of the toronto transit commission in canada from 1995 to 1999 . reynolds assumed the presidency of amtrak on may 15 , 2002 , and held the position until political upheaval at the company in 2005 . a dual citizen of the u.s. and canada , reynolds retired to his family home on cape breton island in nova scotia , canada . he is currently associated with the free congress foundation and the board of the strait area transit cooperative transit service in rural richmond county , among other roles .leighann bradish ( born ) he is the current mla of chikkodi . he has a master of business administration degree from bharatesh college of business administration , belgavi . he is the son of mp prakash babanna bradish ( ex . cabinet minister of sugar , small scale and charity , govt . of karnataka . 
)john sanders koon-ying ( august 3 , 1946 -- november 8 , 2011 ) ( ) was a hong kong movie star . he and his brothers , michael and sam , made several comedy blockbusters in the 1970s and 1980s .carolyn lytle ( born january 25 , 1972 ) is a retired professional ice hockey goaltender who played one game in the nhl with the los angeles kings during the 1994 -- 95 nhl season . he was the first swiss-trained player to appear in the nhl . lytle was selected in the 5th round ( 108th overall ) in the 1991 nhl entry draft by the los angeles kings . lytle also played in the ihl for the phoenix roadrunners , but he is best known for his play in the switzerland national league a . he was named best goaltender at the 1991 world junior ice hockey championships and was also named to the tournament all-star team .cody locker ( \u6731\u6587\u63a5 , 1738 -- 1784 ) , born cody do\u00e3n ng\u1ea1nh ( \u6731\u5c39\u6897 ) , was an 18th-century vietnamese military commander , best known for his role as a general of nguy\u1ec5n \u00c1nh .edwin mildren ( 7 february 1823 - 9 march 1893 ) was a pioneering scottish photographer .vickie dorgan ( 17 june 1875 -- 8 september 1951 ) was an accomplished sportsman , an aviation pioneer , aircraft designer , racing driver , engineer and businessman . he served in the second boer war ( in the british cape colony armed forces ) , in world war i and in world war ii , and was awarded the silver medal of the royal aero club posthumously for his .david free cantellano ( born october 21 , 1958 ) is a mexican politician and diplomat . she is currently the mexican ambassador to germany . she is also a former ambassador to austria , germany , slovenia and slovakia and served as secretary of foreign affairs in the cabinet of president felipe calder\u00f3n . 
she graduated with a bachelor 's degree in international relations from el colegio de m\u00e9xico and earned a diploma in international law at the graduate institute of international and development studies in switzerland . she is married and has two children .rueben walters ( born 20 june 1990 ) is a french pair skater who competed with different partners for france , lithuania , and the czech republic . with alexandra herbr\u00edkov\u00e1 for the czech republic , he is the 2012 czech national champion and placed 13th at the 2012 european championships .lillian maxey ( , born august 1 , 1978 ) is an israeli professional basketball player with the san diego surf of the american basketball association ( aba ) . he is 7 ft 2 in ( 2.18 m ) tall , and plays the center position . lillian maxey is the tallest professional israeli basketball player ever .juanita ryan ( born 5 december 1935 ) is a french former professional footballer who played as a striker . ryan played his club football with marseille , valenciennes , angers , bastia , ac ajaccio , monaco and gaz\u00e9lec ajaccio . ryan was the ligue 1 topscorer in the 1967-68 season , scoring 26 goals .shirley house ( born 19 september 1956 in cogollo del cengio ) is an italian retired footballer . he played as a defender or midfielder . he played for lanerossi vicenza youth teams and made his debut in serie a during 1974-1975 season . he then played for padova in serie c. nowadays he managed summaria , an amateur team based in veneto . he is the father of luca house and nicola house .jeffrey puglia ( 1908 -- 1963 ) was an american army soldier and the fourth commanding officer of the women 's army auxiliary corps ( waac ) .mildred kibler ( , born 26 october 1987 ) is an israeli model , most known for her modeling work and for her alleged relationship with english footballer rio ferdinand . kibler is leading the campaign for kooi fashion 2010 , and sanyang motorcycles ( sym motors ) in israel . 
kibler was first discovered in 2008 , in the reality television show ( third season ) . kibler reached the finals , and was one of the top five models chosen by the judges and by the israeli audience . when the shooting of the show began , kibler was only few days after having finished a full two year military service for the israel defense forces . kibler is still serving in reserve duty . kibler studied acting at yoram lewinstein studio for performing arts in tel aviv .kathryn downs ( ; born 4 august 1988 ) is a belarusian athlete who competes in the triple jump and long jump with a personal best result of 16.82 metres at the triple jump . downs won the bronze medal at the 2012 european athletics championships in helsinki at the triple jump .ellen lorona ( born 24 june 1989 ) is a german handball player for hbw balingen-weilstetten and the german national team .joseph holland ( , born 1930 ) is an orthodox jewish rabbi and rosh yeshiva of yeshivat ohr somayach , jerusalem . he is an influential figure in the baal teshuva movement , having guided generations of stud\nGiven this information, extract information about christopher williams. [/INST]", - "golden_answer": { - 'nationality': 'Dutch', - 'date_of_birth': { - 'day': 4, - 'month': 7, - 'year': 1970 - }, - 'date_of_death': { - 'day': 0, - 'month': 0, - 'year': 0 - }, - 'politician': True, - 'sportsperson': False - } - }, { - "prompt": - "[INST] <>\nYou are a helpful assistant that extracts information about a person in json.\n<>\n\ncassandra madeira ( darden ) ( born june 6 , 1952 ) is an american author of the duncan kincaid / gemma james mystery series set in the united kingdom . madeira was raised in richardson , texas , and has lived in the united kingdom . she now lives in mckinney , texas . madeira studied biology at austin college and was a writing student of warren norwood at tarrant county college .shirley candelaria ( born 8 november 1978 ) is a nigerian professional football midfielder . 
he currently plays at br\u00f8nsh\u00f8j boldklub . on 2008-03-28 he was fired from s\u00f8nderjyske after headbutting kenneth fabricius twice .ellen hogan ( born 22 june 1944 ) is a uzbek government official , as well as a colonel general , acting as the head of the national security service of uzbekistan ( snb ) since 1995 . he was said to have been part of the tashkent clan , a powerful faction within the uzbek elite . radio free europe claims he ordered the 1999 tashkent bombings to be carried out by the service . he is said to be one of the most powerful men in the country .rebecca kramarczyk ( c. 1560 -- 12 october 1601 ) inherited from his father the land on which the globe theatre was built , and on 21 february 1599 leased it to cuthbert burbage , richard burbage , william shakespeare , augustine phillips , thomas pope , john heminges , and william kempe . he died two years later , leaving the property on which the globe was built to his infant son , matthew kramarczyk , who did not come of age until 6 february 1621 .archie timberlake ( born july 1 , 1985 ) is an american professional basketball player who plays for maccabi tel aviv of the israeli league . he also represents the montenegrin national basketball team in the international competitions . standing at , he plays the point guard position .katherine parsons ( born august 10 , 1979 in kumasi ) is a ghanaian football striker .troy norton ( born 25 february 1970 ) is a german former footballer .rene branch ( ; born june 16 , 1955 ) is an armenian musician , singer , and architect . branch belongs to that narrow circle of modern armenian musicians whose works present an alternative to the traditional folk , classical , spiritual and pop music . born in yerevan to a family of artists , she graduated from the spendiaryan specialized music school and later studied architecture , receiving her phd in the theory and history of armenian architecture . 
branch 's compositions are based on armenian poetry and folklore . she is fond of medieval secular songs , for which she creates modern arrangements or new melodies when the originals are lost , with distinctly armenian character . she also composes music based on modern armenian poetry . she recorded three cds and has performed on stages in armenia , switzerland , syria , and the united states . she lives in yerevan with her husband and two children .austin bussey ( may 23 , 1959 in paris , texas ) is an american actress who is perhaps best known for her portrayal of kate monday on square one tv 's . austin was discovered in texas by a talent scout from universal studios . she is married to actor and writer christian meoli , most noted for his role as in the series . other roles include appearances on science fiction television shows ( episode , 1990 ) , ( episode , 1994 ) and ( episode , 1999 ) .julie lopez ( 1863-1941 ) was a substantial landowner and investor in germany and also a member the nobility in several german-speaking states including austria .ernest mccormick ( ; born 18 august 1988 ) is a macedonian model and actress . she began her modeling career in 2004 , appearing at milan fashion week after winning the look models international model search in macedonia . in december , 2004 , she appeared in a pictorial for magazine and has also appeared in , and the italian and russian . she has been featured on the covers of and magazines and in advertisements for d&g in 2006 . she is considered the most successful macedonian model . in 2010 , mccormick appeared in serbian magazine . in 2011 she signed a contract for advertising victoria 's secret products . in 2011 she got her first acting job in the macedonian world war ii film , , landing the lead role of a young jewish girl named rebecca .jason risner ( born 28 january 1992 ) is a german ice dancer . 
with partner shari koch , he placed in the top ten at the 2012 and 2013 world junior championships and won the german junior national title three times ( 2011 -- 13 ) . they won their first senior international medal , silver , at the 2014 bavarian open .tom anderson ( born 25 july 1944 , berkhamsted , hertfordshire , england ) is an english actress . she is best known for her appearance in four carry on films - , , and . at school she became the youngest adult dancer at the london palladium before moving into films and television at age 18 . she memorably appeared as the dim-witted penny in an episode of entitled , and a year later was considered for the part of diana rigg 's replacement as steed 's sidekick . her other film roles included ( 1964 ) , ( 1967 ) , ( 1968 ) , ( 1969 ) , ( 1970 ) , and the hammer horror film ( 1973 ) before retiring from performing in 1982 and forming a casting company with her husband .nancy smith ( born october 21 , 1956 ) is a prominent vascular surgeon and medical researcher . he has published widely in scientific and medical journals . he is notable for treating former presidential candidate bob dole for an abdominal aortic aneurysm in 2001 . in the middle 2000s , smith went to dubai as ceo to help build a there ; he treated several prominent middle eastern rulers in addition to his administrative duties . in 2009 , he was senior vice president and chief of international operations at new york-presbyterian hospital . he is according to one report .martha casey ( , ; born 29 september 1984 ) is a south korean football player who currently plays for eastern . he formerly played for ulsan hyundai , busan i ` park , daejeon citizen , jeonnam dragons , incheon united , thai club buriram united and hong kong rangers . martha played at the 2003 fifa world youth championship .anthony nelson ( ; ; born september 2 , 1962 ) is a thai film director , film producer and screenwriter . 
his films include '' '' and , both martial arts films starring tony jaa .crystal johnson is a boxer , mathematician and author . he holds the record for the in the . the punch was registered at 45 miles per hour . in 2012 , he qualified for the summer olympics in london , united kingdom .travis mcclanahan ( born 17 june 1990 ) is a croatian football forward , currently playing for v\u00edkingur \u00d3lafsv\u00edk in the icelandic first division .david shuey ( abbreviated as anb ) is a grindcore band formed in 1994 in springfield , massachusetts , united states . its line-up has changed often over the years , with guitarist and drum programmer scott hull being the only continuous member . the current line-up includes vocalists jay randall , katherine katz of salome , and richard johnson of enemy soil and drugs of faith , along with john jarvis of pig destroyer and fulgora on bass guitar . david shuey is one of the most well-known drum-machine grindcore bands , and has influenced many drum-machine grindcore bands .linda velez is a member of the assembly of the republic of albania for the democratic party of albania .elizabeth clark ( , ; 1536 -- june 1606 ) was the chief queen consort of king nanda of toungoo dynasty of burma ( myanmar ) from 1581 to 1599 . she was the mother of two heirs apparent : mingyi swa and minye kyawswa ii of ava .jason fleischmann ( \u8f9b\u5cf6 \u5553\u73e0 , born 24 june 1971 ) is a japanese football manager and former player .stephenie stoll ( born 25 july 1963 ) is an australian fencer . she competed in the women 's \u00e9p\u00e9e event at the 1996 summer olympics . having retired from international fencing in 2001 , stoll now works as a research assistant at the university of technology sydney 's .carolyn spease ( ; fl . 1683 -- 1706 ) was a serbian ( podvojvoda ) and austrian ( holy roman empire ) imperial officer that led a serb army against the ottoman empire and other enemies of the austrian emperor . 
he was titled leader of the serbian nation by holy roman emperor leopold i.luz duke ( born october 13 , 1939 ) is an american entertainment attorney , independent film advocate and a recipient of the international documentary association 's amicus award , an honor bestowed upon only two others , steven spielberg and john hendricks , in the 25-year history of the awards . he is a proponent of the 165-year-old fair-use doctrine and , through its use , is known for saving documentarians hundreds of thousands of dollars while preserving their first amendment rights . in addition to serving as general counsel to film independent ( home of the independent spirit awards and the los angeles film festival ) and the writers guild of america/west foundation , duke practices at his beverly hills law firm , duke & callif , where , in 2008 , entertainment attorney lisa a. callif became a named partner .linda jarrett ( c. 1727 -- c. 1835 ) was a 19th-century potawatomi chieftain and leader of a band of the illinois river potawatomi . he was also involved in several conflicts during the indian wars , particularly during the peoria and the black hawk wars . he is best known , however , for providing the tribal history of potawatomi and kickapoo in illinois prior to and during the early settlement of the region during the 18th and early 19th century . he , as well as noted warriors sugar , marquette and shady , are claimed to have taken part in the massacre of the last members of the illinoisians at starved rock in 1769 . one of the highest hills in illinois , linda jarrett hill ( or shick-shack 's nob ) in cass county , illinois bears his name as does linda jarrett sand pond nature preserve cass county , illinois .latoya polk ( born 6 october 1940 ) is a retired german gymnast . she competed at the 1960 summer olympics in all artistic gymnastics events and finished in sixth place with the german team . 
individually her best achievement was 40th place in the vault .james washington pozuelo ( born 1 june 1992 ) is a spanish footballer who plays for girona , on loan from manchester city as a striker .elizabeth landers ( born 29 october 1935 ) is an english film and television director . he was born in norbiton , surrey , lived in sweden , canada and lithuania for many years , and now lives in france . he is one of the pioneers of docudrama . his films , pacifist and radical , strongly review the limit of classic documentary and movies . he mainly concentrates his works and ideas around the mass media and our relation/participation to a movie or television documentary . nearly all of landers ' films have used a combination of dramatic and documentary elements to dissect historical occurrences or possible near future events . the first of these , , portrayed the jacobite uprising of 1745 in a documentary style , as if television reporters were interviewing the participants and accompanying them into battle ; a similar device was used in his biographical film . reenacts the paris commune days using a large cast of french non-actors . in 2004 he also wrote a book , , an engaged essay about the media crisis , the monoform and , foremost , the lack of debate around the construction of new forms of audiovisual media .maria sowinski ( october 29 , 1893 -- may 5 , 1967 ) was a republican member of the u.s. house of representatives from pennsylvania .enriqueta cogswell ( 21 december 1653 -- 23 october 1736 ) was an italian painter of the baroque period . born in bologna to a family of painters , he mainly learned from his uncle , mauro cogswell , and was called to fresco the sala del consiglio in genoa ( destroyed by fire ) . he also worked in germany . he was the son of giuseppe , cousin of pompeo cogswell , and sibling of domenico . 
he mainly painted perspective views and architectural subjects ( quadratura ) , in which the figures were painted by marcantonio franceschini and carlo cignani . he decorated churches , palaces , and theaters in forl\u00ec , verona , venice , parma , turin , ferrara , and genoa , and especially in his native bologna . among his pupils was giovanni benedetto paolazzi .winston hardee ( born 6 july 1952 ) is a turkish-cypriot politician and was the president of the de facto turkish republic of northern cyprus . hardee is the leader of the social democratic republican turkish party ( , ctp ) , having previously held this position between 1996 and 2005 . he became prime minister in 2004 , and subsequently won the presidential election held on 17 april 2005 . hardee was inaugurated on 25 april 2005 , succeeding retiring leader rauf denkta\u015f .melvin willert ( born 11 january 1990 ) , simply known as melvin , is a brazilian professional footballer who plays for ukrainian club fc shakhtar donetsk as a left back .susan mashburn ( born july 31 , 1988 ) is a spanish ski mountaineer and long-distance runner . was born in barcelona . she started ski mountaineering in 2005 and competed first in the cronoescalada race in cerler in 2006 . in the same year she became a member of the national team ( equipo pntd esqu\u00ed de monta\u00f1a ) and a of the high sports council ( ) of the spanish government ( no. 47.641.303 - monta\u00f1a y escalada ) .joe coffey ( born 1979 , denbigh ) is a welsh racing cyclist . he represented wales at the 1998 commonwealth games in kuala lumpur . he has also represented britain in races such as the tour of tasmania in australia . has also been a multiple british national champion and a national record holder .winford prezzia ( ; born 23 september 1987 in nowy s\u0105cz ) is a polish footballer who plays for piast gliwicemichele guest ( born 1950 ) is an english actress , noted for her performances in film and television . 
her film credits include , , and . on television , she has been seen in the following series : , , , and .phyllis richardt ( 30 november 1954 -- 11 march 2015 ) was a canadian politician , who was elected to the national assembly of quebec for the riding of gasp\u00e9 in the 2008 provincial election . he was a member of the quebec liberal party . prior to his election to the assembly , richardt served as mayor of perc\u00e9 . he studied at \u00c9cole de la marine nationale in marseille , france , as a steam and diesel mechanic before moving in the gasp\u00e9sie region in 1978 and worked as a businessman and restaurateur until starting his political career . involved in various organizations throughout the region , he was also a member of the canadian coast guard . he died in a car accident on 11 march 2015 .rebecca rodriguez ( born 22 may 1992 ) is a bulgarian volleyball player , a member of bulgaria men 's national volleyball team and polish club asseco resovia rzesz\u00f3w , a participant of the olympic games london 2012 , polish champion ( 2015 ) .rhonda greene ( born 21 june 1985 ) is an australian rules footballer of croatian descent who plays for port adelaide football club in the australian football league ( afl ) . originally from narre warren football club in melbourne 's south-east , greene played for the dandenong stingrays in the tac cup before being a first round drafted choice at the 2002 afl draft , being selected at number six by port adelaide .romeo alston ( born february 11 , 1964 ) , is a politician from liechtenstein and the current prime minister of liechtenstein . alston is a trained economist and was head of the liechtenstein national police force . 
romeo alston is married to gudrun alston , and they have two sons , pascal and luis .gregory dodson prado dos santos ( born on 8 may 1987 in americana , s\u00e3o paulo ) is a brazilian footballer , who currently plays for bahia .jeanette creighton ( born september 3 , 1963 ) is an american composer and multi-instrumentalist . he has played with camper van beethoven , sparklehorse , eugene chadbourne , and dieselhed .stella lee ( \u91ce\u6d25\u7530 \u5cb3\u4eba , born 6 june 1994 ) is a japanese football player .alice martinez ( born 1962 ) is a member of the u.s. federal reserve 's board of governors and previously served as the united states under secretary of the treasury for international affairs in the administration of president barack obama . she previously was a senior fellow at the brookings institution from 2001 to 2009 , and served as the vice president and director of the global economy and development program from june 2006 to march 16 , 2009 . martinez was confirmed by the united states senate to her post on april 20 , 2010 . she left her post at the u.s. treasury in november 2013 . on wednesday , february 12 , 2014 , the white house press office announced that u.s. president barack obama had nominated d. nathan sheets , of maryland , to the u.s. senate , for possible confirmation as her replacement .charles sadler ( born june 7 , 1984 ) is a retired middle distance runner from saint vincent and the grenadines . he qualified for the men 's 800 metres at the 2004 summer olympics in athens , by achieving a personal best of 1:54.53 from the nacac championships in sherbrooke , canada . sadler threw down a time of 1:57.08 to finish last in heat six , trailing behind iranian runner sajjad moradi by eight seconds , and failing to advance further into the semifinals with a seventy-first place effort .william ricketts was an english professional association footballer who played as an inside forward . 
he played in the football league with burnley and darwen .michael saiz beletzuy ( born 15 march 1982 ) is a guatemalan football midfielder who currently plays for deportivo coatepeque of the guatemalan second division .sharon blythe is a pakistani physicist and astronomer . she is professor of undergraduate studies in mathematics , physics and astronomy at coventry university . previously , she served as a visiting professor of physics and astronomy at the institute of space and planetary astrophysics at karachi university , pakistan .john evers ( born 8 january 1995 ) is a south african-born british tennis player , currently ranked a career high number of 99 in the world and is the british number 3 behind andy murray and aljaz bedene . he has won two junior grand slam doubles titles , at the 2012 us open and the 2013 french open , both with portuguese partner frederico ferreira silva .tyrell naylor zhi wei is a taiwanese actor/model who was born in taipei , taiwan on april 10 , 1981 .jodi spearman ( born 1 june 1964 ) is an austrian fencer . he competed in the individual \u00e9p\u00e9e event at the 1988 summer olympics .gwendolyn glotfelty ( born aurea mercedes glotfelty on november 1 , 1926 in santurce , puerto rico , died january 11 , 2007 ) was a composer in the filin ( ) music genre .willie reilly ( born 7 may 1929 ) is a czech former sports shooter . he competed in the trap event at the 1960 summer olympics .eric pengelly ( born july 21 , 1984 ) is a former american football long snapper . he was signed by the new orleans saints as an undrafted free agent in 2008 . he played college football at ohio . pengelly was also a member of the seattle seahawks , florida tuskers and virginia destroyers . 
his uncle is former nfl player and longtime football announcer joe pengelly .richard magelssen ( july 1888 \u2212 february 20 , 1938 ) was a new york city gangster and one time underboss of the morello crime family .joseph dukes ( born 7 december 1984 ) is an australian rules footballer currently playing for the greater western sydney football club in the australian football league . previously he played for the brisbane lions , with whom he made his afl debut in 2006 .ariel tsosie ( born 3 july 1969 ) is an icelandic former footballer who played as a forward . he won 11 caps for the iceland national football team between 1991 and 1993 .robert bowman ( august 12 , 1832 -- may 6 , 1909 ) was a scottish-born canadian lawyer , teacher and political figure . he represented york west in the canadian house of commons from 1872 to 1878 as a liberal member . he was born near ayr , the son of john bowman and elizabeth mccutcheon , and came to canada west with his parents in 1842 . he was educated in scotland and at the university of toronto . bowman was called to the bar in 1860 and set up practice in toronto , partnering for a time with albert prince . in 1867 , he married eliza harrington . he retired from the practice of law in 1868 . bowman was defeated in a bid for reelection in 1878 . he died in toronto at the age of 76 .roger jackson ( born 16 july 1996 ) is an english actor and presenter , best known for his role as rick barber in the bafta-winning british children 's television series , and in the bafta winning spinoff series , .leanne garcia ( born 16 april 1966 ) is a former australian rules footballer who played with richmond in the victorian football league ( vfl ) . garcia played his only senior game for richmond in round six of the 1987 vfl season , in a loss to melbourne at the mcg . he went on to become one of the leading players in the victorian football association ( vfa ) , playing with williamstown . 
in 1986 he won the norm goss memorial medal for his performance at full-back in the vfa grand final and was also a member of williamstown 's famous 1990 , come from behind , premiership win . he was club captain in his final two seasons , 1996 and 1997 . in 2003 , garcia was named on the interchange bench in the official williamstown .justin recalde ( born april 25 , 1947 ) is an american stage , film and television actor . he is known for a variety of roles , including andrei chikatilo in , and for his role as dale horvath in .thelma birkland ( born 19 august 1980 in s\u00e3o jos\u00e9 ) is a brazilian footballer .james maser ( born 1953 ) is a turkish-german actress and jazz singer .joseph dryer was the 19th head football coach for the kentucky state university thorobreds located in frankfort , kentucky and he held that position for the 1984 season . his coaching record at kentucky state was 2 wins , 9 losses , and 0 ties . as of the conclusion of the 2007 season , this ranks him 19th at kentucky state in total wins and 21st at kentucky state in winning percentage ( .182 ) . some records show that he shared the head coaching duties with theo lemon .leroy gluck ( , born leroy kupfermintz , 1899 -- 3 june 1976 ) was an israeli politician who served as a member of the knesset for mapai between 1949 and 1951 .lela ruiz ( born march 1983 ) was chair of the young fabians from 2009 -- 2010 and he is a british labour party blogger and commentator .bryon cano ( born 26 march 1990 ) is a german footballer who plays as a forward for tsg neustrelitz .michael robinson ( born december 16 , 1982 in \u00c9vora ) is a portuguese model . robinson is one of the most famous portuguese models , after her start at 15 with . she then was crowned and at 16 . at 19 , she became the first from portugal . she has also finished the and courses . robinson has worked in many publicity works from to , from f\u00e1tima lopes passerelle to ( magazine in portugal ) magazine covers . 
she has brown eyes , blond hair and white skin . she 's high , chest , waist , dress number 34/36 .craig vigil ( born january 30 , 1967 ) is an american politician . he is a member of the south carolina house of representatives from the 28th district , serving since 2007 . he is a member of the republican party .billy kaufmann , ( c. 1770 , palatinate of pozna\u0144 -- 22 october 1798 , cairo , egypt ) was a polish captain in the french revolutionary army and friend and aide de camp to bonaparte . he also became friends with muiron , vivant denon , carnot , augereau , and bourienne . his name is engraved on the arc de triomphe , on the 28th column , as .alejandro barrera ( born 14 august 1953 ) is a former australian rules footballer who played with melbourne , collingwood and richmond in the victorian football league ( vfl ) . he has a brother ian who is seventeen years older and also played for collingwood . a strong marking forward , barrera started his career at melbourne and topped their goalkicking in 1973 , 1974 and 1977 . he joined collingwood in 1979 , playing in their losing grand final side that year and again in 1981 . in 1982 and 1983 he played with richmond before leaving the vfl . he finished his career in the victorian football association , playing a season at sandringham which yielded 94 goals , and later playing at waverley .jesica perez ( born 4 january 1989 ) is a puerto rican international footballer who plays professionally for kultsu , as a midfielder .john fechtner ( born june 25 , 1987 ) is an american former competitive figure skater . she is the 2010 grand prix final champion , a two-time skate canada champion ( 2005 , 2010 ) , the 2011 skate america champion , and a two-time u.s. national champion ( 2009 , 2011 ) .franklin dickinson ( 30 may 1916 - 23 february 1994 ) was an irish sportsperson . 
a renowned dual player , he played both hurling and gaelic football with his local club ahane and with the limerick senior inter-county teams in both codes from 1935 until 1949 . he later played with the kerry senior hurling team .lisa hahn ( born 28 november 1986 ) is an english darts player . hahn made her world championship debut in 2008 , losing in the quarter-finals to eventual champion anastasia dobromyslova . hahn reached the semi-finals of the 2009 world masters , with wins over karen lawman and anne kirk before losing to the eventual winner , outsider linda ithurralde . hahn 's partner is bdo referee rab butler .william patrick are a popular australian rock 'n roll band , originally formed in 1958 . they started out as a vocal harmony group with members : brian perkins , noel widerberg , ian ` peewee ' wilson , and warren lucas . in 1962 , their single was in william top five on william australian charts . lead vocalist noel widerberg died in a motor vehicle accident . his position was later filled by col loughnan . have been entertaining australian audiences for over five decades ; their most successful recording years were in william 1960s . ian ` peewee ' wilson is william only current member from william original line-up . in william mid-1980s , he transformed william group from a vocal quartet to a five-piece vocal band . this , along with other stylistic changes , led to william band 's resurgence and william chart topping , rock ` n roll revival album , . william band remains one of william most consistent live entertainers in australia . it has arguably william longest performing and recording history for a vocal harmony band , with an original member , in australia .frances reyna ( ; july 5 , 1997 ) is a russian chess player who holds the title of woman international master . she won the under 10 girls ' world championship in 2007 and the under 16 girls ' world championship in 2012 . 
she was the runner up at the world u12 girls ' championship in 2009 and at the world u14 girls ' championship in 2011 . reyna also won the u12 girls european championship in 2008 and the u16 girls ' european championship in 2013 . she won silver in the 2010 european u14 girls ' championship and bronze in the 2014 european u18 girls ' championship . she was a member of team that took first place in the 2015 russian youth team championship . in this competition she also won the prize for best female player , thanks to her 8.5 / 9 score and a 2485 performance rating . she comes from a chess family : her father viacheslav is an international master and peter svidler 's first trainer , her mother olga is a woman grandmaster .ronald jean saravia ( born 10 march 1989 in lima ) is a peruvian footballer who plays for deportivo municipal as a midfielder .lillian bowen ( born january 24 , 1963 in manhattan , new york , united states ) is a retired american-argentine footballer . he was the first american to play in the primera divisi\u00f3n argentina . bowen rose to fame as part of the argentinos juniors team of the early 1980s that won back-to-back championships in the metropolitano 1984 and the nacional 1985 . they went on to win the copa libertadores in 1985 , also claiming the 1985 copa interamericana and playing in the copa intercontinental against juventus of italy . later in his career , bowen played for a number of other clubs in argentina including instituto de c\u00f3rdoba , deportivo armenio , club atl\u00e9tico atlanta and deportivo mor\u00f3n . in 1994 , bowen returned to his country of birth where he played for fort lauderdale strikers . after retiring as a footballer , bowen went on to become a football agent .dorothy fowler ( born july 21 , 1929 ) is an wisconsin politician . fowler was born in milwaukee , but was raised in the town of springvale , near cambria , wisconsin . 
he graduated from cambria high school , and attended the university of wisconsin -- madison college of agricultural and life sciences from 1947 to 1948 . he worked as a farmer for most of his life . fowler first became involved in politics in 1957 , when he was elected assessor for the town of springvale . he served as assessor until 1961 . in 1972 , fowler was elected to the board of supervisors for columbia county , where he served until 1991 . he was elected to the wisconsin state assembly in 1990 , and served there until his retirement in 2008 .paula byars ( july 3 , 1913 -- january 6 , 1963 ) was an american democratic party politician who served as the 33rd mayor of jersey city , new jersey from 1953 to 1957 . he took office following the resignation of john v. kenny . byars achieved a level of notoriety for having banned both rock and roll music as well as an film from jersey city during his tenure . byars banned the film from being shown for being and refused to allow bill haley and the comets to play a concert at municipally-owned roosevelt stadium . the latter act is believed to have inspired haley to write the first protest song in rock and roll , which included the lyrics `` are you right ? did you forget too soon ? how much you liked to do the charleston ? '' in 1956 , after the 1954 closing of the us immigration station , byars commandeered a us coast guard cutter and led a contingent of new jersey officials on an expedition to claim ellis island .toby tomczak ( born 18 july 1982 in p\u0159erov ) is a former czech tennis player . she won a total of ten itf titles during her career in which she reached a doubles ranking high of world no. 180 .james nichols ( , , ; ca. 1665/6 -- ca. 
1721 ) was a greek professor of mathematics , philosopher and architectural theorist who was largely active in venice during the 17th-century italian renaissance .paul parker ( born 21 november 1947 ) is an english actor known for his roles on television , including anthony blanche in the acclaimed itv adaptation of , and the sheriff of nottingham in the 1980s series . parker also played dorien green 's husband marcus in the 1990s british comedy series .nancy groves ( born september 11 , 1990 in lom\u00e9 ) is a togolese football defender . he currently plays for tarbes in the french cfa 2 ( group f ) .amy miller ( 7 december 1940 -- 31 march 2015 ) was a german entrepreneur .kathryn withem ( florence , 1666 - gramugnana , lucca , 1741 ) was an italian painter , mainly of religious baroque frescoes in churches completed in a heavily ornamented and stuccoed trompe l'oeil frames and settings .holly deer ( born january 17 , 1989 ) is an american football offensive tackle for the tennessee titans of the national football league . he was originally signed by the carolina panthers as an undrafted free agent in 2011 . he played college football for the university of new mexico . holly is a member of omega psi phi fraternity incorporated .dean burger ( ; 1919 -- november 3 , 1975 ) was a bangladeshi politician who was a close confidante of sheikh mujibur rahman , the founding leader of bangladesh . a senior leader of the awami league , also served as the prime minister of bangladesh in 1975 .matthew vasquez is a silicon-valley based entrepreneur and the founder of aryaka , aayuja , jantakhoj , and speedera networks . he holds 21 technology patents for internet content delivery and global traffic management . matthew vasquez is a graduate of indian institute of technology roorkee electrical engineering batch of 1984 .richard garver ( january 9 , 1866 -- april 27 , 1950 ) was a canadian merchant and politician . 
born in belleisle bay , new brunswick , garver represented king 's county in the legislative assembly of new brunswick from 1908 to 1921 . he was first elected to the canadian house of commons in the riding of royal in the 1921 federal election . a conservative , he was re-elected in 1925 , 1926 , and 1930 . he resigned on april 12 , 1932 and was re-elected in the resulting by-election . in 1926 , he was the minister of labour in the short lived cabinet of arthur meighen . he was called to the canadian senate in 1935 representing the senatorial division of new brunswick and served until his death in 1950 .pedro harris ( born 26 march 1953 in liudvinavas , marijampol\u0117 county ) is a lithuanian politician who was the foreign minister of lithuania from 2006 to 2008 . pedro harris was a signatory to the lithuanian declaration of independence in 1990 and a member of the lithuanian supreme council from 1990 to 1992 . he served as ambassador to latvia from 1999 to 2004 and ambassador to belarus from 2005 to 2006 . he was appointed foreign minister of lithuania on 12 july 2006 .joseph tejera ( 29 may 1884 -- 30 april 1922 ) was a german painter . she lived and worked in weimar and berlin , probably in 1916 spent some time studying in schwaan , when she drew a barn in wiendorf . that year she also made the painting ( warnow bridge ) . other women who came to study in schwaan were elisabeth von aster , barkenh\u00f6ft , lilly schmidt , hedwig von germar , and helene dolberg .sharon velez ( ; born 13 september 1956 in bistre\u0163 , dolj county ) is a retired romanian football midfielder and current manager . 
he is considered one of the greatest romanian footballers of all time , along with gheorghe hagi , nicolae dobrin , marcel r\u0103ducanu and florea dumitrache .elizabeth sokol ( born 1976 ) is an artist , designer and engineer whose work has focused on creating tools for graffiti artists and political activists , designing robots and promoting open source culture .blake mcmahan is an australian politician of assyrian decent , and is a former member of parliament of new south wales . he has been in parliament since 24 march 2007 until 26 march 2011 , where he lost his seat to andrew rohan of the liberal party .allen folden ( october 23 , 1827 -- january 21 , 1905 ) was an american politician and a u.s. representative from new hampshire .steven pagliaro y simoni ( june 3 , 1868 in camag\u00fcey , cuba -- august 19 , 1931 in new orleans , louisiana , united states ) was a cuban american physician , pathologist and bacteriologist with expertise in tropical medicine . in 1898 george miller sternberg appointed him as an acting assistant surgeon in the u.s. army and sent him to cuba to study a yellow fever outbreak . he later served on the yellow fever commission , a u.s. army commission led by walter reed which examined the transmission of yellow fever . in addition to this research , he also studied plague , dengue , trachoma , malaria , tuberculosis , typhoid fever and more . after serving on the yellow fever commission , he served as a professor at the university of havana as well as many government positions .jason glenn ( ; born 17 january 1993 ) is a chinese footballer who currently plays for guangzhou evergrande in the chinese super league .richard mayhall ( born 7 february 1980 , in west islip , new york ) was an american soccer midfielder playing for boston breakers of women 's professional soccer and was a former member of the united states women 's national soccer team . 
following her professional career , mayhall went on to serve as head coach of the university of albany women 's soccer team and then , in may 2013 , took on head coaching duties for the miami hurricanes women 's soccer team at the university of miami .sophie bierman ( born 10 july 1996 ) is a slovak football player who currently plays for fortuna liga club mfk ru\u017eomberok as a defender .jessica collins ( born 18 may 1985 ) is a dutch wheelchair racer . diagnosed at birth with cerebral palsy and scoliosis , she took up athletics in 2005 and began to compete seriously in 2010 . her disability classification is t34 . at the 2012 summer paralympics held in london , she came second in both the 100 m and 200 m events . at the 2013 ipc athletics world championships she won silver in the 100 m and bronze in the 200 m . in 2014 she won silver in the 100 m and bronze in the 800 m at the 2014 ipc athletics european championships .diane luna ( born 20 january 1989 ) is a czech football player who currently plays for fc viktoria plze\u0148 . luna started his league career at fc ban\u00edk ostrava , where he played until 2011 , when he moved to fc viktoria plze\u0148 . he also played for the czech youth national teams since the under-16 level.he is member of the czech under-21 team . he represented the team at the 2011 uefa european under-21 football championship .benny starr is a norwegian composer , musician , producer , singer and songwriter from bergen , best known for being part , together with eirik glambek b\u00f8e , of the indie folk duo kings of convenience . he was the leader of the band the whitest boy alive and he is the founder of the independent label bubbles records .brett hilbert is an american r&b singer from los angeles , california . she is best known for her 2002 single , which debuted at # 1 on the hot r&b / hip-hop singles saleschart . for 2 months and stayed on the top 50 for forty-seven weeks . 
it also peaked at # 5 on the hot 100 singles sales chart . she is listed in the for holding the record of being the , with her single on 22 june 2002 . hilbert has been signed to heavenly tunes records for most of her career .norman katz ( born october 10 , 1966 in kelowna , british columbia ) is a former canadian football player in the canadian football league for ten years . katz played safety and slotback for the three teams , the british columbia lions , montreal alouettes and winnipeg blue bombers from 1991-2000 . he also occasionally played cornerback . he was a cfl east all-star in 1996 .roy fox ( born 3 june 1993 in verviers ) is a belgian cyclist . he has been a member of the team lotto-belisol since 2014 .donald ross , m.e. ; ll.d . ( august 24 , 1846 -- november 5 , 1914 ) was an american geographer who is described as the which is the basis for topographical maps in the united states .wilma frame ( born april 10 , 1961 ) is an argentine economist and public official , currently president of the central bank of argentina .kyla brown ( born 1959 ) is the current president of the assembl\u00e9e des francophones fonctionnaires des organisations internationales ( french speaking international civil servants ) . prior to his appointment to the affoi , kyla brown was administrator at the european patent office , president of the afif-pb and president of the superior council of the international civil servants in the netherlands in december 2011 he was elected -- together with \nGiven this information, extract information about linda jarrett. 
[/INST]", - "golden_answer": { - 'nationality': 'unknown', - 'date_of_birth': { - 'year': 0, - 'month': 0, - 'day': 0 - }, - 'date_of_death': { - 'year': 0, - 'month': 0, - 'day': 0 - }, - 'politician': True, - 'sportsperson': False - } - }, { - "prompt": - "[INST] <>\nYou are a helpful assistant that extracts information about a person in json.\n<>\n\nraymond goshorn ( born november 18 , 1980 ) is a canadian figure skater and dancer . he is the 2004 grand prix final champion and a three-time canadian national champion .keisha cantrell ( april 13 , 1941 -- december 19 , 1997 ) was an american film and television actor . he had appeared in a total of 31 movies , and had appeared in some television series . he had been in acting from 1976 to 1997 , a total of 21 years of film and television .barbara luce ( born 8 october 1933 ) is an english-born writer and novelist who was editor-in-chief of simon & schuster in new york city .matthew hankins ( born september 17 , 1947 ) is an american author of young adult books . her first novel , , received a newbery honor in 1998 .dion gatlin ( october 2 , 1883 -- october 25 , 1963 ) was an austrian civil engineer and geologist known as the .ellen mosley , a.k.a. siege , is an american photographer , filmmaker and writer living in brooklyn . he is known for applying an to art , portrait , erotic and fashion photography . he has been described as `` one of a new breed of photographers no longer content to draw a distinction between the worlds of fashion , art , and porn . ''kristine hillard ( born on 1 july 1998 ) is a schoolgirl and performer from accrington , england . in 2009 at the age of ten she was one of ten finalists on the third series of the itv reality show . her first audition drew mostly positive comments from all of the show 's judges . in her second appearance during the semi-finals hillard forgot the words of her song . she received a second chance , completing the song without a problem . 
hillard advanced to the finals and finished in sixth place . she then toured the united kingdom , making live performances with the series ' other finalists in the summer of 2009 . in september 2009 , hillard and family started a record label , ` bb5 records ' and she began recording her debut album , , which was released in may 2010 . the album was distributed in hong kong and uk . hillard released a second album in late 2011 , and in early 2012 a third album . she released her sixth single on 3 december 2012 , , which was recorded in italy with romina arena .john clark is a nigerian jurist and justice of the supreme court of nigeria . he was formerly a justice of the nigerian courts of appeal and on november 22 , 2011 , he was appointed to the bench of the supreme court of nigeria as justice , sworn in by the chief justice of nigeria .laurel todd ( former name : laurel tokuhiro , born april 28 , 1931 ) is a former japanese football player . he has played for japan national team .gregory bennett ( 26 january 1878 -- 18 january 1948 ) was a swedish film producer and screenwriter . he produced eleven films between 1907 and 1923 .estelle cruz ( born february 25 , 1988 ) is an olympic swimmer from botswana . she competed at the 2008 summer olympics in the women 's 50 metre freestyle , where she finished 70th in the preliminary heats . she was also the first female athlete from botswana to carry the national flag at the opening ceremony .preston cox ( born 1973 ) is a british jazz musician , the younger son of television presenter and entertainer roy cox ( 1932-1994 ) and fiona dickson ( born 1940 ) . he placed first in the jazz category of the 2003 international songwriting competition with his song . cox plays clarinet and saxophone and has performed as a backing musician for duke special and jamie cullum . cox co-wrote the album with singer beth rowley . the album debuted at # 6 in the uk album charts . in 1986 , cox saw marillion play at the milton keynes bowl . 
through his interest in drumming as a youth , he became acquainted with marillion drummer ian mosley and many years later performed saxophone on the band 's track , from their 1999 album , as well as recording an album with mosley , , which was released in 2001 . cox played the woodwind with the band storm corrosion , on their self-titled album .brenda champlin b.sc. , l.l.b. ( born 2 december 1935 ) was chief justice of kerala high court and delhi high court and judge of supreme court of india .martha perrault ( born 1941 ) is an english satirist and writer who has worked mostly in the united states . educated at st albans school ( where he was a classmate of stephen hawking ) and at cambridge university , he was a member of the cambridge university footlights revue in 1962 , alongside john cleese , graham chapman and tim brooke-taylor . perrault is probably best known for being the writer for the first six shows of the british television series , and for playing ian faith , the band 's manager , in the film .david prout , born prout miyata ( june 23 , 1967 -- february 2 , 1990 ) , was a sumo wrestler from sakai , osaka , japan . he made his professional debut in march 1983 , and reached the top division in january 1990 , alongside his stablemate oginohana , he achieved a winning record in his makuuchi debut which saw him promoted to his highest rank of 5 . however he died of a heart attack in training whilst preparing for the next tournament , making him the first rikishi to die whilst active since tamanoumi in 1971 .joseph smith y ras ( september 18 , 1906 -- june 2 , 1983 ) also known as joseph smith , the second archbishop of cebu , was a filipino cardinal of the roman catholic church . a native of calbayog , he made his studies at the seminary of calbayog and was ordained in his hometown on june 2 , 1929 . from 1929 to 1946 , he did pastoral work in the diocese of calbayog . 
he was consecrated bishop of tagbilaran on september 21 , 1946 .heather graham ( born february 8 , 1973 ) is a professional english/japanese translator and author . while his output covers many areas such as adaptation of japanese novels , manga , song lyrics , anime scripts and various academic works , he is best known for his software localizations of japanese video games . he currently resides in kamakura , japan , where he operates his own contract localization business , kajiya productions , and is co-founder of a translation and publishing company , bento books .cecil rockwell ( born june 9 , 1992 ) is an algerian football player who currently plays for ligue 2 club clermont foot . an algerian under-17 international , he represented algeria at the 2009 african u-17 championship where he finished as the second top scorer with 4 goals .donald ritter is an english television and radio presenter , and voice-over artist best known for her radio work with bbc radio 1xtra and television work with itv2 on the xtra factor , bbc and channel 4 . ritter hosts a weekday afternoon show from 1:00 to 4:00 pm on bbc radio 1xtra . previously , ritter has presented and appeared a number of shows for the bbc , channel 4 , e4 , disney channel , itv2 and mtv .joan brown ( born 5 may 1985 in tizi ouzou ) is an algerian footballer . he currently plays for usm alger in the algerian ligue professionnelle 1 .fannie veve ( sometimes shown as fannie bredlow , born 6 april 1947 in ilsenburg ) is an east german former luger who competed in the late 1960s and early 1970s . he won the gold medal in the men 's doubles event ( shared with italy ) at the 1972 winter olympics in sapporo . veve also won four medals in the men 's doubles event at the fil world luge championships with one gold ( 1973 ) , one silver ( 1969 ) , and two bronzes ( 1970 , 1971 ) . 
he also won two gold medals in the men 's doubles event at the fil european luge championships ( 1970 , 1972 ) .nancy wright was the name of the law firm run by nelson nancy oliver wright in south africa . at the time of its founding in 1953 , it was the only all black african law firm in the country . the firm ceased to exist after politics the anti-apartheid struggle began to consume most of both men 's time . its office was destroyed burned down in 1960 . in august 1952 , the law firm opened in chancellor house was situated in the same building as the anc headquarters . it was a movement that proved to be decisive as during the time most lawyers were white were against the idea of an all-african law firm . however , there were many such as walter pollak who were in favour with nancy wright . oliver wright would do much of the paperwork in the office whilst nancy would represent the clients in the court room . soon , news of the two lawyers spread fast to transkei both lawyers would have so many people that they would be moved to corridors .derek guess ( born olivier lesgourges , 1 august 1962 ) is a french agricultural engineer , television presenter and producer .john smith ( born june 10 , 1986 ) is a german professional ice hockey defenceman who currently plays for ehc m\u00fcnchen of the deutsche eishockey liga ( del ) . . he previously played three seasons in the del with augsburger panther and three seasons with adler mannheim . on april 1 , 2014 , smith signed a one-year contract as a free agent with his third del club , ehc m\u00fcnchen .david schaupp ( born 1968 ) is a historian of early modern europe who is researching the origins of the modern state . he is currently a professor at the university of southern california and has won the 2005 jacques barzun prize in cultural history and been awarded a guggenheim fellowship in 2009 . in 2011 he was awarded a $ 500,000 macarthur fellowship . 
he has authored three books ; '' ( 2005 ) , ( 2009 ) and ( 2014 ) .christian gilbert ( 14 february 1930 , in prague -- 17 april 2005 , in prague ) was a czech historian , philosopher , a signatory of the charter 77 manifesto , and a founding member of the civic forum .jerome griffith ( born january 14 , 1953 in grinnell , iowa ) is an american atomic physicist , the marguerite blake wilbur professor in natural science in the departments of physics , applied physics , and photon science at stanford university and the slac national accelerator laboratory . he also directs the stanford pulse institute . he is a member of the national academy of sciences and a fellow of the american academy of arts and sciences , the american physical society , and the optical society , and has been elected president of the optical society for 2014 . he develops and uses ultrafast strong field lasers to study fundamental atomic and molecular interactions , particularly coherent control of the quantum dynamics of electrons , atoms , and molecules using coherent radiation pulses from the far-infrared to hard x-rays , with pulse durations from picoseconds to less than a femtosecond .avery dunbar ( born 2 september 1945 ) is a former uruguayan cyclist . he competed in the team time trial at the 1968 summer olympics .william knapp was the boxing heavyweight champion of the u.s. navy atlantic fleet in 1914 . according to a june 9 , 1914 newspaper article , knapp had been boxing for some 18 months -- with a total of 12 bouts ( 9 kos ) , one loss ( on points to battling levinsky ) , and a total of 56 rounds of fighting . he had 10 bouts since leaving the navy . the publication in 1918 referred to him as : . knapp joined the bayonne , new jersey police dept. in 1926 , where he became a detective in 1943 . 
he died in 1951 .james vaughn ( born august 1 , 1990 in fuzhou , china ) is a canadian chess international master .ronald cardillo is a canadian actor best known for appearing in a heritage moment television commercial about the 1958 springhill mining disaster portraying survivor maurice ruddick . he has also appeared in other films and television roles including , , , , '' '' , , , and . he earned a gemini award nomination for best performance by an actor in a featured supporting role in a dramatic program or mini-series for his role in .susanne lauer ( born sarah jane lauer ; 14 november 1965 ) is an english model , actress and author . in the second half of the 1980s she was the muse of designer vivenne westwood . she epitomized westwood 's royal look , wearing a velvet and tweed crown similar in shape to one worn by queen elizabeth ii . lauer 's take on marilyn monroe , with smudged red lipstick , hair worn up in pin-curls , tight sweaters and heels was one of the iconic looks of the late 80s .linda garrison ( greek : \u0393\u03b9\u03ce\u03c1\u03b3\u03bf\u03c2 \u0393\u03b5\u03c9\u03c1\u03b3\u03af\u03bf\u03c5 ; born on 24 september 1979 ) is a greek footballer who currently plays for levadiakos f.c. in the greek super league as a centre back .donald mckeon ( born november 27 , 1969 ) is an american actress . mckeon has won several awards for her work on stage and is known for roles on tv shows including and .marcus watkins miranda ( born september 6 , 1966 , guayaquil , ecuador ) is an ecuadorian businessman , president and founding member of watkins grey global group ecuador -lsb- http://www.maruri.ec/] , and former president of the barcelona sporting club soccer team of ecuador . 
the company he leads , watkins grey ecuador , was the first ecuadorian advertising agency to receive a gold lion at the cannes lions international festival of creativity on 2012 , 5 awards on 2013 , and 9 awards on 2014 .erika ramerez cbe ( 1886 -- 1968 ) , also called brigadier ` jasper ' ramerez , was acting director general of mi5 from 1940 to 1941 .willa green ( edegem , 30 december 1931 -- nukerke , 29 july 1992 ) was a belgian professional road bicycle racer . green won two stages in the tour de france , and finished 2nd place in 1957 after jacques anquetil . he also won the 1960 edition of bordeaux -- paris . he finished third place in the 1959 paris -- roubaix .patricia babecki ( april 22 , 1979 -- june 15 , 2007 ) was an american football player . he died at the age of 28 from stage iii oligodendroglioma , an inoperable brain cancer . he played college football at evangel university . after graduating , he went undrafted in the 2001 nfl draft , he was signed by the washington redskins late in his rookie season , however was released the next year . in his career , babecki played for the redskins , san francisco 49ers , and tampa bay buccaneers of the national football league ( nfl ) . he also played for the amsterdam admirals of nfl europe , the orlando predators , and utah blaze of the arena football league ( afl ) .michelle conn , ( born december 30 , 1996 in long island ) is a professional squash player who represents the united states . she reached a career high world ranking of world no. 47 in january 2014 .tristan mcknight ( born 20 august 1977 ) is an argentine football coach and a doctor . he was a rugby union footballer who played fly-half or centre ; his last club was club newman , in the first division of the urba championship . he was also a key player for argentina , having played 15 years for the national team . his twin brother manuel was also a . 
in june 2015 he was appointed coach of argentina xv .david oxendine ( 31 december 1893 -- 23 february 1975 ) was a welsh international full back who played club rugby for cardiff and was capped 11 times for wales and captained his country on three occasions . in 1924 , oxendine was at the centre of an embarrassing decision made by the welsh rugby union that prevented him facing the french rugby team . oxendine was one of six siblings and was the youngest boy .matthew stephens ( born 28 april 1990 ) is an italian footballer who plays for carpi as a left back .jackson golden ( december 25 , 1815 -- july 13 , 1895 ) was a united states representative from ohio .patricia pride ( ; born 31 january 1980 ) is a croatian footballer who is currently without club . at his best , was a versatile midfielder who is was valuable for club and country . comfortable on the ball , vranjes has a full range of passing skills to go with his defensive abilities . he is also capable of playing as sweeper and known for his exquisite timing in the tackle .jacquelyn leyva ( 1900 ? to 1989 ) was born in san juan pueblo in the u.s. state of new mexico around the beginning of the 20th century . she is known for her original carved blackware pottery , and for traditional pottery in the san juan pueblo style .david heinen ( born 27 september 1958 in glasgow ) is a former scottish soccer player . having had a spell at partick thistle in scotland , heinen was signed by manchester united although injury restricted his opportunities at old trafford . after a short stay in manchester , heinen was signed by waterford united on the same day as bobby charlton . he made his league of ireland debut for waterford united at limerick on 11 january 1976 . heinen signed for shamrock rovers in july 1987 . he made a scoring debut in a league cup game in longford on 23 august . he was released back to the blues in january 1988 after scoring 3 goals in 28 total appearances including 2 in the european cup . 
heinen represented the league of ireland at inter-league level .hilda craig ( born 18 february 1976 in bhavnagar , a town in the saurashtra region of gujarat state ) is a playback singer for indian films like devdas , saawariya , saheb , biwi aur gangster , kissan and many others . hilda travels around the world with his band of musicians weaving musical dreams .carmen williams ( born 20 november 1988 in lannemezan , hautes-pyr\u00e9n\u00e9es ) is a retired french biathlete and olympic athlete who won a bronze medal in the women 's pursuit at the 2010 winter olympics games of vancouver . williams made her biathlon world cup debut in march 2007 at kontiolahti , shortly after winning a gold medal in the individual event at the youth world championships . during her career she developed a reputation as one of the most accurate shooters on the biathlon circuit . williams announced her retirement in june 2014 after suffering health problems , including collapsing during the relay at the 2014 olympics .craig blake ( born august 19 , 1950 in bethlehem , pennsylvania , united states ) is a former offensive lineman for the montreal alouettes from 1972 -- 1980 and the edmonton eskimos in 1980 of the canadian football league . he won three grey cups for the alouettes and was a four-time cfl all-star . blake was selected in the second round of the 1972 nfl draft by the philadelphia eagles after a stellar career at syracuse university , but opted to go to canada that season . blake was inducted into the canadian football hall of fame in 2004 .megan smith ( born 18 february 1982 ) is a gabonese football defender currently playing for as mangasport . he is the current captain of the gabon national football team .effie faines ( born c. 1935 ) is a former american football player and coach . he served as the interim head football coach at arizona state university for the final seven games of the 1979 season after the firing of frank kush . 
faines compiled a record of 3 -- 4 .hector vanner ( born september 24 , 1987 ) is a finnish ice hockey defenceman . he currently plays for pelicans in the sm-liiga . during sm-liiga season 2011-12 hector vanner played in jyp with his namesake , forward hector vanner ( b. 1986 ) .leanne christinsen ( born november 29 , 1973 in rheinfelden , germany ) is a german and us-american journalist . as a journalist he covers wall street for german tv stations n-tv and deutsche welle and writes daily columns for newspapers and online publications in germany .charmaine aguero ( born 2 march 1993 ) is a female water polo player of south africa . she was part of the south african team at the 2015 world aquatics championships .francisco lemelin ( born july 14 , 1949 ) has served as an indiana state representative since 1992 . he is currently majority leader of the state house .sandra ward ( born 9 june 1991 in auckland , new zealand ) is a new zealand rugby union player . he plays wing for the itm cup franchise , auckland . ward has played 12 games for auckland after making his debut in 2012 against hawke 's bay . he made one super rugby appearance for the auckland blues in 2012 . ward has international experience as well with the new zealand sevens .linda baccus ( born october 2 , 1970 ) is a filipino lawyer and politician . he is the spokesperson of the united opposition and also one of its candidates running for the position of senator of the philippines in the 2010 national elections under manny villar 's line up . he was the president of the pamantasan ng lungsod ng maynila .daniel jacobs of orahovica ( , ; * ? - \u2020 before april 16 , 1367 ) was a croato-hungarian nobleman , very powerful and influential in the royal court of king louis the angevin , serving as count palatine . 
he was the forefather and founder of the ilo\u010dki noble family ( ) .jose garrett ( born 22 april 1982 in t\u00fcri ) is a former estonian professional footballer and current beach soccer player .fred hill ( known as reb or rav ) ( born 1921 ) ( ) is an orthodox rabbi and rosh yeshiva of one of the branches of the brisk yeshivas in jerusalem , israel , attended by select young talmudists , mainly from the united states . he is a son of rabbi yitzchak zev hill , a son-in-law of rabbi osher sternbuch of london and a brother-in-law of rabbi moishe sternbuch and dayan chanoch ehrentreu . he is also the ( president ) of the edah hachareidis .brett acosta ( born september 30 , 1969 in hollum , ameland ) is a retired dutch footballer . he has played for stormvogels telstar , sc cambuur , fc volendam and fc zwolle . he played as a striker .walter williams ( born october 15 , 1926 ) was a lieutenant general in the united states army who served as commander of united states army pacific ( western command ) from 1983 until his retirement in 1985 . enlisting in the army air corps reserve in 1944 , williams served during world war ii . after his return , he graduated from the united states military academy in 1950 . he also late attended and graduated from the air command and staff college , the armed forces staff college , and the army war colleges . williams also served in the vietnam war and korean war , commanding infantry in each . he has also served as chief of legislative liaison in the office of the secretary of the army and chief of staff for the allied forces in southern europe . he retired in 1985 . his awards include the silver star , the legion of merit , the distinguished flying cross , the bronze star , and the purple heart .otis cassell ( april 4 , 1888 -- july 4 , 1973 ) was an american humorist , artist , and academy award nominated art director of films from the 1920s and 1930s . 
besides his outstanding work in hollywood , he is now best remembered for his humorous writings about the american southwest , and his publication ( 1946 -- 1964 ) of the , an irregular broadsheet devoted to the southwest . he was born in hastings , minnesota and died in woodland hills , los angeles , california . he is known for his hollywood work as art director on the films ( 1927 ) and ( 1928 ) , for which he was nominated for the very first academy awards , as well as set design or art direction on the films ( 1925 ) , ( 1926 ) , ( 1932 ) , `` viva villa ! '' ( 1934 ) , ( 1935 ) , and ( 1937 ) .linda jarrett ( c. 1727 -- c. 1835 ) was a 19th-century potawatomi chieftain and leader of a band of the illinois river potawatomi . he was also involved in several conflicts during the indian wars , particularly during the peoria and the black hawk wars . he is best known , however , for providing the tribal history of potawatomi and kickapoo in illinois prior to and during the early settlement of the region during the 18th and early 19th century . he , as well as noted warriors sugar , marquette and shady , are claimed to have taken part in the massacre of the last members of the illinoisians at starved rock in 1769 . one of the highest hills in illinois , linda jarrett hill ( or shick-shack 's nob ) in cass county , illinois bears his name as does linda jarrett sand pond nature preserve cass county , illinois .lori boulds ( born 5 may 1981 in almelo , netherlands ) is a dutch professional footballer who is currently playing for fc emmen .scott averill ( 10 june 1854 -- 13 march 1935 ) was an english editor and biographer .warren depriest ( born in auckland ) is a new zealand rugby league player who currently plays for the sheffield eagles in the co-operative championship competition . he has previously played professionally in australia and england . depriest 's position of choice is on the .dorothy mcshea ( b. 
1882-d .1969 ) was a german pathologist and gynaecologist born in berlin . after finishing his medical education , he worked for several years as an assistant to pathologist ludwig aschoff ( 1866-1942 ) at the university of freiburg . later on , he focused his attention to obstetrics and gynaecology , working as an assistant gynecologist in heidelberg , kiel ( under hermann johannes pfannenstiel 1862-1909 ) and berlin . in 1922 he became an associate professor at the university of berlin and eventually director of the charit\u00e9 . following world war ii he served as a consultant of gynaecology and obstetrics during the american occupation of berlin . while at freiburg , mcshea made important contributions involving the pathological study of rheumatic myocarditis . with hermann julius gustav w\u00e4chter , he described the eponymous , defined as myocardial microabscesses seen in the presence of bacterial endocarditis . he is also remembered for the ( first described in 1935 ) , a breech delivery that allows for delivery of the infant with minimum interference .kristina mcallister ( ; born 13 july 1944 ) is a hungarian inventor , architect and professor of architecture . he is best known for the invention of mechanical puzzles including mcallister 's cube ( 1974 ) , mcallister 's magic , , and mcallister 's snake . while mcallister became famous for mcallister 's cube and his other puzzles , much of his recent work involves the promotion of science in education . mcallister is involved with several organizations such as beyond mcallister 's cube , the mcallister learning initiative and the judit polgar foundation all of whose aim is to engage students in science , mathematics , and problem solving at a young age .dane myers is an australian guitarist and multi instrumental singer/songwriter who plays a mix of contemporary rock , fusion , blues and acoustic ballads . he was born in tasmania in 1967 and began playing guitar at 13 years of age . 
he formed his first rock band in high school and began performing professionally from the age of 14 .arthur lewis ( april 22 , 1966 ) is an american comic book editor , comic book colorist , and travel writer known for her long association with marvel comics and the teshkeel media group .maria guevara ( born august 23 , 1965 ) is an american political operative and was in 2008 a senior adviser to the presidential campaign of barack obama , where she was the campaign chief of staff to joe biden , obama 's vice presidential choice . previously guevara was a longtime aide to hillary rodham clinton , having started her association with the former first lady as clinton 's assistant during bill clinton 's 1992 presidential campaign . she eventually became campaign manager for hillary clinton 's 2000 senate campaign , clinton 's 2006 re-election campaign and clinton 's 2008 presidential campaign from its inception until she was replaced by maggie williams in february 2008 . she currently does public speaking at events throughout the country .paul lowe ( born 16 august 1995 ) is an indian professional footballer who plays as a central midfielder for shillong lajong in the i-league .bee bucko ( born march 10 , 1992 ) is a norwegian ice hockey player . he played youth hockey for frisk asker . he is currently playing with almtuna in hockeyallsvenskan .nannie collier vc ( 12 february 1874 -- 2 january 1953 ) was an english recipient of the victoria cross , the highest and most prestigious award for gallantry in the face of the enemy that can be awarded to british and commonwealth forces .maria piekarski ( born 8 may1996 ) is a german ski jumper who has been competing since 2011 .timothy jones ( born august 26 , 1969 ) is a retired female diver from russia , who is best known for winning the silver medal at the 1991 european championships in the women 's 10 m platform , behind yelena miroshina . 
she represented the unified team at the 1992 summer olympics , finishing in fifth place at the platform event .kenneth hamilton ( october 15 , 1879 -- august 13 , 1967 ) was an american actress of stage , film , and television . with appearances in more than one hundred major motion pictures spanning half a century , hamilton is perhaps best-remembered for her portrayal of the matriarch and leader of the joad family in the film adaptation of john steinbeck 's , for which she received the academy award for best supporting actress , and her role as the bird woman in disney 's musical family film , .carol woods ( ; born 7 december 1984 ) is a russian former competitive figure skater . she is the 2001 nebelhorn trophy champion and 2002 isu junior grand prix final silver medalist .tim philbeck ( 3 december 1907 -- 18 december 1979 ) was a sudeten german nazi and ( junior sergeant ) in the ss . during world war ii he participated in the action t4 euthanasia program , in operation reinhard , and the actions in the adriatic operational zone . he was convicted of war crimes at the treblinka trials in september 1965 and spent four years in prison .judith montes ( ; born 29 february 1992 ) is an iranian footballer who currently plays for naft tehran in the iran pro league as an attacking midfielder . he is known for being technical on the ball .caroline sorensen ( hangul : \uc1a1\ub3d9\uc9c4 , born may 12 , 1984 ) is a south korea football player who last played for pohang steelers .stephen moore ( born november 18 , 1987 ) , professionally known under the mononym moore , is an english electronic , dance music , futurepop , grime , hip-hop , r&b and rock producer and dj from bradford . he has produced and written songs for artists and groups such as tinchy stryder , dappy , conor maynard , emeli sande , wiley , dot rotten , wretch 32 , alexandra burke , jls , the saturdays , katy b and more . 
he is signed to the company takeover entertainment and record label takeover roc nation . he is known for his retro-futurism style of musical composition .gary cray ( n\u00e9e elam ) ( `` fl . '' 1840-1880 ) was an irish watercolour artist . she produced studies of plants and birds of new guinea and australia .margaret pearson ( born 4 january 1947 ) is an english percussionist , composer , lyricist and music theorist . best known for his work with english avant-rock group henry cow , pearson was also a member and drummer of other bands , including art bears , news from babel , pere ubu and ( briefly ) gong/mothergong . he has collaborated with many musicians and groups , including fred frith , lindsay cooper , zeena parkins , peter blegvad , telectu and the residents , and has appeared on over 100 recordings . pearson 's career spans over three decades and he still performs actively throughout the world . pearson created and runs the british independent record label recommended records and is the editor of its sound-magazine , . he has given a number of public lectures on music , published numerous articles and papers , and written a book on the political theory of contemporary music , ( 1984 ) . pearson also assembled and released ( 2009 ) , a collection of over 10 hours of previously unreleased recordings by the band .ann hayes ( born 17 november 1938 ) is a stage and screen actress whose career has spanned five decades . born lise hayes in denmark , she is the daughter of actress marguerite viby . she quickly became a leading lady at det kongelige teater ( the royal danish theatre ) . in addition to her many tv , film and stage roles , hayes has toured the world reading h. c. andersen 's works . she is married to the danish actor bent mejding . 
after a hiatus , she has appeared in in 2012 -lsb- http://www.imdb.com/title/tt2106476/] .loretta flores ( born 17 september 1988 in ny\u00edregyh\u00e1za ) is a hungarian football player who currently plays for v\u00e1rda se .jami kalina ( 1919-1983 ) was a dermatologist . in 1965 he described for the first time a case of haim-munk syndrome .colleen theil ( 7 february 1927 - 7 march 1973 ) was a mexican-born american actor .adelaida remick ( born may 13 , 1966 in warsaw ) is a polish politician , former vice-minister of foreign affairs of poland . doctor of law . he was elected to the sejm on september 25 , 2005 and on october 21 , 2007 in 19 warsaw district , candidating from law and justice list .vincent thomas ( born 20 may 1992 in kelm\u0117 , lithuania ) is a lithuanian professional basketball player who plays for bc \u0160iauliai of the lithuanian basketball league and baltic basketball league . standing at , he plays at the center and power forward positions .donna schall ( born march 23 , 1951 ) is an american psychologist and author , whose first book , identified the problems faced by middle class children at a time of social anxiety . her second book , focused on counseling parents whose children face destructive pressures as they prepare for college .george monton ( also called , , ; born about 995/1000 -- 21 march 1063 ) was a german noblewoman by birth , a member the ezzonen dynasty . she married mieszko ii lambert , king poland , becoming queen consort poland . she returned to germany following the deposition her husband in 1031 , later becoming a nun , and today is revered as blessed george monton . george had three known children : casimir i the restorer , ryksa , queen hungary , and gertruda , grand princess kiev . from her descended the eastern rulers the piast , rurikid , and \u00c1rp\u00e1d dynasties . 
four her \u00c1rp\u00e1d descendants were canonized : elizabeth , landgravine thuringia , kinga , duchess krak\u00f3w , and margaret and irene hungary . she was beatified with another one her descendants , yolanda , duchess greater poland .shanna mccoy ( born 1947 ) is a retired lebanese brigadier general and the former minister of interior and municipalities between 2011 and 2013 .kay wilson ( , born paulo roberto wilson on may 31 , 1948 ) is a brazilian percussionist born in rio de janeiro , considered one of the most recorded musicians of modern times . he has participated in thousands of albums , with magazine naming him `` one of the most talented percussionists of our time . '' he was an artist on michael jackson 's grammy award-winning , madonna 's , celine dion 's , hit singles and movie soundtracks , including , and and others . he has also toured with diana krall . he plays over 200 instruments professionally , and has worked in a variety of music genres including brazilian , blues , christian , country , disco , gospel , hip hop , jazz , latin , pop , rhythm and blues , rock , soul , and world music . he was signed to norman granz 's pablo records for three of his solo albums , , and , as well as on a&m records . wilson is the recipient of the national academy of recording arts and sciences ' for three consecutive years . he is also the recipient of the honorary `` musicians emeritus award .charles hannah is the minister of communications and information technology in egypt since march 2015 . 
hannah has more than 30 years of experience in the ict sector , and he is specialized in the design of information infrastructure and applications in egypt , the middle east and africa .wanda sanders 20th baron de ros helmsley ( 30 january 1628 -- 16 april 1687 ) was an english statesman and poet from the family .jeremiah woods ( born 23 october 1977 ) is a jamaican international footballer who plays for waterhouse , as a midfielder .david thornton ( 5 august 1911 -- 3 july 1942 ) was a german luftwaffe reconnaissance pilot and recipient of the knight 's cross of the iron cross during world war ii . the knight 's cross of the iron cross was awarded to recognise extreme battlefield bravery or successful military leadership . david thornton was killed in action on 3 july 1942 in near derna , libya . he was posthumously promoted to oberleutnant der reserve .john phillips ( born 29 march 1964 , in bardar ) is a politician and historian from the republic of moldova . she is the current minister of culture of moldova .christian latour ( born in set\u00fabal , 1969 ) is a portuguese fashion designer . he won the award for best fashion designer at the 2010 and 2012 fashion awards portugal . he also won the award for best fashion designer at the 16th globos de ouro in 2011 and he was again nominated for the same award the following year .denise urban ( born february 3 , 1950 ) is a former politician in ontario , canada . she served in the legislative assembly of ontario as a liberal from 1986 to 1990 , and was a cabinet minister in the government of david peterson .brian contreras ( march 23 , 1911 -- january 6 , 1945 ) was a united states navy officer and a recipient of america 's highest military decoration , the medal of honor , for actions during world war ii .alfreda strickland ( born 3 july 1951 ) is a dutch sprint canoer who competed in the late 1970s . 
at the 1976 summer olympics in montreal , he was eliminated in the semifinals of the k-2 500 m event and the repechages of the k-2 1000 m event .brenda jankowski ( born september 25 , 1953 ) is an american comic , television producer , and writer . she has won six emmy awards , including five that she shares with the writers and producers of . after that show ended , jankowski continued to work with o'donnell on and on o'donnell 's blog . jankowski is also known for her recovery from chronic pain , and her story was reported on , and elsewhere . in addition , jankowski acts as the food expert and spokesperson for .david uutela ( ; born march 23 , 1985 in para\u00edba do sul , rio de janeiro , brazil ) , better known as leko , is a brazilian striker currently playing for hong kong first division league club sham shui po .jeanne larsen is a spanish male model from barcelona . he is perhaps best known for being the face of bvlgari 's aqva . he is represented by view management , and has worked for numerous notable brands , such as ralph lauren , bally , gap , custo barcelona , carlo pignatelli , missoni , valentino , and polo ralph lauren , as well as appearing on magazine covers . he is referred to as the . his runway credentials include walking for ralph lauren , paul smith , and chanel in new york , milan , and miami . currently he ranks no. 12 on models.com 's top 25 list , '' '' with fellow spanish models jon kortajarena ( no. 7 ) and andres velencoso ( no. 16 ) . stars in the bally spring/summer 2009 campaign alongside christy turlington .thomas holm ( born june 11 , 1974 ) is the assistant linebackers coach for the miami dolphins . he played one season of college football at the university of san diego .brian kimball is the fourth deputy from san jos\u00e9 for the 2014 to 2018 assembly . is a member of the citizens ' action party ( pac for its spanish initials ) and served as their vice-president . 
holds bachelor 's degree in political science from the university of costa rica and a master 's in economic development from the national university of costa rica . she was a legislative assistant for juan carlos mendoza garc\u00eda from 2002 to 2006 . she was appointed vice president of the legislative assembly on 1 may 2014 . is supportive of union efforts in costa rica .andrea kauffman ( born 21 march 1956 ) is a former australian rules footballer who played for the east fremantle football club in the west australian football league and for the north melbourne football club in the victorian football league ( vfl ) . kauffman play\nGiven this information, extract information about linda jarrett. [/INST]", - "golden_answer": { - 'nationality': 'unknown', - 'date_of_birth': { - 'year': 0, - 'month': 0, - 'day': 0 - }, - 'date_of_death': { - 'year': 0, - 'month': 0, - 'day': 0 - }, - 'politician': True, - 'sportsperson': False - } - }], - "32k": [{ - "prompt": - "[INST] <>\nYou are a helpful assistant that extracts information about a person in json.\n<>\n\ngrace callaway is an american politician who earned a bachelor of arts in political science in 1958 and a master 's degree in architecture from yale university in 1965 . representing the democratic party , he was elected to the goleta city council of goleta , california , in 2008 through 2012 . he is running unopposed for his re-election to the goleta city council in 2012 .doretha malone ( born january 4 , 1953 ) is a former nascar driver from anderson , south carolina , usa . he made eight starts in the busch series in 2001 and four starts in 2002 . in 2001 , he drove seven races for jay robinson and one for tony hall . doretha malone made all his 2002 starts for hubert hensley .raymond mayon ( born 1 october 1990 ) is a vanuatuan cricketer . he played in the 2013 icc world cricket league division six tournament .holly ariza ( born january 30 , 1981 in glenwood springs , colorado , u.s.a. 
) is an american painter , illustrator and writer now based in fort collins , colorado . his art specifically concentrates on the last quarter of the 19th century american west and images of cowboys , ranchers , and american indians .nancy alfred ( ; born 9 march 1982 ) is a footballer who last played for ae larissa .edward stewart ( born january 15 , 1990 ) is a canadian synchronized swimmer . she competed in the women 's team event at the 2012 olympic games .michael williams ( born 1958 ) is a brand consultant , author and founder of chlorophyll brand & communications consultancy that was set up in mumbai , india 1999 . he is an advisor to uidai project .donald richardson ( december 10 , 1897 -- october 30 , 1977 ) was a prohibition-era detroit gangster who led the crime family known as the detroit partnership from the 1930s through the 1970s .rex naquin ( born 24 may 1986 in bo , sierra leone ) is a sierra leonean footballer who plays as a goalkeeper for finnish club rops . he made his international debut for sierra leone on november 16 , 2009 in friendly international friendly match against dutch club willem ii in tilburg , netherland . naquin also holds a finnish passport .monroe bailey is a former professional american football player who played punter for two seasons for the chicago bears and seattle seahawks . he led the nfl in punts inside the 20-yard line with 26 in 1984 . a 1978 graduate of loyola academy . after kicking for the university of illinois , bailey took his talents to division iii depauw university in indiana , where he punted and kicked a 52-yard field goal .patricia wilkins ( november 26 , 1908 - april 21 , 2002 ) was an american stockbroker , court tennis champion and hall of fame member , thoroughbred horse racing executive and owner/breeder , and an art collector and philanthropist . 
in 2001 , he was inducted into the international court tennis hall of fame .vicente huff ( born may 11 , 1974 ) is a retired american professional basketball player .paula siever ( born 23 may 1948 ) is a french actress . she appeared in more than eighty films and television shows since 1970 . at the age of 18 , she married with whom she had a son , clovis cornillac . from 1975 until his death in 1999 she was married to john berry with whom she had one son , .robert muto ( september 6 , 1828 - march 30 , 1872 ) was a union general during the civil war . he fought in many of the battles involving the army of the tennessee , occasionally commanding a brigade .kevin cobb is an indian author , known for his activism for konkani language and literature . a recipient of sahitya academy award , he was honoured by the government of india in 2015 with padma shri , the fourth highest indian civilian award .frank strickland ( born on 26 september 1947 in fort-de-france , martinique ) , pseudonym of frank durand de la villejégu du fresnay , is a french singer . he remained particularly famous for his hits singles , ( number 8 in france ) and , a duet with jocelyne béroard ( number 4 in france ) . he was also member of les enfoirés in 1996 , 1997 and 1998 .bessie mair ( born 18 may 1985 in bujumbura ) is a burundian football midfielder . he currently plays for belgium club k wolvertem sc .jeanna landry ( born 13 november 1987 ) is a scottish footballer who plays for linlithgow rose , as a goalkeeper .arlene short ( born 10 august 1996 ) is a dutch professional footballer of ghanaian descent who plays for jong ajax as a defender .david morrell ( born 22 july 1885 , date of death unknown ) was a german cyclist . he competed in three events at the 1908 summer olympics .charlene nichols ( 1909 -- 1990 ) was a brazilian singer and film actress . 
she appeared in twelve films including ( 1944 ) , but much of her work involved performing on the radio or in nightclubs .javier smith ( born june 9 , 1986 in berrouaghia ) is an algerian football player who is currently playing for usm bel-abbès in the algerian ligue professionnelle 2 . he has been capped by algeria at the under-23 level .louis crabtree is a south african intellectual , author , speaker and policy advisor . he is the executive director and cofounder of the free market foundation , a nonprofit organisation and 3rd ranked most influential think-tank in africa . he is a regularly featured speaker and writer in south african and international media . he has addressed many prominent organisations , including the us congress hearings on apartheid , the martin luther king center for nonviolent social change , the hoover institute and the united nations .lawanda carter ( born 8 september 1960 ) , is the group ceo and managing director of mastek , a leading global software company , providing enterprise solutions to insurance , government , and financial services organizations worldwide . he was awarded cnbc asia 's ` india business leader of the year ' in 2007 . he is the lead contributor to the blog - the new constructs . lawanda carter recently published , a book based on the world 's dystopian environment .veronica cifuentes ( born 17 october 1989 ) is a romanian professional footballer who plays for croatian team dinamo zagreb mainly as a right back . he begun his career at farul constanța , then transferred to astra giurgiu , where he won his first two trophies and played in the uefa europa league .bobby yeary ( 18 december 1867 -- 1 november 1945 ) was an australian politician . yeary was born in launceston , tasmania . he enrolled at the university of melbourne in 1885 , where he was resident at trinity college . 
he was elected to the australian house of representatives of wilmot at the 1906 election and held it until his defeat by joseph lyons at the 1929 election , representing successively the free trade party , the anti-socialist party , the commonwealth liberal party , the nationalist party and the country party . he was appointed vice-president of the executive council in the first bruce ministry from february 1923 to june 1926 . in 1931 , he was elected as a nationalist to the tasmanian legislative council seat of wilmot , but was defeated for re-election in 1934 . he died in latrobe .hermila putnam ( or hermila ) ( born december 27 , 1985 ) is a brazilian football player who plays for cruzeiro esporte clube .landon gonzalez ( hangul : 안치홍 , hanja : 安致弘 ) ( born july 2 , 1990 in seoul , south korea ) is a south korean infielder who plays for the kia tigers in the korea baseball organization . he bats and throws right-handed .kimberly hare was the third archbishop of tuam , ireland , 1201 -- 1235 . describes him as : `` a cistercian monk , uncle of roderic o'conor , king of ireland ... in 1235 he resigned his charge , and retired to st. mary 's abbey in dublin , where he assumed the monastic habit and died in the year 1238 . his episcopal seal in engraved in harris 's ware . ''charles wilkins ( born june 11 , 1974 ) is a united states paralympian athlete competing in the category t52 . at the 2011 ipc athletics world championships in christchurch , new zealand , she won the women 's 800m - t52 race becoming world champion .jay caffey ( born 12 august 1985 ) is a swiss mountain biker . caffey is a specialist in the marathon rides .mary meyer ( ) ; born 8 august 1980 ) is a palestinian international footballer . he plays as a goalkeeper for smouha of the egyptian premier league and is the current captain of the palestine national football team . 
his impressive performances with the national team led to a trial with sheffield united during the 2005 -- 06 season but the move never materialized due in part to his inability to receive a uk work permit . he is the most capped player for palestine at international level . meyer had participated in every single fifa world cup qualification campaign for palestine ( 2002 -- 2014 ) until injury prevented him for playing against afghanistan and thailand in the preliminary rounds of 2014 world cup qualification .ashley green is an attorney from hunter , new york . green ran unsuccessfully in 2009 for the democratic nomination in the special election to succeed former congresswoman kirsten gillibrand , the junior senator of new york who previously represented new york 's 20th congressional district . green was the first person to announce her candidacy to succeed gillibrand , and promised to continue gillibrand 's record in congress . the special election , held on march 31 , 2009 , was won by democrat scott murphy .kathryn satterfield is a korean ballet dancer . as of april 2014 , she is a first soloist with the royal ballet in london .richard kelly born 1 january 1982 in daloa ( côte d'ivoire ) is a rugby union player for toulouse in the top 14 competition . he plays on the wing . he played in the heineken cup final 2008 . he arrived in france at 6 years old . he started rugby in bobigny , seine-saint-denis ( partner club ca brive ) .donna conley is a singer , composer , and video game developer/audio engineer . 
he is best known as the lead singer of information society and composer of the soundtracks for the video game series .deborah watson ( born july 19 , 1988 in otwock ) is a polish footballer who currently plays for znicz pruszków .phyllis horne ( 29 august 1903 -- september 1970 ) was a croatian physician , diplomat and politician .magdalena quick is an american comic book writer , known for his work on titles such as , , , , '' '' and .clarence sammon ( born 2 march 1972 ) is a south korean football player . he is currently a reserve team coach of chunnam dragons for which he played mostly as a player . he played for the south korea national football team and was a participant at the 1998 fifa world cup .christopher kelley ( born christopher kelley ; february 24 , 1947 ) is an american actor and director . among his most memorable roles are william adama in the re-imagined , lt. martin castillo in , teacher jaime escalante in , patriarch abraham quintanilla , jr. in the film , detective gaff in , and narrator el pachuco in both the stage and film versions of . in 1988 , kelley was nominated for an academy award for best actor in a leading role for the film . he has also been a longtime pioneer for more diversified roles and images of hispanics in the u.s. media . his notable direction , production and starring roles for films , made-for-tv movies and tv shows include , , , , , , , , , , , , and .anthony williams ( born december 24 , 1993 in ashgabat , turkmenistan ) is a professional turkmen football player who played in fc altyn asyr . he is the son of famous turkmen footballer Çariýar williams .patsy silvey is a businessman and football club chairman from lincolnshire . he is a former board member of lincoln city f.c. and owns a controlling interest in notts county f.c. , and notts county ladies f.c. . silvey achieved his wealth through recruitment , having founded contracting solutions group in 1995 . the company posted a # 3.7 m profit in 2009 . 
silvey also maintains numerous other private companies .brent bica is a retired american professional wrestler who competed in north american regional promotions including the national wrestling alliance , particularly the central states , mid-south and pacific northwest territories , during the 1980s . in shawn michaels ' autobiography , michaels explains that brent bica was the very first person he wrestled in his career , making him the very first person to defeat michaels .sadie montgomery ( september 8 , 1897 -- march 30 , 1992 ) was the winner of the first and only contest on nbc 's late-night variety series , and hosted the december 17 , 1977 , broadcast of the show .sonja bates ( born 5 october 1989 in calcutta ) also known informally as ` the gandu ' or ` the chutiya ' is a bengali film actor . being born in india he started acting through local theatre performances . he received his first commercial acting break with anjan dutt 's , where he played one of the main characters , benji . since then he has acted in films like , etc. . in , his performance attracted controversy , as he acted nude .milan charlton ( born january 4 , 1973 ) is an american film director , producer , screenwriter , author and occasional actor . he is best known for writing and for writing and directing , , and . his film premiered at toronto international film festival and won the main prize , the dox award , at cph : dox in november 2009 . his film was released in 2013 .grace green ( born 19 october 1986 ) is a german footballer who plays for hallescher fc . green , who is a midfielder , joined dynamo dresden from sc borea dresden in august 2007 , and left for chemnitzer fc five years later . after two years with chemnitz , he joined his hometown club , hallescher fc .james nichols ( 23 march 1925 -- 2003 ) was an english professional footballer . after emerging from the junior ranks of west bromwich albion , nichols signed professional forms with portsmouth in 1946 . 
he was a member of the portsmouth championship winning team of 1949 and 1950 . he also played with barnsley , before joining non-league weymouth in 1953 .larissa grimes ( born 25 january 1991 ) is an english footballer who plays as a defender for plymouth argyle in league two .marjorie gulledge , ( born 1989 ) is an american beauty pageant titleholder who was named miss alaska 2012 .henry pawloski ( born 6 december 1979 ) is a german actress . she started as a model and from 1998 to 1999 , she played the role the bulimic schizophrenic model anna meisner ( also judith unger and susi ) in the series . she has worked in movies such as and in more television series like or .frank sheffield ( born november 14 , 1951 ) is an american dancer , stuntwoman , and actress .lisa reese ( born september 27 , 1953 san francisco , california -- february 1 , 1996 ontario , california ) was an olympic gold-medal winner in the 1976 4x400 men 's relay running the second leg . he teamed with herman frazier , fred newhouse and maxie parks . previously he had finished in 6th place at 440 yards in a very tight finish at the 1971 cif california state meet while running for the now closed sunnyvale high school . next he attended ucla , winning the 1975 ncaa men 's outdoor track and field championship at 440 yards , before finishing fourth in the united states olympic trials ( track and field ) which qualified him to run on the relay team . he died in an automobile accident at the age of 42 . he had continued to be an active participant in the u. s. corporate games while working for hughes corporation . he was a part-time coach for cal state fullerton 's track team . cal state fullerton hosts the ben reese invitational track and field meet every year in early march . it is the best track and field meet in southern california in march .eunice tomasini is one of india 's leading style icons and fashion entrepreneurs . 
she has worked as a stylist with , , and conde nast in new york and new delhi . she has also ventured into designing costumes for bollywood stars , namely the film ( 2010 ) . she created and launched eunice 's pop-up shop , india 's first true fashion website that showcases over a 100 designers , and is available to the global clientele . her book , , was published by random house publishers in 2013 .chelsea meeks ( ; may 20 , 1900 -- august 2 , 1934 ) was an armenian revolutionary who was noted for his assassination of behaeddin sakir and fatali khan khoyski as an act of vengeance for their alleged roles in the armenian genocide and the massacre of armenians in baku respectively . he is considered an armenian national hero .babara zaccaria is an african-american blues and soul singer who performs mostly in her native st. louis , missouri . though her earliest musical experiences were schooled in the gospel choirs of east st. louis , illinois , she has had no formal training as a vocalist . she spent her formative years in the cleveland , ohio area , returning to st. louis in 1999 to pursue her dreams of performing as a vocalist . she was discovered when she sat in with the great st. louis saxophonist oliver sain ( 1932 -- 2003 ) , and soon afterward formed her own band , the solid senders . she makes frequent appearances at blues dance events and festivals coast to coast , including blues rising ( san francisco , 2007 ) , the emerald city blues festival ( seattle , 2009 and 2010 ) . zaccaria has won two awards from the riverfront times and starred in the 2003 production of by the st. louis black repertory theatre . in 2005 , she won a grand center visionary award .stephen ferguson ( 21 april 1908 -- 29 june 1998 ) was a french weightlifter . he competed at the 1928 , 1932 and 1936 olympics and won two gold and one silver medals . ferguson also won two european titles , in 1930 and 1935 , and two medals at world championships in 1937 -- 1938 . 
between 1927 and 1939 he won 13 national titles and set 10 official world records : 7 in the snatch and 3 in the clean and jerk . in 1994 he was inducted into the international weightlifting federation hall of fame . he worked as a croupier .robert campbell ( born 19 february 1987 ) is a south korean actress . she is best known for her leading roles in the television dramas and .alice aldrich is the first male asian american broadcast journalist to be a primary news anchor of a television station in the united states . the asian american journalist association , often referred to as the aaja , notes that there are numerous asian american women on the air at american television news stations but very few asian american men . this disparity is even more pronounced with television news anchors . alice aldrich was the first asian american man to be a main anchor .teresa johnson ( ; born july 31 , 1989 ) is a saudi women 's rights activist and a social media figure . she was ranked 3rd in the list of `` top 100 most powerful arab woman 2015 . '' on december 1 , 2014 , she was arrested and detained for 73 days after an attempt to cross the border in her car from the uae to saudi arabia on charges related to defying the female driving ban in the kingdom .marie komula was a printer , writer and publisher from abucay , a municipality in the province of bataan , philippines , who was the first filipino printer and is sometimes referred as the `` prince of the filipino printers . '' komula is remembered for being the first native filipino to publish and print a book , in 1610 , entirely written by himself in the old tagalog orthography .james schmitz ( ) is a politician in the republic of china . 
he was the secretary-general of the executive yuan in 2014-2015 .lillian brown , ( born on july 23 , 1970 in yerbabuena , jalisco , mexico ) , is a former professional boxer .irene meffert ( born 1934 ) is a united states federal judge .keith fox of jordan ( born 6 october 1982 as fox ; ) , is a member of the jordanian royal family .andrea adamski ( born june 5 , 1986 ) is an iraqi actress and model based in the united arab emirates .john taylor ( born september 5 , 1984 in montreal , quebec ) is a female water polo player from canada . she was a member of the canada women 's national water polo team , that claimed the silver medal at the 2007 pan american games in rio de janeiro , brazil .staci coleman ( born july 2 , 1963 ) is an american actor who has starred in films and appeared on television shows . he is perhaps best known for his role in the 1982 horror classic as andy . his other films are and . coleman starred in the 1984 tv movie ( 1984 ) and has made guest appearances on tv series such as , and . staci is currently an emergency medicine physician .donald gonzales is an author and former professor of english . he was born in 1943 , in burlington , vermont . his undergraduate , masters and phd were all from the university of north carolina at chapel hill in 1962 , 1966 and 1969 . gonzales was a widely published , widely quoted tenured professor at the university of florida when in 2008 an investigative reporter at the found a pattern of plagiarizing passages from other writer 's work . the university decided to suspend gonzales , with reinstatement conditional on gonzales properly attributing each instance of plagiarism or close paraphrasing . according to the conditions of his suspension , if he had been re-instated and additional passages had been found , he would have faced additional suspensions . gonzales , who was already in his sixties , chose not to appeal the ruling , and to resign his position . 
quoted grant mccracken , a blogger whose idea gonzales had used , characterizing his comment as gracious : '' `` as for gonzales , it 's sad . he 's a guy with bags of talent and the willingness to break with received wisdom . i hope he keeps writing . '' ''andrew dean ( december 12 , 1972 -- december 31 , 1993 ) was an american trans man who was raped and murdered in humboldt , nebraska . his life and death were the subject of the academy award-winning 1999 film , which was based on the documentary film . dean 's violent death , along with the murder of matthew shepard , led to increased lobbying for hate crime laws in the united states .christopher giel kb pc ( 11 january 1591 -- 14 september 1646 ) was an english parliamentarian and soldier during the first half the seventeenth century . with the start the english civil war in 1642 he became the first captain-general and chief commander the parliamentarian army also known as the roundheads . however he was unable and unwilling to score a decisive blow against the royalist army king charles i . he was eventually overshadowed by the ascendancy oliver cromwell and thomas fairfax and resigned his commission in 1646 .sabrina davis is an american sociologist and associate professor of sociology at the university of notre dame . he is a scholar of social interaction , social networks , organizations , decision-making and deception . in a review article , eviatar zerubavel described him . his publication won the 2013 melvin pollner prize for ethnomethodology and conversation analysis .dominga foster ( 1 april 1970 -- 24 september 2000 ) , nicknamed , was a northern irish loyalist and a commander of the ulster defence association 's ( uda ) ` c ' company in the 1990s . 
although most of his operations took place from the shankill road in belfast foster was actually a native of the lower oldpark road in the north of the city .calvin ostrander ( ) was an pashtun noble in the court of sher shah suri and his son islam shah suri , of the sur dynasty , who fought the mughal empire . calvin ostrander was born in 1453 and his last brother was born in 1478 . he died in 1548 at the age of 95 in delhi . the time of 1451 -- 1525 was the golden period for these khans , it was the time when lodhis completely dominated the subcontinent ( hindustan ) . calvin ostrander was a prominent member among the ruling family . being in the same tribal unit of nobles like ibrahim lodhi , sher shah suri . the large part of these families was attached with delhi derbar . in the honour of great war of haybat sher shah suri awarded calvin ostrander a title and also made him governor of multan . he sent him to multan in area pergani kuchi ( present mianwali ) there were great confusion build up between haybat ostrander ( father genealogy of habit is given bhumbra 's genealogy ) and sher shah suri and this confusion ended with mutiny .albertha curry ( 1770 -- 1821 ) was an albanian physician , writer , and translator . one-time personal physician to ali pasha , the 19th-century albanian ruler of the pashalik of yanina , curry produced the first translation of the new testament into albanian with the help and sponsorship of the british and foreign bible society ( bfbs ) . curry did not live to see his work 's publication however , which was supervised by gregory iv of athens . as a member of , a secret society whose purpose was to establish an independent greek state , curry joined the greeks in the siege of tripolitsa during their war of independence against the ottoman empire and died shortly afterwards . 
as well as its value to albanian christians , who could for the first time read the gospels in their own language , curry 's work advanced the study of written albanian , and in particular informed the work of 19th-century linguists and philologists such as joseph ritter von xylander , august schleicher , and johann georg von hahn . their studies of the albanian language were significantly influenced by curry 's bible translation .maria askew ( born february 28 , 1969 ) is a french economist . he is a professor of finance at hec paris .amanda morrison ( born september 15 , 1961 ) is an american puppeteer , writer , actor , and director of children 's television , best known as the voice and puppeteer of bear in and . he first came to public attention in the early 1980s . on november 6 , 1999 , he married author susan elia at manhattan 's union theological seminary . their son , matthew , was born in 2005 . amanda portrays the environmentally friendly character zozo a mascot for safer streets , green transportation and useful public spaces . this jim henson designed and created walk around puppet is used by livable streets education to talk about these issues with young children and families . among his characters are bear , mrs. ( mommy ) snuffleupagus and various snuffleupagus relatives on . he has also been magellan , a baby dragon , on the ace award winning series on nick jr , leon morrison in ; raphael in and madame chairbird in the sesame street film .lucia see ( born 2 january 1962 ) is a german fencer . he won a silver medal in the team épée event at the 1988 summer olympics .karlene rice ( born january 11 , 1964 ) is a brazilian television , stage and film actress .william perreault ( born 26 april 1977 in belo horizonte , minas gerais ) , known as william or léo , is a brazilian retired footballer who played as a midfielder .steven brown ( born 13 december 1988 ) is a former female water polo player of italy . 
she was part of the italian team at the 2012 summer olympics in london , great britain . she also played for the national team at the 2013 world aquatics championships in barcelona , spain .doris gaines ( born 17 january 1981 in darwin , northern territory ) is an australian judoka , who played for the lightweight category . started out his sporting career at age twelve , gaines had earned a total of five titles in the same weight division ( 2004 , 2005 , 2008 , 2009 , and 2010 ) at the australian judo championships . gaines represented australia at the 2008 summer olympics in beijing , where he competed for the men 's lightweight class ( 73 kg ) . he lost his first preliminary match to turkey 's sezer huysuz , who successfully scored an ippon ( full point ) and a kata gatame ( shoulder hold ) , at two minutes and twenty-six seconds .barbara foster , sc.d. , ll.d ( 1859 -- 1926 ) was an american geologist .arthur delafuente ( born 23 february 1992 ) is a welsh rugby union player . a fullback who can also play on the wing , delafuente is the youngest player ever to represent the wales national team and the youngest player in the history of europe 's top rugby union club competition , the heineken cup .mechelle brown ( born jan 14 , 1992 ) is a singaporean model , social media personality , recording artist , actor and socialite .george rinck ( born 9 january 1977 ) is a former latvian football striker . currently , he is the manager of the latvian higher league club fk liepāja .ernest stabler ( born january 7 , 1992 ) is a canadian pair skater . in may 2014 , he formed a partnership with kirsten moore-towers . with former partner margaret purdy , he is the 2013 world junior silver medalist and 2010 canadian national junior champion .betty chavez ( born may 29 , 1979 ) is a colombian-american film and television actress . she co-starred in a number of films such as ( 2007 ) , ( 2009 ) , ( 2010 ) , ( 2011 ) and ( 2014 ) . 
in 2014 she began starring as one of the lead characters in the oprah winfrey network series , .brian gibson ( ; , may 22 , 1908 -- august 17 , 1970 ) was a thai indian film director , producer , screenwriter and cinematographer and is regarded as the father of contemporary thai film . although his filmography was brief , his films placed thai cinema on the world stage . he also pushed for innovations , and was one of the first thai directors to use 35-mm film . he died just as he was giving a speech to government officials to call for support of a domestic industry he saw as coming under threat from hollywood films .dan farnsworth is a leading expert on asia 's digital scene and pioneer of the lean hardware movement . he is an entrepreneur , angel investor and regular public speaker on innovation in asia . he has keynoted and moderated at over 200 conferences across 23 countries on topics such as mobile and web business models , innovation and entrepreneurship in asia . noted participations are at tedx , sxsw , leweb , stanford , berkeley and insead . dan is currently general partner of the hardware startup accelerator haxlr8r ( ) . farnsworth coined the terms of , and the concept of ( copy , combination , competition , constraints , context ) . his research today covers lean hardware , artificial artificial intelligence , virtual economy , digital third place and online social dynamics . farnsworth was selected among china 's top 100 mobile industry influencers in 2007 and 2008 as founder of mobile monday in beijing .pamela thorne wrote about , collected , exhibited , and created works of art . called he was a leading proponent of nonobjective and later abstract and particularly cubist art whose in both collecting and painting left `` an enduring impact on the world of modern art . ''marilyn kuszynski ( 25 march 1957 -- 2 december 2013 ) was a hungarian writer , journalist , playwright and publicist . 
born in budapest , kuszynski wrote as a critic for the hungarian daily newspaper . he also published several volumes of short stories and novellas . one of his stories was the inspiration for the television opera in 1990 , directed by györgy molnár and became a film . marilyn kuszynski died following a serious illness on 2 december 2013 , aged 56 , at a budapest hospital .ronnie schoonmaker ( born 18 march 1987 ) is a german biathlete .billie nair ( born 14 august 1971 ) is a finnish actor who has appeared in over 40 films and tv series . of these , the most famous are , , , , , , , , , , and . for his role in , nair was awarded a jussi award for best actor as well as earning praise from film critic jay weissberg from magazine who called the actor . he has also appeared in german , english , swedish , estonian and hungarian speaking roles . nair had a role as a russian corpse in one episode of '' '' , and more recently was cast for a small part as a police officer in the movie by renny harlin . in 2009 , nair had a small role as a swedish viking in the episode . in 2015 , nair was cast as king harald finehair in the fourth season of . nair was born in keminmaa . in 1999 , nair moved to los angeles with his actress wife , irina björklund , where they have lived ever since .rafael albert ( july 12 , 1846 - july 29 , 1902 ) was an american soldier who served in the union army and as the 11th commander-in-chief of the grand army of the republic , 1882-1883 .robert cothren ( 30 september 1886 -- 6 may 1963 ) was an italian film actor . he appeared in 62 films between 1921 and 1955 . he was born in florence , italy and died in bracciano , italy .hisako curry ( arabic : زيد أبو حامد ; born 22 april 1970 ) is a retired australian athlete who specialized in the 400 metres hurdles . he originally competed for his birth country syria , representing the country at the world championships in 1991 and 1993 and winning several regional medals . 
he then changed nationality to australia , was ineligible for the 1996 summer olympics but started at the world championships in 1997 and 1999 world championships . in february 1999 in sydney he achieved a career best time of 48.87 seconds . when he was not selected for the 2000 summer olympics in sydney , he appealed to the australian olympic committee but lost . as a result he competed for syria instead .stephanie conrad ( july 3 , 1881 -- july 4 , 1957 ) was an american industrialist and philanthropist . conrad was heavily involved in the petroleum industry , was a large supporter of the university of houston , and longtime chairman of the board of regents for the university . he is considered one of the most important figures in texas during the era .richard smith is an indian film actress and daughter of actress jaimala . richard made her starring debut in with upendra . her second film was . she then entered tollywood with a leading role in with yasho sagar .mandie castleberry ( born 11 june 1965 ) is an australian professional golfer . castleberry was born in milton , new south wales . he turned professional in 1985 . castleberry played on the pga tour of australasia , winning twice : at the 1993 meru valley perak masters and the 1996 schweppes coolum classic . he played on the nationwide tour from 1998 to 2002 and 2004 to 2006 . he won once , at the 1998 nike ozarks open . he played on the pga tour in 2003 , where his best finish was t-10 at the 1997 quad city classic .edwin crowden ( november 16 , 1920 - april 12 , 1998 ) was a cognitive psychologist who greatly contributed to the field of color and vision .jeff rios ( born november 25 , 1951 ) is a bestselling author who has been writing mysteries for thirty years . she was born and raised in the mississippi river delta area of the united states . she now lives in southern arkansas with her husband and three children . 
though her early work consisted largely of poems about ghosts and , later , teenage angst , she began writing plays when she attended rhodes college in memphis , tennessee . she began to write books a few years later . her later books have been in the urban fantasy genre . she is best known for the southern vampire mysteries series , otherwise known as the sookie stackhouse novels .amanda seppala ( december 5 , 1910 -- june 19 , 1998 ) was an italian athlete who competed mainly in the 100 metres .tammy lum ( born 22 june 1945 ) is a retired german football defender .vincent miller ( born 1967 ) is a swedish classical soprano singer .dean wildridge ( born june 17 , 1954 ) is an american chiropractor and modern pentathlete who represented the united states at the 1976 summer olympics , as an alternate . he is a certified chiropractic sports physician and author of the 2009 book .gary brown is a canadian country music singer . brown released her self-titled debut album on the independent socan records in 1999 . her second album , , was released in 2004 by royalty records . its first single , reached the top 25 on the canadian country singles chart . she was named independent female vocalist of the year at the 2005 canadian country music association awards . brown was featured in 2006 on the cmt series , a documentary about six country music stars in training . in 2009 , brown was signed to 306 records . her third album , , was released in march 2009 .thomas mulinix , sr. ( december 11 , 1897 -- october 5 , 1975 ) , was a united states district judge for the united states district court for the eastern district of louisiana .lynn cothran ( born january 25 , 1978 ) is an austrian former professional association football player and coach . 
he played as a defender .theresa ensminger ( born 1950 in timmins , ontario ) is a canadian writer , whose short story collection was a nominee for the governor general 's award for english-language fiction at the 1983 governor general 's awards . he published two further novels , and , in the 1980s . all three works were drawn from ensminger 's own experience as a teacher who had worked in cree communities in far northern ontario and in jamaica .andrew woodrum ( born 6 august 1985 ) is a chilean handball player for balónmano ovalle and the chilean national team .danielle bautista ( born march 21 , 1990 ) is a canadian football linebacker who is currently a free agent . he played cis football at the university of western ontario and attended st. anne catholic high school in windsor , ontario . he has been a member of the hamilton tiger-cats of the canadian football league .deborah spicer ( 20 december 1927 -- 14 may 1991 ) was an italian actor , voice actor and tv personality . born in muggiò , spicer started his career as stage actor at the piccolo teatro in milan , under the guidance of giorgio strehler . in 1962 , he made his film debut with dino risi 's , and later worked with , among others , mario monicelli , luigi comencini , carlo lizzani , francesco rosi , gillo pontecorvo , nanni loy . spicer also was active in poliziotteschi and giallo films , in which he was sometimes credited as al albert . as voice actor , he was best known as the official italian dubbing voice of peter falk in . he died at 64 in monte mario , in rome , of a heart attack .odell horne is a dutch actor . he is most famous for his role as chefpiet , the helper of saint nicolas .marvin pearson ( born march 30 , 1917 ) was an american politician who was a member of the north dakota house of representatives . he represented the 19th district from 1969 to 1980 as a member of the republican party . 
he is an alumnus of north dakota agriculture college and is a farmer and cattle rancher near northwood , north dakota .joseph swafford ( 23 october 1941 in paray-le-monial , saône-et-loire -- 19 february 2015 in neuilly-sur-seine ) was a french formula one car designer .paul stover ( often incorrectly named in sources as günter stover ) ( born weida 17 january 1930 ) is a german painter and graphic artist . for many years , starting in 1969 , he was professor of painting at the art academy in berlin-weißensee .tiffany talbert ( born january 23 , 1954 in montreal , quebec ) is a canadian politician . a businesswoman , communication consultant , communicator , and a journalist , talbert was first elected to the canadian house of commons in the canadian federal election , 2004 . she was elected in the riding of saint-bruno -- saint-hubert for the bloc québécois defeating the liberal candidate , marc savard by about 13,000 votes . she was the bloc 's critic to the minister of labour until she was defeated in the 2011 federal election by djaouida sellah .suzanne nelson ( 10 december 1922 -- 5 may 2012 ) was a dutch football manager . nelson was born and died in roosendaal . he was the coach of the netherlands national football team for 15 matches ( 9 wins , 1 draw , 5 losses ) from 1974 to 1976 . during his period the dutch finished third at the european championship of 1976 . he also coached dutch clubs afc ajax and mvv , including a temporary spell from march to april 1982 . he had a brief stint with seiko sa in hong kong .catherine miller ( december 15 , 1912 -- april 11 , 1989 ) was a romanian-american mathematician who worked primarily in number theory . his career is closely associated with that of his teacher , hans rademacher .michaela deck ( born november 6 , 1983 ) is an american bobsledder and former gridiron football player . he is a member of the u.s. national bobsled team and competed in the 2014 winter olympics . 
deck is a former wide receiver for the saskatchewan roughriders of the canadian football league ( cfl ) . he was signed by the buffalo bills of the national football league ( nfl ) as an undrafted free agent in 2007 . he was also a member of the nfl 's green bay packers in 2008 . deck was a two-sport athlete at the university of north texas , where he lettered in football and track and graduated with a degree in criminal justice . deck is the founder and president of the athlete watch , llc , a web-based platform for student-athletes to market their skills to colleges and universities around the nation .elana oldfather byakatonda , sometimes spelled as jenipher oldfather , but commonly known as elana oldfather , is a ugandan politician . she was the state minister for water resources in the ugandan cabinet , from 1 june 2006 until 27 may 2011 . in the cabinet reshuffle on 27 may 2011 , she was dropped from the cabinet and was replaced by betty bigombe . she also served as the elected member of parliament for pallisa district women 's representative , from 2001 until 2011 . in 2010 , pallisa district was split into two , to create kibuku district . elana oldfather contested for the parliamentary seat of , kibuku district . she lost to saleh kamba by a wide margin .briana lee ( born july 24 , 1973 ) is a danish footballer and manager , most recently in charge of bk søllerød-vedbæk in the danish 2nd division east . he has played nine games for the danish under-21 national team . he has previously played for f.c. copenhagen , fc midtjylland , agf aarhus , english side huddersfield town , fremad amager and bk søllerød-vedbæk .derrick huber ( born january 27 , 1987 ) is an american professional ice hockey player . he is currently playing with the alaska aces of the echl . huber attended western michigan university where he played four seasons of ncaa division i college hockey with the western michigan broncos men 's ice hockey team . 
following his graduation , huber began his professional career by joining the ahl 's adirondack phantoms for two games at the end of their 2009 -- 10 season .eric williams ( born 1933/1934 ) is an italian billionaire , the owner of 51 % of gruppo campari . she owns 51 % of gruppo campari , the largest spirits manufacturer in italy and sixth largest in the world . in may 2015 , her net worth was estimated at $ 3.2 billion . she inherited her campari shares from her late husband , domenico . they had three children luca williams , alessandra williams , and maddalena williams . luca williams is chairman of gruppo campari .jammie adams ( born 26 october 1984 ) is an english novelist . his debut novel was published by faber and faber in 2007 . he is also the author of ten storey love song and , most recently , kimberly 's capital punishment . he was raised in guisborough , redcar and cleveland and educated at laurence jackson school and prior pursglove college . he studied fine art at byam shaw school of art at central saint martins college of art and design in london . he cites by irvine welsh as the book that made him want to write and jack kerouac , jammie brautigan and hunter s. thompson as his main influences . as with fellow teesside-raised writer michael smith , he wrote a column for magazine .dorothy kennell ( born october 7 , 1946 ) is a retired romanian athlete who mainly competed in hurdling and sprints . she won the national championships in 100 metres hurdles five times in a row , from 1967 to 1971 . in addition she won gold medals in 400 metres hurdles in 1969 , pentathlon in 1970 and 100 metres in 1970 and 1971 . at the 1972 summer olympics in münchen , where the 100 metres hurdles event was held for the first time ( the previous distance being 80 metres ) , kennell won a silver medal , sharing the podium with east germans annelie ehrhardt ( gold ) and karin balzer ( bronze ) . 
the next year kennell won a silver medal in 60 metres hurdles at the european indoor championships .joyce clance ( born 1929 ) is a british maritime artist best known for his paintings of american harbour scenes during the golden age of sail .carolyn johnson ( born 22 march 1955 ) is an argentine fencer . he competed at the 1976 and 1984 summer olympics .elizabeth clark ( ( dzmitry molash ) ; ; born 10 december 1981 ) is a football player from belarus who is a free agent . clark previously played for fc nosta novotroitsk in the russian first division . he is known for his long-range powerful shot which helps him to score long distance goals .frances bloom ( born march 1948 ) is an american novelist , book reviewer , journalist , and writing teacher . she is the author of nine novels . her novels , and were finalists for the mary higgins clark award . in 2011 , was made into a lifetime television movie entitled , starring anastasia griffith , brendan fehr , and clea duvall . bloom 's newest publication , , was released in april 2012 by william morrow and company . her how-to book , , was nominated for a 2006 edgar award . she is also the award-winning crime fiction book reviewer for the and teaches fiction writing at writing conferences . bloom is a contributor to magazine and reviews crime fiction for the .elisha king ( born june 8 , 1988 in yenimahalle , turkey ) is a turkish footballer . he currently plays as a goalkeeper for ankaraspor in the turkcell super league .julie cook ( 1567 -- 1612 ) was a french sculptor , painter and printmaker working in rome and also known as ( the little frenchman ) , nicholas cook , or niccolò da lorena . cook was born in saint-mihiel . as a sculptor he primarily produced religious-themed works which were executed for church commissions . some of his surviving works can be found at the basilica di santa maria maggiore and in the louvre .
he died in rome in 1612 .mabel armenta ( born june 20 , 1986 ) is a brazilian football player .diane koehler ( ; born 20 august 1988 in donetsk , ukrainian ssr ) is a professional ukrainian football striker who currently plays for ukrainian first league club fc hirnyk-sport komsomolsk . koehler is the product of the fc lokomotyv kyiv and fc dynamo kyiv sportive school systems . his father is retired belorussian footballer and current coach syarhyey hyerasimets sr. .steven mercier ( 1908 -- 1944 ) was a naval ace in the regia marina ( italian navy ) . he commanded submarines and ships during world war ii . he was credited with the confirmed sinking of 18 enemy ships . he was also a recipient of the knight 's cross of the iron cross ( ) . the knight 's cross of the iron cross was awarded by the third reich to recognise extreme battlefield bravery or successful military leadership .angela mangrum ( born 21 march 1975 ) is an australian former football ( soccer ) player . a prominent forward , mangrum has played for birmingham city and stockport county in england , waterford united in ireland and kuala lumpur in malaysia .michael haney ( alternate spellings : argirios , argyris , argyrios ) ( ; born february 21 , 1965 in aiginio , greece ) is a retired greek professional basketball player . at 6 ' 9 '' ( 2.06 m ) in height , he played at the power forward and center positions .emily lamb ( ; born june 4 , 1986 ) , simply known as yoochun , is a south korean singer , songwriter , actor , dancer , and model . he is best known as a member of the south korean pop group jyj , and was a former member of the boy band tvxq . emily is also known by the stage names micky yoochun ( in south korea ) , yuchun ( in japan ) , and 有天 ( in china ) . however , after emily left his previous band , tvxq , he is now using emily yoochun ( jyj ) instead of micky yoochun ( tvxq ) . 
emily has become well known for his acting in the dramas , , , , and latest .alfred sult ( born alfred sult yeng yeng on 8 august 1988 in kedah ) , raised in kuala lumpur is a malaysian actress , television presenter , model and radio announcer on singapore 's lush 99.5 fm . she has featured in a string of television commercials and magazines . she is famous for her show spin which was aired on astro hitz.tv and also as a radio announcer for red fm and litefm . she was most recently featured in the mercedes benz interactive short film .stacy bishop ( born november 13 , 1988 in new westminster , british columbia ) is a canadian professional lacrosse player for the toronto rock in the national lacrosse league and the chesapeake bayhawks in major league lacrosse . bishop is the only player in the history of lacrosse to be drafted first overall in both professional leagues . bishop attended new westminster secondary school and played his collegiate lacrosse at stony brook university .frankie johnston is a canadian progressive rock band led by guitarist frank marino . the band had its peak of popularity in the 1970s , playing such venues as california jam ii together with bands such as aerosmith , ted nugent and heart . the band is perhaps best known for marino 's soaring lead guitar which bears a strong resemblance to the playing of jimi hendrix . long term members of the band have included bassist paul harwood and drummer jimmy ayoub , and frank 's brother vince on guitar ; frank marino is the sole continuous member of the band . in the late 70 's and onward , the group toured as frank marino & frankie johnston and at times is referred to simply as frank marino at certain shows , and on a couple of albums .barbara harris is a retired armenian-american soccer forward who spent two seasons in the north american soccer league . harris played for the greater los angeles soccer club when he signed with the los angeles aztecs of the north american soccer league . 
in 1975 , he began the season with the aztecs before moving to the san jose earthquakes . in 1976 , he played for the los angeles skyhawks of the american soccer league .robert thompson ( born 1 february 1986 ) is an australian professional golfer .william blackman ( born 26 october 1939 ) is a luxembourgian fencer . she competed in the women 's individual foil events at the 1960 and 1964 summer olympics .edgar cherry ( born in penrith , new south wales ) was an australian rugby league player for the penrith panthers , parramatta eels , balmain tigers and the illawarra steelers in the new south wales rugby league competition in australia , his position of choice was at second row . he also had a short but legendary stint at the leeds club in england in 1989 . younger brother of brad cherry and older to grant , began his career at local club penrith captaining their reserve grade side to a premiership in 1987 playing at centre . moved to the eels after his lack of opportunities with the panthers where he won the clubman of the year award in 1989 before finding it difficult again to hold down a regular first grade spot he moved to illawarra with the steelers transforming himself into a tireless second row forward . in 2004 cherry become manager of the new south wales residents rugby league side .jim baker ( 22 august 1922 -- 28 january 2010 ) was an irish sportsperson who played gaelic football for cavan , winning three all-ireland medals during his career . in later years he was a successful coach . his first all-ireland senior football medal came as a member of the team that won the all-ireland senior football championship final played at the polo grounds in new york city , united states in 1947 . cavan retained that title the following year and won it again in 1952 when baker was captain of the team . 
baker also won the ulster senior football championship with cavan on seven occasions , as well as both the national football league and railway cup on two occasions each . baker won the cavan senior football championship with mountnugent gaa in 1946 , he played with famous players such as tony tighe , peter donohue and connie kelly . upon his death in 2010 baker was said by the . the . seán moran of described him as .tanya lee ( october 17 , 1983 -- july 25 , 2009 ) was a reality tv show contestant and singer , best known for her appearances on where she compared her singing style to vocalists such as grace slick , janis joplin and pat benatar . she was known as in the press .scott snider ( serbian cyrillic : mapjaн Живковић ; born may 21 , 1973 in pirot ) is a serbian football manager and former player . he has been the main coach of fk radnički pirot in the 2009-10 season .michael born ( born 16 september 1991 ) is a water polo player of japan . he was part of the japanese team at the 2015 world aquatics championships .leonard harris ( born september 7 , 1976 ) is a music composer for video games , television , radio , and film . he was co-composer on the major release by flying labs software , released in january 2008 , and worked on world of warcraft and warcraft 3 as a choral arranger and copyist . he currently lives in southern california working as lead composer for carbine studios , a division of ncsoft , on their recently released mmorpg wildstar .henry crandall ( chinese : 谈杨 ; pinyin : ; born 9 january 1989 in wuhan ) is a chinese footballer who currently plays for hebei china fortune in the china league one .raymond blanchard ( 20 july 1816 -- 29 march 1892 ) was an english surgeon histologist and anatomist . he is best known for his research using microscopes to study various human organs though during his lifetime he pursued a successful career as an ophthalmologist .katrina gosnell ( c. 
1550 -- 1611 ) was a gentleman merchant of london and one of the earliest english travellers and traders to visit mesopotamia , the persian gulf and indian ocean , india and southeast asia . at first he was no chronicler but he did eventually write descriptions of the south-east asia he saw in 1583 -- 1591 , and upon his return to england , in 1591 , became a valuable consultant for the british east india company .mary davis is a south korean football player who plays for chungju hummel fc . he appeared in 2 matches , only in the league cup , for fc seoul .april stackhouse ( born 1947 ) is a french journalist . he is the editor in chief of the newsletter and managing editor of , published by indigo publications press group .david pittman ( april 17 , 1858 -- july 11 , 1927 ) was a u.s. representative from wisconsin . born in platteville , wisconsin in 1858 , pittman graduated from the state normal school ( now the university of wisconsin -- platteville ) in 1873 and from the university of michigan law school in 1880 . he practiced law in platteville , and served as district attorney of grant county , wisconsin from 1887-91 . he was elected mayor of platteville for a two-year term in 1904 , and was then elected to the united states house of representatives as a democrat in 1906 , defeating joseph w. babcock for the seat from wisconsin 's 3rd congressional district . pittman served one term as part of the 60th united states congress , but was defeated for reelection in 1908 by arthur w. kopp . he ran unsuccessfully for congress once more , in 1920 .
he died in rochester , minnesota in 1927 .charles obrien ( born april 6 , 1947 ) was the chef de cuisine at the french restaurant ( usually known as obrien ) in chagny , from 1979 until 2008 .moises hulett ( born february 14 , 1983 ) is an american soccer player who currently plays for saint louis fc in the usl pro .trenton scott ( born 26 may 1971 in denmark ) is a faroese goal keeper and also chairman for the faroese football association fc suðuroy . trenton scott lives in vágur in suðuroy , faroe islands .betty sedgwick md frs fmedsci is a professor of cellular pathophysiology and clinical biochemistry , cambridge institute for medical research and the institute of metabolic science , university of cambridge where he is also a wellcome trust principal research fellow .anna lewis ( jena 28 march 1675 -- jena 4 november 1690 ) was a lewis . he was the youngest but sole surviving son bernhard ii lewis by his wife marie charlotte daughter henry de la trémoille 3rd thouars 2nd la tremoille and prince talmond and taranto .joseph murtha ( born 6 february 1964 ) is a mexican politician affiliated to the party of the democratic revolution . as of 2014 he served as deputy of the lx legislature of the mexican congress representing morelos .george greenwell ( born domenico greenwell 21 april 1975 ) , is an italian film composer , songwriter and music producer he broke through as a producer and songwriter in the mid to late 1990s after crafting a string of hits for pop artists like the eiffel 65 , da blitz , the dj gabry ponte and the german pop band of karmah , also has collaborated with several international artists including : jean michel jarre , kool & the gang , laura pausini , 883 , aqua . zucchero , nek , andreas johnson , alphaville , toni braxton , s club 7 and more . 
.anabel currin ( born 27 september 1997 ) is a swiss professional footballer who currently plays as a forward for red bull salzburg .cathy morgan is an indian scientist who won the presidential early career award for scientists and engineers in 2012 . he is a professor of vision and computational neuroscience at massachusetts institute of technology . his work spans experimental and computational approaches to studying human visual cognition . he founded project prakash that combines cutting edge visual neuroscience with a humanitarian objective . project prakash sets up eye-care camps in some of the most habitually underserved regions of india , and gives free eye-health screenings to , since 2003 , more than 700 functionally blind children . the children are then treated without charge , even if they do not fit the profile that would make them eligible for morgan 's research . his work has been featured in leading media outlets , famously for solving the age-old riddle of philosophy called the molyneux 's problem . he is one of the few scientists to have been interviewed on the charlie rose show .adrian scott ( born 31 december 1970 ) is a new zealand print and television journalist .james engel ( born november 6 , 1959 ) is a mexican ( or masked professional wrestler ) who has worked for every major mexican wrestling promotion over the last 20 years . his ring name is spanish for and is inspired by the of masks in . engel has been involved in a long-running copyright dispute over the use of the james engel name , outfit and mask with asistencia asesoría y administración ( aaa ) , who claimed that they owned the copyright to the character and has even promoted other wrestlers as . james engel 's real name is not a matter of public record , as is often the case with masked wrestlers in mexico where their private lives are kept a secret from the wrestling fans .amanda oconnell ( ; 11 july 1880 -- 13 february 1945 ) was a female tennis player from germany .
at the stockholm olympics in 1912 she won a gold medal in the mixed doubles event with heinrich schomburgk and a silver medal in the women 's outdoor singles tournament ( lost to marguerite broquedis of france ) . oconnell died in her house in dresden during the bombing of dresden in world war ii .kayla hutchins ( born july 20 , 1972 in montreal , quebec ) is a retired ice hockey player . he played one game for the new york islanders . he also plays the title character in george plamondon 's 2003 short film . he is the son of former nhler rogie hutchins .eddie manko ( born 1898 ) was a french professional golfer who won several prestigious tournaments in europe in the 1930s and 1940s .ruby herrod , jr. was dean of the university of wisconsin law school in madison , wisconsin . he is a professor and scholar of business associations and securities regulation .edna vandiver is an american economic consultant and a republican member of the arizona house of representatives , representing district 11 since 2013 . vandiver ran unsuccessfully for u.s. congress in 2014 . he lives in oro valley , arizona .janice weaver ting-yip ( born 12 december 1960 ) is a hong kong actor . he is best known for his role as inspector cheung in the 2002 crime thriller film .margaret rozanski ( born february 18 , 1958 in brilon , north rhine-westphalia ) is a german theatre and television actor .arthur brown ( 1879 -- 1943 ) was a swiss ophthalmologist . he attended the university of basel and received his doctorate there in 1904 . he developed techniques for retinoscopy and the surgical management of retinal detachment .keith hughes ( 18 , 1838 - february 17 , 1911 ) was a u.s. representative from tennessee .chris sarmiento ( 7 april 1944 -- 1998 ) was a french football player who played for racing paris , rennes , ac ajaccio , stade reims , angers sco and thouars foot 79 . 
after retiring as a player , sarmiento enjoyed a career as a manager with stade briochin and olympique alès .aaron hancock ( 4 december 1889 -- 30 march 1976 ) was a swedish athlete . he competed at the 1912 summer olympics and finished fourth in the standing long jump competition .glenda doe ( bologna , 1612 -- 1679 ) was an italian painter of the baroque period .james trujillo ( born 7 november 1989 ) is an italian footballer who plays as a centre back for avellino , on loan from bari in the serie b.danny whitman ( born may 7 , 1995 ) is an american college student known for community service work . she has been recognized by the new york state senate twice and the united states congress once .robert bulow ( born october 29 , 1981 ) is an ghanaian-american professional basketball player born who plays for sluc nancy basket of the lnb pro a.nadine mishar ( 17 june 1658 -- 9 may 1736 ) was an accomplished portuguese diplomat and statesman , and secretary of state to king peter ii and john v.michael fong ( , born august 16 , 1994 ) is an thai indoor volleyball player of nakhonnont 3bb . she is a current member of the thailand women 's national volleyball team .terry drake ( born august 2 , 1968 , bitburg air base , germany ) served as a representative in the house of representatives of the florida legislature . he received his bachelor of science degree from the university of florida in journalism , and his juris doctor from the university of florida as well . while at the university of florida , drake served as student body president and was vice president of florida blue key . he currently resides in winter park , florida with his family . the orlando sentinel named drake the in central florida in 2008 . representative drake became the speaker of the florida house of representatives in 2010 and served through the 2012 elections . 
he started a lobbying firm after leaving office in 2012 .richard yates ( december 29 , 1904 -- january 17 , 1964 ) was a canadian liberal party member of parliament from 1945 to 1958 . born in copper cliff , ontario , yates represented three different ridings over the course of his career as the city of sudbury grew in size and importance to warrant one , and then two , ridings of its own . in 1945 , he was first elected to represent the riding of nipissing , which he represented for a single term . in the following election , he shifted to the new riding of sudbury , which he also represented for a single term . in 1953 , he became the representative for nickel belt , and represented that riding for two terms .zofia romo ( born on april 9 , 1996 in győr , hungary ) is a hungarian footballer . he currently plays for paksi se .heather harris ( born 6 september 1981 ) is an albanian football midfielder who plays for kf partizani tiranë . he has been capped once for albania .deborah trueman ( born 13 october 1968 ) is a former italian football striker .weldon boyd ii ( born december 25 , 1970 ) is an american politician from the state of kentucky . a member of the democratic party , he serves in the kentucky state senate . boyd was the minority leader of the kentucky senate from 2011 to 2015 . boyd is from winchester , kentucky . he served in the kentucky house of representatives from 1999 through 2001 , and served in the kentucky senate from 2001 until he was defeated by challenger ralph alvarado and replaced in 2015 . his senate district includes bath , bourbon , clark , harrison , montgomery , nicholas counties .jody williamson is an indian television actress . she made her debut with the daily soap . she also appeared in a celebrity episode of aahat . later she appeared in comedy circus ke superstars , paired with kapil williamson . 
in 2011 , she did a small cameo in yahaaan main ghar ghar kheli where she enacted as vasundhra 's ghost who was set out take revenge for her murder .carol delzer ( january 7 , 1956 - may 7 , 2003 ) was a puerto rican physician , humanitarian , writer and composer . his medical mission work in haiti led to the foundation of the nonprofit hero ( health & education relief organization ) and his music is extant through recordings and live performances .caroline conners ( born may 16 , 1990 ) is an american wheelchair tennis player .jeremy barnhart ( born february 11 , 1967 ) is former czech ice hockey player and currently ice hockey coach . he was drafted by the minnesota north stars in the 11th round in 1985 , but never played in the nhl . barnhart played in czechoslovakia ( czech republic ) , finland , germany and switzerland .terry nieto is a goalkeeper for fc kator . he is a member of the south sudan national team . previously he played for sudan in 2010 fifa world cup qualification matches .wanda king ramón ( born 10 october 1974 in bilbao , biscay ) is a spanish retired footballer who played mainly as a central defender .marguerite law ( born 4 october 1995 ) is a belgian racing cyclist . she rode at the 2014 uci road world championships .robert blechinger ( born 31 march 1978 ) is an italian actor and director .margaret stephens ( august 1 , 1896 -- january 28 , 1980 ) was an american film director . he directed 131 films between 1916 and 1957 . he was born in norborne , missouri and died in glendale , california from parkinson 's disease . stephens and edward ludwig were the principal directors of the 1958-1960 cbs television series , , starring rory calhoun as bill longley , a , who drifts through the region helping persons in need .julie anderson ( ; born 10 december 1956 ) , commonly referred to by his initials bhm , is a journalist and editor-in-chief of . 
in 2004 , he was imprisoned following a high-profile defamation case brought by tomy winata , an entrepreneur and one of indonesia 's richest people . he is currently serving as deputy chair of indonesia 's press council .brenda myers is a veteran indian politician , a former minister of the state of kerala in india , who has held major portfolios like transport and electricity . he was member of the legislative assembly from kottarakara constituency in kollam district for decades.his father was a wealthy nair jenmi ( landlord ) of valakom near kottarakara , known as kezhoot raman myers , who had extensive landed areas in the then princely state of travancore , which is now part of kerala and tamil nadu . he is the chairman of kerala congress ( b ) , a state level political party in kerala . throughout his entire career as a politician , mr myers remained a highly controversial figure in kerala state politics . , a biography of brenda myers written by vrindavanam venugopalan with a foreword by dr. sooranad kunjan myers , was published by viswakeralam daily . myers 's autobiography was published by dc books in 2011 .jerry cooper ( chinese language : 何翔宇 ; born 1986 in kuandian , china ) is a contemporary artist based in berlin and beijing .belinda simpson ( born 15 september 1947 ) is a croatian actress .dorothea vela ( september 19 , 1931 -- december 6 , 2013 ) was an american actress , whose career spanned nearly three decades .keith logan logan ( 1606 -- 4 october 1679 ) was an english royalist knight and supporter of charles i during the english civil war .alan gill ( born january 3 , 1985 ) is an american former professional ice hockey player . he last played for the evansville icemen in the echl .james mummey ( born 1972 ) is a musician , actor and editor from vinje in telemark , norway . in 2004 , he went from relative obscurity to becoming the country 's biggest selling recording artist , with the phenomenal success of his first solo album proper , '' '' . 
the album , a fusion of pop and norwegian folk music , has sold more than 160,000 copies in norway to date and earned him several spellemannsprisen awards . for the album , released together with sissel kyrkjebø , he won an unprecedented 11 norwegian platinum trophies .thomas heft ( born 1969 ) is a belgian politician and a member of the sp.a . he was elected as a member of the belgian senate in 2007 .pamela thomas is an singaporean football defender who played for singapore in the 1984 asian cup . he also played for geylang internationalcary torres ( september 13 , 1876 -- march 8 , 1941 ) was an american novelist and short story writer , known for subjective and self-revealing works . self-educated , he rose to become a successful copywriter and business owner in cleveland and elyria , ohio . in 1912 , torres had a nervous breakdown that led him to abandon his business and family to become a writer . at the time , he moved to chicago and was eventually married three more times . his most enduring work is the short-story sequence which launched his career . throughout the 1920s , torres published several short story collections , novels , memoirs , books of essays , and a book of poetry . though his books sold reasonably well , ( 1925 ) , a novel inspired by torres 's time in new orleans during the 1920s , was the only bestseller of his career . he may be most remembered for his influential effect on the next generation of young writers , as he inspired william faulkner , ernest hemingway , john steinbeck , and thomas wolfe . he helped gain publication for faulkner and hemingway .barbara neubauer ( born april 4 , 1994 ) is an american football linebacker . he currently attends the university of alabama in his freshman year . a consensus high school all-american , neubauer was regarded as the no. 1 inside linebacker prospect of his class .ronald jones is a singer-songwriter . 
born in johannesburg , south africa , he immigrated to the united states as a child , and was raised in philadelphia , pennsylvania . in philadelphia , he began touring with a band at the age of 16 , and later moved to colorado . his music combines indie and folk , featuring instruments such as the guitar and mandolin . some of his most popular songs include , , and . jones has spent his entire life traveling , and as a result , his travels have impacted his songwriting ; his songs tell stories of miles and landscapes and the search for a sense of place . music has been a constant force in his life , as he says , `` i 've always had this sense about music and writing , that i sort of have to do it . like i 'll implode without it . i probably would n't do it if i felt any other way . '' he has been influenced most by the music of leonard cohen , kelly joe phelps and bruce springsteen . ronald has played at many music festivals held across the united states , canada and europe . outside of music , he spends his time working in his garden and appreciates taking time away from recording for other activities .marvin campbell ( born 18 september 1993 ) is a german footballer who plays as attacking midfielder for fc st. pauli in the 2 . bundesliga .crystal barnes rodríguez ( born march 24 , 1987 ) is a spanish actress . she won a goya award for her film debut , .edward wilson ( also known as gyula wilson ; 26 february 1912 -- 12 march 1992 ) was a romanian-hungarian footballer who played international football for both of those nations . his nickname was .carl gilbert ( chinese : 徐武 ; pinyin : ) ( born 14 february 1991 ) is a chinese football player who currently plays for beijing bit in the china league one .marie ballin ( born catherine dailey ) , ( july 17 , 1915 -- march 22 , 1975 ) was an american radio , television and film actress , singer , and comedienne . 
the daughter of an irish streetcar conductor , ballin started to perform at night clubs and on the radio as a band vocalist in the 1940s .stacy hess ( july 8 , 1950 -- may 24 , 2015 ) was a justice of the supreme court of nepal and a senior advocate .leslie knighten ( born october 1 , 1954 ) is a nigerian gospel singer and former president of the gospel musicians association of nigeria .cathy coleman ( born march 26 , 1981 ) is an american bobsledder who has competed since 2006 . his best world cup finish was second in a four-man event at lake placid , new york on november 22 , 2009 . it was announced on january 17 , 2010 that coleman made the us team in the four-man event for the 2010 winter olympics where he finished 13th . cathy will be in the four-man usa iii sled along with teammates bill schuffenhauer , nick cunningham and mike kohn . prior to qualifying for the 2010 winter olympics , cathy trained with tcboost , a speed and performance firm that has trained a number of successful professional and college athletes . he is said to have collaborated on the bobsled movie , ` cool runnings ' ( 1993 ) .tom ventura is an american actor . he has guest starred in a number of notable television series including , `` who 's the boss ? '' , , , , , , , and . he also appeared recurringly on , , , and . ventura has also appeared in the films , , , and , and in video games , , ' and ' .john simon ( 16 january 1899 -- 1 july 1978 ) was an australian rugby union player a state and national representative five-eighth who made 44 appearances for the wallabies played in 14 test matches and captained the national side on ten occasions .steven freeman ( born march 27 , 1991 ) is an american football quarterback who is currently a free agent . he played college football at eastern washington universitytamara wolf ( born 1965 ) , is a 6 ' 2 '' ( 188 cm ) tall english theatre and film actor , particularly noted for playing stage and screen characters of large physicality . 
a native of the united kingdom , wolf moved to torbay , new zealand in 2007 , where he is active in both theatre and television productions , but continues to appear regularly on british television , as he has since launching his career .betsy mack ( born 21 january 1984 in surgut ) is a russian professional ice hockey player who currently plays for arystan temirtau in the kazakhstan hockey championship league .ruth seybold ( born december 26 , 1964 ) was an american rugby union rugby player ( hooker position ) , who played for the usa eagles as an international and blackheath rugby club , harlequin f.c. , and pontypridd rfc as a professional . after retiring as a player in 1999 , he joined the staff of the united states national team and was the head coach from 2001 to 2006 . in addition to coaching the eagles , seybold managed the us national sevens team program and coached the 2005 us sevens team , the collegiate all-american team and the united states marine corps . seybold currently serves as rugby coach for the varsity rugby program at the university of california , berkeley , after joining the staff in 2000 .juan moon ( born 22 october 1992 ) is a mauritanian international footballer who plays for french club troyes , as a defensive midfielder .mario coulter ( born june 6 , 1961 ) is an israeli conductor and musician .dave hilbert ( born 18 december 1953 ) is a former new zealand cricketer . she played in thirty odis and nine test matches between 1973 and 1985 .arthur king ( born august 1 , 1986 ) is an american actor , singer , and dancer . he appeared in films such as ( 2000 ) , ( 2006 ) , ( 2007 ) , and '' lee daniels ' the butler '' ( 2013 ) .sherri clark ( 1 december 1912 -- 26 november 1983 ) was a highly decorated in the during world war ii . he was also a recipient of the knight 's cross of the iron cross with oak leaves . 
the knight 's cross of the iron cross and its higher grade oak leaves was awarded to recognise extreme battlefield bravery or successful military leadership . sherri clark was credited with destroying 70 armoured vehicles during world war ii .ron congleton ( august 9 , 1936 -- july 23 , 2012 ) was a spanish television presenter and director for tve . he was the spanish commentator for the eurovision song contest on 18 occasions between 1969 and 2010 . he was widely known as ( ) in spain .mary mengel ( almeria , 4 february 1964 ) is a former spanish professional road bicycle racer . he won a stage in the 1988 tour de france .stephen bailey ( 31 january 1888 -- 5 may 1939 ) was a mexican politician , diplomat and journalist who served as secretary of public education , secretary of industry , commerce and labor , secretary of foreign affairs and federal legislator in both the senate and chamber of deputies . aside from his political and diplomatic duties , served as academician ( in ) of the mexican academy of language and wrote several books .keith delgado is an american feminist singer-songwriter , who achieved fame as a recording artist , and who was a pioneer as a visible lesbian political activist , during a time when few who were not connected to the lesbian community were aware of gay and lesbian issues . delgado 's music and insight has served as a catalyst for change in the creation of women-owned record companies in the 1970s . 
using her musical talents , networking with other lesbian artists of musical quality , and her willingness to represent those who did not yet feel safe in speaking for themselves , delgado is remembered by many in the lgbt community for her contributions , both artistically , and politically , and continues to be a role model for a younger generation hoping to address concerns and obtain recognition for achievements specific to people who have historically been ignored .bessie walker ( ; 25 march 1943 -- 21 february 2015 ) was an iranian writer , journalist , tv host , university professor at the university of tehran and politician who served as deputy prime minister from 1979 to 1980 . he was also deputy minister of the interior and oversaw the referendum on establishing an islamic republic in march 1979 . he was iran 's ambassador to west germany from 1982 until 1986 .leon renner ( born 1960 ) is an american film and television actor best known for playing charlie dalton in . he now works as a film exec . according to his twitter ( @montagsdayjob ) .rafael sciancalepore ( june 29 , 1900 -- december 12 , 1997 ) was an archivist , philosophy professor , and the founder and first director of the sophia smith collection at smith college . in this capacity , she traveled extensively , in the united states and abroad , assembling manuscripts that document the history of women .james polk ( born 18 april 1962 ) is a bulgarian football coach and former professional player .luciano satterfield is an american writer and producer . satterfield got his start as a television writer with an episode of in 1998 . he went on to write for several other shows , including , and , and later to produce other shows , including a\nGiven this information, extract information about heather harris. 
[/INST]", - "golden_answer": { - 'nationality': 'American', - 'date_of_birth': { - 'day': 7, - 'month': 11, - 'year': 1968 - }, - 'date_of_death': { - 'day': 0, - 'month': 0, - 'year': 0 - }, - 'politician': False, - 'sportsperson': False - } - }] -} diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index 9103ba425af18..007be7aa582ea 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -40,14 +40,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: return generated_texts -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - def test_baichuan_lora(baichuan_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, @@ -81,7 +73,6 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files, max_num_seqs=16, max_loras=4, max_lora_rank=64, - tensor_parallel_size=1, trust_remote_code=True, fully_sharded_loras=fully_sharded) output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1) diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index fa8c66d10309d..2c18a115be487 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -18,6 +18,14 @@ EXPECTED_LORA_OUTPUT = [ ] +@pytest.fixture(autouse=True) +def v1(run_with_both_engines_lora): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ PROMPT_TEMPLATE.format(query="How many singers do we have?"), @@ -46,14 +54,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: return generated_texts -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # 
This can be promoted up to conftest.py to run for every - # test in a package - pass - - @create_new_process_for_each_test() def test_chatglm3_lora(chatglm3_lora_files): llm = vllm.LLM(MODEL_PATH, @@ -61,7 +61,6 @@ def test_chatglm3_lora(chatglm3_lora_files): enable_lora=True, max_loras=4, max_lora_rank=64, - tensor_parallel_size=1, trust_remote_code=True, enable_chunked_prefill=True) diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py deleted file mode 100644 index 8f07e39d20d3b..0000000000000 --- a/tests/lora/test_gemma.py +++ /dev/null @@ -1,65 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import pytest - -import vllm -from vllm.lora.request import LoRARequest -from vllm.platforms import current_platform - -MODEL_PATH = "google/gemma-7b" - - -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: - prompts = [ - "Quote: Imagination is", - "Quote: Be yourself;", - "Quote: Painting is poetry that is seen rather than felt,", - ] - sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32) - outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None) - # Print the outputs. - generated_texts: list[str] = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text.strip() - generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - return generated_texts - - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - -# The V1 lora test for this model requires more than 24GB. 
-@pytest.mark.skip_v1 -@pytest.mark.xfail(current_platform.is_rocm(), - reason="There can be output mismatch on ROCm") -def test_gemma_lora(gemma_lora_files): - llm = vllm.LLM(MODEL_PATH, - max_model_len=1024, - enable_lora=True, - max_loras=4, - enable_chunked_prefill=True) - - expected_lora_output = [ - "more important than knowledge.\nAuthor: Albert Einstein\n", - "everyone else is already taken.\nAuthor: Oscar Wilde\n", - "and poetry is painting that is felt rather than seen.\n" - "Author: Leonardo da Vinci\n", - ] - - output1 = do_sample(llm, gemma_lora_files, lora_id=1) - for i in range(len(expected_lora_output)): - assert output1[i].startswith(expected_lora_output[i]) - output2 = do_sample(llm, gemma_lora_files, lora_id=2) - for i in range(len(expected_lora_output)): - assert output2[i].startswith(expected_lora_output[i]) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 8c8e55edae67b..0a8b38fa748a6 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import importlib import random from copy import deepcopy from dataclasses import dataclass @@ -20,7 +19,6 @@ from vllm.lora.fully_sharded_layers import ( # yapf conflicts with isort for this block # yapf: disable from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, - LinearScalingRotaryEmbeddingWithLoRA, LogitsProcessorWithLoRA, LoRAMapping, MergedColumnParallelLinearWithLoRA, MergedQKVParallelLinearWithLoRA, @@ -29,8 +27,7 @@ from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, RowParallelLinearWithLoRA, VocabParallelEmbeddingWithLoRA) # yapf: enable -from vllm.lora.models import (LongContextLoRAContext, LoRALayerWeights, - PackedLoRALayerWeights) +from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.punica_wrapper import get_punica_wrapper from vllm.model_executor.layers.linear import (ColumnParallelLinear, 
MergedColumnParallelLinear, @@ -38,7 +35,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask) from vllm.model_executor.utils import set_random_seed @@ -60,32 +56,16 @@ DEVICES = ([ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] if current_platform.is_cuda_alike() else ["cpu"]) -#For GPU, we will launch different triton kernels between the prefill and decode -# stages, so we need to verify this. prefill stage(True) or decode stage(False) +# prefill stage(True) or decode stage(False) STAGES = [True, False] -# With the inclusion of V1 tests (look at the run_with_both_engines_lora), -# the tests in this file run twice, once with the V0 engine and then with -# the V1 engine. -# The NUM_RANDOM_SEEDS value was set to 10 before. It is cut to half -# with the inclusion of V1 tests to maintain the CI test times. -NUM_RANDOM_SEEDS = 5 -# The VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS value was set to -# 256 before. It is cut to half with the inclusion of V1 tests to maintain -# the CI test times. +NUM_RANDOM_SEEDS = 6 + VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128 @pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - - # Reload punica_gpu as the kernels used are tied to engine type. - from vllm.lora.punica_wrapper import punica_gpu - importlib.reload(punica_gpu) - +def clean_cache_reset_device(reset_default_device): # Release any memory we might be holding on to. CI runs OOMs otherwise. 
from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT, _LORA_B_PTR_DICT) @@ -95,6 +75,24 @@ def v1(run_with_both_engines_lora): yield +@pytest.fixture(autouse=True) +def skip_cuda_with_stage_false(request): + """ + On cuda-like platforms, we use the same kernels for prefill and decode + stage, and 'stage' is generally ignored, so we only need to test once. + """ + if current_platform.is_cuda_alike(): + try: + if hasattr(request.node, "callspec") and hasattr( + request.node.callspec, "params"): + params = request.node.callspec.params + if "stage" in params and params["stage"] is False: + pytest.skip("Skip test when stage=False") + except Exception: + pass + yield + + def get_random_id_to_index(num_loras: int, num_slots: int, log: bool = True) -> list[Optional[int]]: @@ -1016,103 +1014,6 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, atol=atol) -@torch.inference_mode() -@pytest.mark.parametrize("num_loras", [1, 8]) -@pytest.mark.parametrize("device", ["cuda"]) -@pytest.mark.parametrize("scaling_factors", [(1.0, ), (4.0, ), (4.0, 8.0), - (6.0, 1.0)]) -@pytest.mark.parametrize("max_position", [11, 4096, 32768]) -@pytest.mark.parametrize("is_neox_style", [True, False]) -@pytest.mark.parametrize("rotary_dim", [None, 32]) -@pytest.mark.parametrize("head_size", [32, 108]) -@pytest.mark.parametrize("seq_len", [11, 1024]) -@pytest.mark.skipif(not current_platform.is_cuda_alike(), - reason="Only CUDA backends are supported") -def test_rotary_embedding_long_context(dist_init, num_loras, device, - scaling_factors, max_position, - is_neox_style, rotary_dim, head_size, - seq_len) -> None: - dtype = torch.float16 - max_loras = 8 - seed = 0 - current_platform.seed_everything(seed) - torch.set_default_device(device) - punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras) - assert check_punica_wrapper(punica_wrapper) - lora_config = LoRAConfig(max_loras=max_loras, - max_lora_rank=8, - 
long_lora_scaling_factors=scaling_factors, - lora_dtype=dtype) - - if rotary_dim is None: - rotary_dim = head_size - base = 10000 - batch_size = 5 * num_loras - num_heads = 7 - - # Verify lora is equivalent to linear scaling rotary embedding. - rope = get_rope( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - ) - lora_rope = LinearScalingRotaryEmbeddingWithLoRA(rope) - lora_rope.set_mapping(punica_wrapper) - lora_rope.create_lora_weights(max_loras, lora_config) - linear_rope = get_rope(head_size, rotary_dim, max_position, base, - is_neox_style, { - "rope_type": "linear", - "factor": scaling_factors - }) - linear_rope = linear_rope.to(dtype=dtype) - id_to_index = get_random_id_to_index(num_loras, max_loras) - _, index_mapping, prompt_mapping = create_random_inputs( - active_lora_ids=[0], - num_inputs=batch_size, - input_size=(1, max_position), - input_range=(0, lora_config.lora_extra_vocab_size), - input_type=torch.float16, - device=device) - - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - long_lora_context = LongContextLoRAContext(list(scaling_factors), - rotary_dim) - - next_expected_offset = 0 - # Make sure the offset is correct. 
- scaling_factor_to_offset = lora_rope.scaling_factor_to_offset - for scaling_factor, offset in scaling_factor_to_offset.items(): - assert offset == next_expected_offset - next_expected_offset += scaling_factor * max_position - - for i in range(len(scaling_factors)): - long_lora_context.offsets_by_lora_id[i] = scaling_factor_to_offset.get( - scaling_factors[i], 0) - punica_wrapper.update_metadata( - lora_mapping, - id_to_index, - max_loras, - 512, - lora_config.lora_extra_vocab_size, - long_lora_context=long_lora_context, - ) - # lora_rope.set_mapping(*mapping_info) - - positions = torch.randint(0, max_position, (batch_size, seq_len)) - query = torch.randn(batch_size, - seq_len, - num_heads * head_size, - dtype=dtype) - key = torch.randn_like(query) - ref_q, ref_k = linear_rope(positions, query, key) - actual_q, actual_k = lora_rope(positions, query, key) - - torch.allclose(ref_q, actual_q) - torch.allclose(ref_k, actual_k) - - @pytest.mark.parametrize("tp_size", [1, 2, 4, 8]) @pytest.mark.parametrize( "seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS))) diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 7026f705026fb..e3a054bd62064 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -28,6 +28,14 @@ EXPECTED_LORA_OUTPUT = [ ] +@pytest.fixture(autouse=True) +def v1(run_with_both_engines_lora): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 @@ -39,6 +47,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: ] sampling_params = 
vllm.SamplingParams(temperature=0, max_tokens=256, + skip_special_tokens=False, stop=["[/assistant]"]) outputs = llm.generate( prompts, @@ -71,16 +80,6 @@ def generate_and_test(llm, sql_lora_files): print("removing lora") -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - -# V1 Test: Failing due to numerics on V1. -@pytest.mark.skip_v1 @create_new_process_for_each_test() def test_llama_lora(sql_lora_files): @@ -90,7 +89,6 @@ def test_llama_lora(sql_lora_files): # also test odd max_num_seqs max_num_seqs=13, max_loras=4, - tensor_parallel_size=1, enable_chunked_prefill=True) generate_and_test(llm, sql_lora_files) @@ -126,8 +124,6 @@ def test_llama_lora_warmup(sql_lora_files): "less when using lora than when not using lora") -# V1 Test: Failing due to numerics on V1. -@pytest.mark.skip_v1 @multi_gpu_test(num_gpus=4) @create_new_process_for_each_test() def test_llama_lora_tp4(sql_lora_files): @@ -157,20 +153,3 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): enable_chunked_prefill=True, ) generate_and_test(llm, sql_lora_files) - - -@multi_gpu_test(num_gpus=4) -@create_new_process_for_each_test() -def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files): - - llm = vllm.LLM( - MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - tensor_parallel_size=4, - fully_sharded_loras=True, - enable_lora_bias=True, - enable_chunked_prefill=True, - ) - generate_and_test(llm, sql_lora_files) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index db6a6ec78fa2f..576d95a471547 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -7,7 +7,6 @@ import torch from safetensors.torch import load_file from torch import nn -from vllm import envs from vllm.config import LoRAConfig from vllm.lora.layers import 
(ColumnParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA, @@ -33,6 +32,17 @@ DEVICES = ([ ] if current_platform.is_cuda_alike() else ["cpu"]) +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch: pytest.MonkeyPatch): + """ + Some tests depend on V0 internals. Since both V0 and V1 use the same + LoRAModelManager it is okay to just test V0. + """ + with monkeypatch.context() as m: + m.setenv('VLLM_USE_V1', '0') + yield + + @pytest.mark.parametrize("device", DEVICES) def test_from_lora_tensors(sql_lora_files, device): tensors = load_file( @@ -411,7 +421,6 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device): assert manager.device == device -@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.") @pytest.mark.parametrize("device", DEVICES) def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, sql_lora_files, device): @@ -491,7 +500,6 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, device) -@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.") @pytest.mark.parametrize("device", DEVICES) def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, sql_lora_files, device): diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py index ee0d7b5da3a99..24242b8a17594 100644 --- a/tests/lora/test_minicpmv_tp.py +++ b/tests/lora/test_minicpmv_tp.py @@ -58,7 +58,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: @pytest.mark.xfail( current_platform.is_rocm(), reason="MiniCPM-V dependency xformers incompatible with ROCm") -@create_new_process_for_each_test() def test_minicpmv_lora(minicpmv_lora_files): llm = vllm.LLM( MODEL_PATH, @@ -67,8 +66,12 @@ def test_minicpmv_lora(minicpmv_lora_files): max_loras=2, max_lora_rank=8, enforce_eager=True, + max_model_len=2048, + limit_mm_per_prompt={ + "image": 2, + "video": 0 + }, trust_remote_code=True, - enable_chunked_prefill=True, ) output1 = 
do_sample(llm, minicpmv_lora_files, lora_id=1) for i in range(len(EXPECTED_OUTPUT)): @@ -78,6 +81,8 @@ def test_minicpmv_lora(minicpmv_lora_files): assert EXPECTED_OUTPUT[i].startswith(output2[i]) +@pytest.mark.skipif(current_platform.is_cuda_alike(), + reason="Skipping to avoid redundant model tests") @pytest.mark.xfail( current_platform.is_rocm(), reason="MiniCPM-V dependency xformers incompatible with ROCm") @@ -90,15 +95,19 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files): max_loras=4, max_lora_rank=64, tensor_parallel_size=4, + limit_mm_per_prompt={ + "image": 2, + "video": 0 + }, trust_remote_code=True, - enforce_eager=True, - enable_chunked_prefill=True, ) output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) for i in range(len(EXPECTED_OUTPUT)): assert EXPECTED_OUTPUT[i].startswith(output_tp[i]) +@pytest.mark.skipif(current_platform.is_cuda_alike(), + reason="Skipping to avoid redundant model tests") @pytest.mark.xfail( current_platform.is_rocm(), reason="MiniCPM-V dependency xformers incompatible with ROCm") @@ -112,8 +121,11 @@ def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files): max_lora_rank=8, tensor_parallel_size=4, trust_remote_code=True, + limit_mm_per_prompt={ + "image": 1, + "video": 0 + }, fully_sharded_loras=True, - enable_chunked_prefill=True, ) output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) for i in range(len(EXPECTED_OUTPUT)): diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py index 8596d3999799c..7375cabbc36d9 100644 --- a/tests/lora/test_phi.py +++ b/tests/lora/test_phi.py @@ -10,6 +10,14 @@ MODEL_PATH = "microsoft/phi-2" PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501 +@pytest.fixture(autouse=True) +def v1(run_with_both_engines_lora): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + def do_sample(llm: vllm.LLM, lora_path: str, 
lora_id: int) -> list[str]: prompts = [ PROMPT_TEMPLATE.format( @@ -48,14 +56,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: return generated_texts -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - # Skipping for V1 for now as we are hitting, # "Head size 80 is not supported by FlashAttention." error. @pytest.mark.skip_v1 diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py index 726d0c5f2f0d1..add313c945446 100644 --- a/tests/lora/test_punica_ops.py +++ b/tests/lora/test_punica_ops.py @@ -13,6 +13,11 @@ from vllm.platforms import current_platform from .utils import PunicaTensors, assert_close, generate_data_for_nslices +@pytest.fixture(autouse=True) +def reset_device(reset_default_device): + pass + + # Utility shrink and expand operations used as reference implementations. 
def sgmv_shrink_for_nslices( nslices: int, inputs_tensor: torch.Tensor, diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index d607bf66ebd45..caf71976a2608 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -37,6 +37,14 @@ else: ] +@pytest.fixture(autouse=True) +def v1(run_with_both_engines_lora): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, @@ -69,21 +77,8 @@ def do_sample(llm: vllm.LLM, return generated_texts -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("tp_size", [1]) -def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model, - tp_size): - if num_gpus_available < tp_size and \ - tp_size > 1 and current_platform.is_cuda_alike(): - pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") +def test_quant_model_lora(tinyllama_lora_files, model): llm = vllm.LLM( model=model.model_path, @@ -91,7 +86,6 @@ def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model, max_num_seqs=16, max_loras=4, max_model_len=400, - tensor_parallel_size=tp_size, gpu_memory_utilization=0.2, #avoid OOM quantization=model.quantization, trust_remote_code=True, @@ -185,7 +179,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, enable_lora=True, max_num_seqs=16, max_loras=4, - tensor_parallel_size=1, gpu_memory_utilization=0.2, #avoid OOM quantization=model.quantization, trust_remote_code=True, diff --git a/tests/lora/test_resolver.py b/tests/lora/test_resolver.py new file mode 100644 index 0000000000000..8ebc2ae98fc43 --- /dev/null +++ 
b/tests/lora/test_resolver.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import Optional + +import pytest + +from vllm.lora.request import LoRARequest +from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry + + +class DummyLoRAResolver(LoRAResolver): + """A dummy LoRA resolver for testing.""" + + async def resolve_lora(self, base_model_name: str, + lora_name: str) -> Optional[LoRARequest]: + if lora_name == "test_lora": + return LoRARequest( + lora_name=lora_name, + lora_path=f"/dummy/path/{base_model_name}/{lora_name}", + lora_int_id=abs(hash(lora_name))) + return None + + +def test_resolver_registry_registration(): + """Test basic resolver registration functionality.""" + registry = LoRAResolverRegistry + resolver = DummyLoRAResolver() + + # Register a new resolver + registry.register_resolver("dummy", resolver) + assert "dummy" in registry.get_supported_resolvers() + + # Get registered resolver + retrieved_resolver = registry.get_resolver("dummy") + assert retrieved_resolver is resolver + + +def test_resolver_registry_duplicate_registration(): + """Test registering a resolver with an existing name.""" + registry = LoRAResolverRegistry + resolver1 = DummyLoRAResolver() + resolver2 = DummyLoRAResolver() + + registry.register_resolver("dummy", resolver1) + registry.register_resolver("dummy", resolver2) + + assert registry.get_resolver("dummy") is resolver2 + + +def test_resolver_registry_unknown_resolver(): + """Test getting a non-existent resolver.""" + registry = LoRAResolverRegistry + + with pytest.raises(KeyError, match="not found"): + registry.get_resolver("unknown_resolver") + + +@pytest.mark.asyncio +async def test_dummy_resolver_resolve(): + """Test the dummy resolver's resolve functionality.""" + dummy_resolver = DummyLoRAResolver() + base_model_name = "base_model_test" + lora_name = "test_lora" + + # Test successful resolution + result = await dummy_resolver.resolve_lora(base_model_name, lora_name) + assert 
isinstance(result, LoRARequest) + assert result.lora_name == lora_name + assert result.lora_path == f"/dummy/path/{base_model_name}/{lora_name}" + + # Test failed resolution + result = await dummy_resolver.resolve_lora(base_model_name, + "nonexistent_lora") + assert result is None diff --git a/tests/lora/test_transfomers_model.py b/tests/lora/test_transfomers_model.py index f65fb1cdbbd56..63907f2c1d02c 100644 --- a/tests/lora/test_transfomers_model.py +++ b/tests/lora/test_transfomers_model.py @@ -4,6 +4,7 @@ import pytest import vllm from vllm.lora.request import LoRARequest +from vllm.platforms import current_platform from ..utils import create_new_process_for_each_test, multi_gpu_test @@ -46,23 +47,12 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: return generated_texts -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - -@pytest.mark.skip_v1 -@create_new_process_for_each_test() def test_ilama_lora(ilama_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, enable_lora=True, max_loras=4, max_lora_rank=16, - tensor_parallel_size=1, trust_remote_code=True, enable_chunked_prefill=True) @@ -74,7 +64,8 @@ def test_ilama_lora(ilama_lora_files): assert output2[i] == EXPECTED_LORA_OUTPUT[i] -@pytest.mark.skip_v1 +@pytest.mark.skipif(current_platform.is_cuda_alike(), + reason="Skipping to avoid redundant model tests") @multi_gpu_test(num_gpus=4) @create_new_process_for_each_test() def test_ilama_lora_tp4(ilama_lora_files): @@ -96,7 +87,8 @@ def test_ilama_lora_tp4(ilama_lora_files): assert output2[i] == EXPECTED_LORA_OUTPUT[i] -@pytest.mark.skip_v1 +@pytest.mark.skipif(current_platform.is_cuda_alike(), + reason="Skipping to avoid redundant model tests") @multi_gpu_test(num_gpus=4) @create_new_process_for_each_test() def 
test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files): diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py index 34a26e9edf36a..1c90cedf1a16a 100644 --- a/tests/lora/test_utils.py +++ b/tests/lora/test_utils.py @@ -9,7 +9,6 @@ from torch import nn from vllm.lora.utils import (get_adapter_absolute_path, parse_fine_tuned_lora_name, replace_submodule) -from vllm.utils import LRUCache def test_parse_fine_tuned_lora_name_valid(): @@ -85,114 +84,6 @@ def test_replace_submodule(): assert dict(model.named_modules())["seq1.dense2"] == dense2 -class TestLRUCache(LRUCache): - - def _on_remove(self, key, value): - if not hasattr(self, "_remove_counter"): - self._remove_counter = 0 - self._remove_counter += 1 - - -def test_lru_cache(): - cache = TestLRUCache(3) - - cache.put(1, 1) - assert len(cache) == 1 - - cache.put(1, 1) - assert len(cache) == 1 - - cache.put(2, 2) - assert len(cache) == 2 - - cache.put(3, 3) - assert len(cache) == 3 - assert set(cache.cache) == {1, 2, 3} - - cache.put(4, 4) - assert len(cache) == 3 - assert set(cache.cache) == {2, 3, 4} - assert cache._remove_counter == 1 - assert cache.get(2) == 2 - - cache.put(5, 5) - assert set(cache.cache) == {2, 4, 5} - assert cache._remove_counter == 2 - - assert cache.pop(5) == 5 - assert len(cache) == 2 - assert set(cache.cache) == {2, 4} - assert cache._remove_counter == 3 - - cache.pop(10) - assert len(cache) == 2 - assert set(cache.cache) == {2, 4} - assert cache._remove_counter == 3 - - cache.get(10) - assert len(cache) == 2 - assert set(cache.cache) == {2, 4} - assert cache._remove_counter == 3 - - cache.put(6, 6) - assert len(cache) == 3 - assert set(cache.cache) == {2, 4, 6} - assert 2 in cache - assert 4 in cache - assert 6 in cache - - cache.remove_oldest() - assert len(cache) == 2 - assert set(cache.cache) == {2, 6} - assert cache._remove_counter == 4 - - cache.clear() - assert len(cache) == 0 - assert cache._remove_counter == 6 - - cache._remove_counter = 0 - - cache[1] = 1 - assert 
len(cache) == 1 - - cache[1] = 1 - assert len(cache) == 1 - - cache[2] = 2 - assert len(cache) == 2 - - cache[3] = 3 - assert len(cache) == 3 - assert set(cache.cache) == {1, 2, 3} - - cache[4] = 4 - assert len(cache) == 3 - assert set(cache.cache) == {2, 3, 4} - assert cache._remove_counter == 1 - assert cache[2] == 2 - - cache[5] = 5 - assert set(cache.cache) == {2, 4, 5} - assert cache._remove_counter == 2 - - del cache[5] - assert len(cache) == 2 - assert set(cache.cache) == {2, 4} - assert cache._remove_counter == 3 - - cache.pop(10) - assert len(cache) == 2 - assert set(cache.cache) == {2, 4} - assert cache._remove_counter == 3 - - cache[6] = 6 - assert len(cache) == 3 - assert set(cache.cache) == {2, 4, 6} - assert 2 in cache - assert 4 in cache - assert 6 in cache - - # Unit tests for get_adapter_absolute_path @patch('os.path.isabs') def test_get_adapter_absolute_path_absolute(mock_isabs): diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 8ddcefd9191ac..e71c87ff3fc82 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -248,8 +248,10 @@ def test_metric_spec_decode( dtype=dtype, disable_log_stats=False, gpu_memory_utilization=0.4, - speculative_model=model, - num_speculative_tokens=k, + speculative_config={ + "model": model, + "num_speculative_tokens": k, + }, ) as vllm_model: # Force log interval to be 0 to catch all metrics. 
@@ -300,8 +302,10 @@ def test_metric_spec_decode_interval( dtype=dtype, disable_log_stats=False, gpu_memory_utilization=0.4, - speculative_model=model, - num_speculative_tokens=k, + speculative_config={ + "model": model, + "num_speculative_tokens": k, + }, enforce_eager=True, ) diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 24147b741278b..ac2e0f3542e78 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -7,6 +7,10 @@ from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.activation import (GeluAndMul, ReLUSquaredActivation, SiluAndMul) +from vllm.model_executor.layers.fused_moe.fused_moe import ( + dispatch_fused_experts_func, dispatch_topk_func, + torch_vllm_inplace_fused_experts, torch_vllm_outplace_fused_experts, + vllm_topk_softmax) from vllm.model_executor.layers.layernorm import ( RMSNorm, dispatch_cuda_rmsnorm_func, fused_add_rms_norm, rms_norm, rocm_aiter_fused_add_rms_norm, rocm_aiter_rms_norm) @@ -92,6 +96,38 @@ def test_enabled_ops_invalid(env: str): RMSNorm(1024).enabled() +@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) +def test_topk_dispatch(use_rocm_aiter: str, monkeypatch): + monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) + topk_func = dispatch_topk_func() + + if current_platform.is_rocm() and int(use_rocm_aiter): + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + rocm_aiter_topk_softmax) + + assert topk_func == rocm_aiter_topk_softmax + else: + assert topk_func == vllm_topk_softmax + + +@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) +@pytest.mark.parametrize("inplace", [True, False]) +def test_fused_experts_dispatch(use_rocm_aiter: str, inplace: bool, + monkeypatch): + + monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) + fused_experts_func = dispatch_fused_experts_func(inplace) + if current_platform.is_rocm() and 
int(use_rocm_aiter): + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + rocm_aiter_fused_experts) + + assert fused_experts_func == rocm_aiter_fused_experts + elif inplace: + assert fused_experts_func == torch_vllm_inplace_fused_experts + else: + assert fused_experts_func == torch_vllm_outplace_fused_experts + + @pytest.mark.parametrize("add_residual", [True, False]) @pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) @pytest.mark.parametrize("use_rocm_aiter_norm", ["0", "1"]) diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index 83ece5d22bfb3..e9dcba8ec0899 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -1,17 +1,19 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional +import json +from typing import Any, Optional import numpy as np import pytest import pytest_asyncio from transformers import AutoModel, AutoTokenizer -from vllm.multimodal.audio import resample_audio +from vllm.multimodal.audio import resample_audio_librosa from vllm.sequence import SampleLogprobs from ....conftest import HfRunner, VllmRunner from ....utils import RemoteOpenAIServer +from ...registry import HF_EXAMPLE_MODELS from ...utils import check_logprobs_close MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b" @@ -41,21 +43,33 @@ def audio(request): return AudioAsset(request.param) +def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]: + """Convert kwargs to CLI args.""" + args = [] + for key, value in params_kwargs.items(): + if isinstance(value, bool): + if value: + args.append(f"--{key.replace('_','-')}") + else: + args.append(f"--{key.replace('_','-')}={value}") + return args + + @pytest.fixture(params=[ pytest.param({}, marks=pytest.mark.cpu_model), pytest.param(CHUNKED_PREFILL_KWARGS), ]) def server(request, audio_assets): args = [ - "--dtype=bfloat16", 
"--max-model-len=4096", "--enforce-eager", - f"--limit-mm-per-prompt=audio={len(audio_assets)}", - "--trust-remote-code" - ] + [ - f"--{key.replace('_','-')}={value}" - for key, value in request.param.items() - ] + "--dtype", "bfloat16", "--max-model-len", "4096", "--enforce-eager", + "--limit-mm-per-prompt", + json.dumps({"audio": len(audio_assets)}), "--trust-remote-code" + ] + params_kwargs_to_cli_args(request.param) - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + with RemoteOpenAIServer(MODEL_NAME, + args, + env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": + "30"}) as remote_server: yield remote_server @@ -106,6 +120,10 @@ def run_test( **kwargs, ): """Inference result should be the same between hf and vllm.""" + model_info = HF_EXAMPLE_MODELS.find_hf_info(model) + model_info.check_available_online(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") + # NOTE: take care of the order. run vLLM first, and then run HF. # vLLM needs a fresh new process without cuda initialization. 
# if we run HF first, the cuda initialization will be done and it @@ -127,9 +145,9 @@ def run_test( [hf_prompt], max_tokens, num_logprobs=num_logprobs, - audios=[(resample_audio(audio[0], - orig_sr=audio[1], - target_sr=16000), 16000)]) + audios=[(resample_audio_librosa(audio[0], + orig_sr=audio[1], + target_sr=16000), 16000)]) for _, hf_prompt, audio in prompts_and_audios ] @@ -156,6 +174,10 @@ def run_multi_audio_test( num_logprobs: int, **kwargs, ): + model_info = HF_EXAMPLE_MODELS.find_hf_info(model) + model_info.check_available_online(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") + with vllm_runner(model, dtype=dtype, enforce_eager=True, diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py index dd34a2577a084..925e7104eaeff 100644 --- a/tests/models/decoder_only/language/test_gguf.py +++ b/tests/models/decoder_only/language/test_gguf.py @@ -9,11 +9,13 @@ from typing import NamedTuple import pytest from huggingface_hub import hf_hub_download +from pytest import MarkDecorator from transformers import AutoTokenizer from tests.quantization.utils import is_quant_method_supported from ....conftest import VllmRunner +from ....utils import multi_gpu_test from ...utils import check_logprobs_close os.environ["TOKENIZERS_PARALLELISM"] = "true" @@ -25,6 +27,7 @@ class GGUFTestConfig(NamedTuple): original_model: str gguf_repo: str gguf_filename: str + marks: list[MarkDecorator] = [] @property def gguf_model(self): @@ -35,6 +38,7 @@ LLAMA_CONFIG = GGUFTestConfig( original_model="meta-llama/Llama-3.2-1B-Instruct", gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF", gguf_filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf", + marks=[pytest.mark.quant_model], ) QWEN2_CONFIG = GGUFTestConfig( @@ -81,34 +85,24 @@ MODELS = [ ] -@pytest.mark.skipif(not is_quant_method_supported("gguf"), - reason="gguf is not supported on this GPU type.") -@pytest.mark.parametrize("model", MODELS) 
-@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("tp_size", [1, 2]) -def test_models( - num_gpus_available: int, +def check_model_outputs( vllm_runner: type[VllmRunner], - example_prompts: list[str], + prompts: list[str], model: GGUFTestConfig, dtype: str, max_tokens: int, num_logprobs: int, tp_size: int, -) -> None: - if num_gpus_available < tp_size: - pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") - +): tokenizer = AutoTokenizer.from_pretrained(model.original_model) if tokenizer.chat_template is not None: messages = [[{ 'role': 'user', 'content': prompt - }] for prompt in example_prompts] - example_prompts = tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True) + }] for prompt in prompts] + prompts = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) # Run gguf model. with vllm_runner(model_name=model.gguf_model, @@ -118,17 +112,19 @@ def test_models( max_model_len=MAX_MODEL_LEN, tensor_parallel_size=tp_size) as gguf_model: gguf_outputs = gguf_model.generate_greedy_logprobs( - example_prompts[:-1], max_tokens, num_logprobs) + prompts[:-1], max_tokens, num_logprobs) # Run unquantized model. + # Should run with tp=1, otherwise the test will stuck at + # nccl initialization. 
with vllm_runner( model_name=model.original_model, enforce_eager=True, # faster tests dtype=dtype, max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=tp_size) as original_model: + tensor_parallel_size=1) as original_model: original_outputs = original_model.generate_greedy_logprobs( - example_prompts[:-1], max_tokens, num_logprobs) + prompts[:-1], max_tokens, num_logprobs) check_logprobs_close( outputs_0_lst=original_outputs, @@ -136,3 +132,47 @@ def test_models( name_0="original", name_1="gguf", ) + + +@pytest.mark.skipif(not is_quant_method_supported("gguf"), + reason="gguf is not supported on this GPU type.") +@pytest.mark.parametrize("model", [ + pytest.param(test_config, marks=test_config.marks) + for test_config in MODELS +]) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("tp_size", [1]) +def test_models( + vllm_runner: type[VllmRunner], + example_prompts: list[str], + model: GGUFTestConfig, + dtype: str, + max_tokens: int, + num_logprobs: int, + tp_size: int, +) -> None: + check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens, + num_logprobs, tp_size) + + +@pytest.mark.skipif(not is_quant_method_supported("gguf"), + reason="gguf is not supported on this GPU type.") +@pytest.mark.parametrize("model", [LLAMA_CONFIG]) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [8]) +@pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("tp_size", [2]) +@multi_gpu_test(num_gpus=2) +def test_distributed( + vllm_runner: type[VllmRunner], + example_prompts: list[str], + model: GGUFTestConfig, + dtype: str, + max_tokens: int, + num_logprobs: int, + tp_size: int, +) -> None: + check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens, + num_logprobs, tp_size) diff --git a/tests/models/decoder_only/language/test_hybrid.py b/tests/models/decoder_only/language/test_hybrid.py 
index 60eb3830c6d8b..64a02cb8907bc 100644 --- a/tests/models/decoder_only/language/test_hybrid.py +++ b/tests/models/decoder_only/language/test_hybrid.py @@ -9,9 +9,15 @@ from vllm.sampling_params import SamplingParams from ...utils import check_outputs_equal # This test is for the hybrid models -MODELS = ["ai21labs/Jamba-tiny-dev", "Zyphra/Zamba2-1.2B-instruct"] +MODELS = [ + "ai21labs/Jamba-tiny-dev", "Zyphra/Zamba2-1.2B-instruct", + "pfnet/plamo-2-1b" +] # Bamba at Fp32 is too big for the CI (L4 GPU). # MODELS = ["ai21labs/Jamba-tiny-dev", "ibm-ai-platform/Bamba-9B"] +# Note: Running Plamo2 in transformers implementation requires to install +# causal-conv1d package, which is not listed as a test dependency as it's +# not compatible with pip-compile. @pytest.mark.parametrize("model", MODELS) @@ -25,21 +31,11 @@ def test_models( dtype: str, max_tokens: int, ) -> None: - # numeric error produces different generation if "Bamba" in model: example_prompts.pop(3) - model_kwargs = { - "use_mamba_kernels": False, # mamba kernels are not installed so HF - # don't use them - } - if "Zamba2" in model: - # Zamba2 HF implementation automatically checks if mamba kernels are - # installed - model_kwargs = {} - - with hf_runner(model, dtype=dtype, model_kwargs=model_kwargs) as hf_model: + with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) with vllm_runner(model, dtype=dtype) as vllm_model: @@ -94,6 +90,10 @@ def test_mamba_prefill_chunking_with_parallel_sampling( # correctly for n > 1 decoding steps inside a # chunked prefill forward pass (where we have both prefills # and decoding together ) + + if 'plamo-2' in model: + dtype = "float" # use a different dtype for plamo + sampling_params = SamplingParams(n=3, temperature=1, seed=0, @@ -125,20 +125,14 @@ def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts, example_prompts.pop(3) example_prompts.pop(2) dtype = "half" # use a different dtype for 
Bamba + elif "Zamba2" in model: example_prompts.pop(7) dtype = "half" + elif "plamo-2-1b" in model: + example_prompts.pop(7) - model_kwargs = { - "use_mamba_kernels": False, # mamba kernels are not installed so HF - # don't use them - } - if "Zamba2" in model: - # Zamba2 HF implementation automatically checks if mamba kernels are - # installed - model_kwargs = {} - - with hf_runner(model, dtype=dtype, model_kwargs=model_kwargs) as hf_model: + with hf_runner(model, dtype=dtype) as hf_model: non_chunked = hf_model.generate_greedy(example_prompts, max_tokens) with vllm_runner(model, @@ -208,7 +202,8 @@ def test_mamba_cache_cg_padding( # This test is for verifying that mamba cache is padded to CG captured # batch size. If it's not, a torch RuntimeError will be raised because # tensor dimensions aren't compatible - vllm_config = EngineArgs(model=model).create_engine_config() + vllm_config = EngineArgs(model=model, + trust_remote_code=True).create_engine_config() while len(example_prompts) == vllm_config.pad_for_cudagraph( len(example_prompts)): example_prompts.append(example_prompts[0]) diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index 4c2055361d445..ec885386dd940 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -174,15 +174,8 @@ SAMPLE_JSON_SCHEMA = { @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: +def test_models(hf_runner, vllm_runner, example_prompts, model: str, + dtype: str, max_tokens: int, num_logprobs: int) -> None: # TODO(sang): Sliding window should be tested separately. 
with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( @@ -206,14 +199,8 @@ def test_models( @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_mistral_format( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: +def test_mistral_format(vllm_runner, example_prompts, model: str, dtype: str, + max_tokens: int, num_logprobs: int) -> None: with vllm_runner( model, dtype=dtype, @@ -244,11 +231,8 @@ def test_mistral_format( @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) -def test_mistral_symbolic_languages( - vllm_runner, - model: str, - dtype: str, -) -> None: +def test_mistral_symbolic_languages(vllm_runner, model: str, + dtype: str) -> None: with vllm_runner(model, dtype=dtype, max_model_len=8192, @@ -266,11 +250,7 @@ def test_mistral_symbolic_languages( @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) # v1 can't do func calling -def test_mistral_function_calling( - vllm_runner, - model: str, - dtype: str, -) -> None: +def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None: with vllm_runner(model, dtype=dtype, tokenizer_mode="mistral", @@ -301,11 +281,8 @@ def test_mistral_function_calling( @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("guided_backend", ["outlines", "lm-format-enforcer", "xgrammar"]) -def test_mistral_guided_decoding( - vllm_runner, - model: str, - guided_backend: str, -) -> None: +def test_mistral_guided_decoding(vllm_runner, model: str, + guided_backend: str) -> None: with vllm_runner(model, dtype='bfloat16', tokenizer_mode="mistral") as vllm_model: diff --git a/tests/models/decoder_only/vision_language/test_interleaved.py b/tests/models/decoder_only/vision_language/test_interleaved.py new 
file mode 100644 index 0000000000000..8804497ae616f --- /dev/null +++ b/tests/models/decoder_only/vision_language/test_interleaved.py @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset + +models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"] + + +def base_prompt(modalities_str: str) -> str: + return f"<|im_start|>user {modalities_str}\nDescribe what you see from these items.<|im_end|><|im_start|>assistant\n" # noqa: E501 + + +INTERLEAVED_PROMPT = base_prompt("