diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh index 686f70dbece6c..69b6b146b3549 100644 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -43,7 +43,7 @@ main() { - # The figures should be genereated by a separate process outside the CI/CD pipeline + # The figures should be generated by a separate process outside the CI/CD pipeline # # generate figures # python3 -m pip install tabulate pandas matplotlib diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh index 3f38cf5137535..32bd34c431c89 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -301,6 +301,104 @@ run_serving_tests() { kill_gpu_processes } +run_genai_perf_tests() { + # run genai-perf tests + + # $1: a json file specifying genai-perf test cases + local genai_perf_test_file + genai_perf_test_file=$1 + + # Iterate over genai-perf tests + jq -c '.[]' "$genai_perf_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # prepend the current serving engine to the test name + test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} + + # get common parameters + common_params=$(echo "$params" | jq -r '.common_parameters') + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + reuse_server=$(echo "$common_params" | jq -r '.reuse_server') + + # get client and server arguments + server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if there are enough GPUs to run the test + if [[ $gpu_count -lt $tp ]]; then + echo "Required num-shard $tp but only $gpu_count GPUs found. Skip test case $test_name." + continue + fi + + if [[ $reuse_server == "true" ]]; then + echo "Reuse previous server for test case $test_name" + else + kill_gpu_processes + bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ + "$server_params" "$common_params" + fi + + if wait_for_server; then + echo "" + echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." + else + echo "" + echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period." + break + fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps=$num_prompts + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + backend=$CURRENT_LLM_SERVING_ENGINE + + if [[ "$backend" == *"vllm"* ]]; then + backend="vllm" + fi + # TODO: add output dir.
+ client_command="genai-perf profile \ + -m $model \ + --service-kind openai \ + --backend vllm \ + --endpoint-type chat \ + --streaming \ + --url localhost:$port \ + --request-rate $qps \ + --num-prompts $num_prompts \ + " + + echo "Client command: $client_command" + + eval "$client_command" + + # TODO: process/record outputs + done + done + + kill_gpu_processes + +} prepare_dataset() { @@ -328,12 +426,17 @@ main() { pip install -U transformers + pip install -r requirements-dev.txt + which genai-perf + # check storage df -h ensure_installed wget ensure_installed curl ensure_installed jq + # genai-perf dependency + ensure_installed libb64-0d prepare_dataset @@ -345,6 +448,10 @@ main() { # run the test run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json" + # run genai-perf tests + run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json" + mv artifacts/ $RESULTS_FOLDER/ + # upload benchmark results to buildkite python3 -m pip install tabulate pandas python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py" diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json new file mode 100644 index 0000000000000..edbe9f2df0ce0 --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json @@ -0,0 +1,23 @@ +[ + { + "test_name": "llama8B_tp1_genai_perf", + "qps_list": [4,8,16,32], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "tp": 1, + "port": 8000, + "num_prompts": 500, + "reuse_server": false + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "genai_perf_input_parameters": { + } + } +] \ No newline at end of file diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 1a4dae8f65e99..e19ace782feb5 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -30,7 +30,7 @@ function cpu_tests() { # offline inference docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c " set -e - python3 examples/offline_inference/offline_inference.py" + python3 examples/offline_inference/basic.py" # Run basic model test docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " @@ -61,7 +61,7 @@ function cpu_tests() { pytest -s -v -k cpu_model \ tests/basic_correctness/test_chunked_prefill.py" - # online inference + # online serving docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e export VLLM_CPU_KVCACHE_SPACE=10 @@ -75,8 +75,14 @@ function cpu_tests() { --num-prompts 20 \ --endpoint /v1/completions \ --tokenizer facebook/opt-125m" + + # Run multi-lora tests + docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + set -e + pytest -s -v \ + tests/lora/test_qwen2vl.py" } -# All of CPU tests are expected to be finished less than 25 mins. +# All CPU tests are expected to finish in less than 40 mins.
export -f cpu_tests -timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" +timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh index 1e5ff77895a38..3e4e409466b8a 100644 --- a/.buildkite/run-gh200-test.sh +++ b/.buildkite/run-gh200-test.sh @@ -24,5 +24,5 @@ remove_docker_container # Run the image and test offline inference docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' - python3 examples/offline_inference/offline_inference.py + python3 examples/offline_inference/basic.py ' diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh index a50570ab53438..1edcb1d2669e9 100644 --- a/.buildkite/run-hpu-test.sh +++ b/.buildkite/run-hpu-test.sh @@ -8,9 +8,17 @@ set -ex docker build -t hpu-test-env -f Dockerfile.hpu . # Setup cleanup +# Certain versions of the HPU software stack have a bug that can +# override the script's exit code, so we need separate +# remove_docker_container and remove_docker_container_and_exit +# functions here, while other platforms only need a single +# remove_docker_container function. +EXITCODE=1 remove_docker_container() { docker rm -f hpu-test || true; } -trap remove_docker_container EXIT +remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; } +trap remove_docker_container_and_exit EXIT remove_docker_container # Run the image and launch offline inference -docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/offline_inference.py \ No newline at end of file +docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py +EXITCODE=$?
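The HPU cleanup above is worth a second look: the saved-status trap pattern generalizes to any test harness whose cleanup must not mask the test result. A minimal sketch of the same pattern, with hypothetical container, image, and script names:

```bash
#!/bin/bash
# Pessimistic default: if the test never runs, report failure.
EXITCODE=1
remove_container() { docker rm -f my-test-container || true; }
remove_container_and_exit() { remove_container; exit $EXITCODE; }
trap remove_container_and_exit EXIT

docker run --name=my-test-container my-image python3 my_test.py
# Capture the test's status *before* the EXIT trap fires, so a buggy
# cleanup path cannot override it.
EXITCODE=$?
```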
diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index 52d485939b1d0..189714ebb6d75 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/run-neuron-test.sh @@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ --name "${container_name}" \ ${image_name} \ - /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/offline_inference_neuron.py" + /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py" diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh index 380f7a44a429a..6159b21ff8206 100755 --- a/.buildkite/run-openvino-test.sh +++ b/.buildkite/run-openvino-test.sh @@ -13,4 +13,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/offline_inference.py +docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh index a8f021890f742..650af0fac4c61 100644 --- a/.buildkite/run-tpu-test.sh +++ b/.buildkite/run-tpu-test.sh @@ -23,4 +23,4 @@ docker run --privileged --net host --shm-size=16G -it \ && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ && python3 /workspace/vllm/tests/tpu/test_compilation.py \ && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \ - && python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py" + && python3 /workspace/vllm/examples/offline_inference/tpu.py" diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh index 160e10aa3bb9b..4d344e58db8ac 100644 --- a/.buildkite/run-xpu-test.sh +++ b/.buildkite/run-xpu-test.sh @@ -14,6 +14,6 @@ remove_docker_container # Run the image and test offline inference/tensor parallel docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c ' - python3 examples/offline_inference/offline_inference.py - python3 examples/offline_inference/offline_inference_cli.py -tp 2 + python3 examples/offline_inference/basic.py + python3 examples/offline_inference/cli.py -tp 2 ' diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index e288f8f30159a..d2b140e718501 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -106,7 +106,7 @@ steps: source_file_dependencies: - vllm/ commands: - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process @@ -125,11 +125,15 @@ steps: - tests/distributed - tests/spec_decode/e2e/test_integration_dist_tp4 - tests/compile + - 
examples/offline_inference/rlhf.py commands: - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py + # TODO: create a dedicated test section for multi-GPU example tests + # when we have multiple distributed example tests + - python3 ../examples/offline_inference/rlhf.py - label: Metrics, Tracing Test # 10min num_gpus: 2 @@ -187,19 +191,19 @@ steps: - examples/ commands: - pip install tensorizer # for tensorizer test - - python3 offline_inference/offline_inference.py + - python3 offline_inference/basic.py - python3 offline_inference/cpu_offload.py - - python3 offline_inference/offline_inference_chat.py - - python3 offline_inference/offline_inference_with_prefix.py + - python3 offline_inference/chat.py + - python3 offline_inference/prefix_caching.py - python3 offline_inference/llm_engine_example.py - - python3 offline_inference/offline_inference_vision_language.py - - python3 offline_inference/offline_inference_vision_language_multi_image.py + - python3 offline_inference/vision_language.py + - python3 offline_inference/vision_language_multi_image.py - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/offline_inference_encoder_decoder.py - - python3 offline_inference/offline_inference_classification.py - - python3 offline_inference/offline_inference_embedding.py - - python3 offline_inference/offline_inference_scoring.py - - python3 offline_inference/offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2 + - python3 offline_inference/encoder_decoder.py + - python3 offline_inference/classification.py + - python3 offline_inference/embedding.py + - python3 offline_inference/scoring.py + - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - label: Prefix Caching Test # 9min mirror_hardwares: [amd] @@ -214,6 +218,7 @@ steps: - vllm/model_executor/layers - vllm/sampling_metadata.py - tests/samplers + - tests/conftest.py commands: - pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers @@ -229,13 +234,15 @@ steps: - pytest -v -s test_logits_processor.py - pytest -v -s model_executor/test_guided_processors.py -- label: Speculative decoding tests # 30min +- label: Speculative decoding tests # 40min source_file_dependencies: - vllm/spec_decode - tests/spec_decode + - vllm/model_executor/models/eagle.py commands: - pytest -v -s spec_decode/e2e/test_multistep_correctness.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py + - pytest -v -s spec_decode/e2e/test_eagle_correctness.py - label: LoRA Test %N # 15min each mirror_hardwares: [amd] @@ -367,6 +374,7 @@ steps: - tests/models/encoder_decoder/vision_language commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model' - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model' - pytest -v -s models/embedding/vision_language -m core_model @@ -457,7 +465,10 @@ steps: - vllm/worker/worker_base.py - vllm/worker/worker.py - 
vllm/worker/model_runner.py + - entrypoints/llm/test_collective_rpc.py commands: + - pytest -v -s entrypoints/llm/test_collective_rpc.py + - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml deleted file mode 100644 index 0226cf0ca00e9..0000000000000 --- a/.github/workflows/actionlint.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: Lint GitHub Actions workflows -on: - push: - branches: - - "main" - paths: - - '.github/workflows/*.ya?ml' - - '.github/workflows/actionlint.*' - - '.github/workflows/matchers/actionlint.json' - pull_request: - branches: - - "main" - paths: - - '.github/workflows/*.ya?ml' - - '.github/workflows/actionlint.*' - - '.github/workflows/matchers/actionlint.json' - -env: - LC_ALL: en_US.UTF-8 - -defaults: - run: - shell: bash - -permissions: - contents: read - -jobs: - actionlint: - runs-on: ubuntu-latest - steps: - - name: "Checkout" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: "Run actionlint" - run: | - echo "::add-matcher::.github/workflows/matchers/actionlint.json" - tools/actionlint.sh -color diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml deleted file mode 100644 index 68149d2dc019f..0000000000000 --- a/.github/workflows/clang-format.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: clang-format - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - '**/*.h' - - '**/*.cpp' - - '**/*.cu' - - '**/*.cuh' - - '.github/workflows/clang-format.yml' - pull_request: - branches: - - main - paths: - - '**/*.h' - - '**/*.cpp' - - '**/*.cu' - - '**/*.cuh' - - '.github/workflows/clang-format.yml' - -jobs: - clang-format: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.11"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install clang-format==18.1.5 - - name: Running clang-format - run: | - EXCLUDES=( - 'csrc/moe/topk_softmax_kernels.cu' - 'csrc/quantization/gguf/ggml-common.h' - 'csrc/quantization/gguf/dequantize.cuh' - 'csrc/quantization/gguf/vecdotq.cuh' - 'csrc/quantization/gguf/mmq.cuh' - 'csrc/quantization/gguf/mmvq.cuh' - ) - find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ - | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \ - | xargs clang-format --dry-run --Werror diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml deleted file mode 100644 index 68887adaae54b..0000000000000 --- a/.github/workflows/codespell.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: codespell - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - "**/*.py" - - "**/*.md" - - "**/*.rst" - - pyproject.toml - - requirements-lint.txt - - .github/workflows/codespell.yml - pull_request: - branches: - - main - paths: - - "**/*.py" - - "**/*.md" - - "**/*.rst" - - pyproject.toml - - 
requirements-lint.txt - - .github/workflows/codespell.yml - -jobs: - codespell: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Spelling check with codespell - run: | - codespell --toml pyproject.toml diff --git a/.github/workflows/dummy.yml b/.github/workflows/dummy.yml new file mode 100644 index 0000000000000..ea507fab6b2de --- /dev/null +++ b/.github/workflows/dummy.yml @@ -0,0 +1,20 @@ +name: dummy-checks + +on: + pull_request: + +jobs: + mypy: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.12"] + steps: + - run: echo "This is a dummy step that always passes" + ruff: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.12"] + steps: + - run: echo "This is a dummy step that always passes" diff --git a/.github/workflows/matchers/ruff.json b/.github/workflows/matchers/ruff.json deleted file mode 100644 index f6d4479ee1996..0000000000000 --- a/.github/workflows/matchers/ruff.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "problemMatcher": [ - { - "owner": "ruff", - "pattern": [ - { - "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$", - "file": 1, - "line": 2, - "column": 3, - "code": 4, - "message": 5 - } - ] - } - ] - } diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml deleted file mode 100644 index 73eeacf1fa562..0000000000000 --- a/.github/workflows/mypy.yaml +++ /dev/null @@ -1,51 +0,0 @@ -name: mypy - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - '**/*.py' - - '.github/workflows/mypy.yaml' - - 'tools/mypy.sh' - - 'pyproject.toml' - pull_request: - branches: - - main - # This workflow is only relevant when one of the following files changes. - # However, we have github configured to expect and require this workflow - # to run and pass before github with auto-merge a pull request. Until github - # allows more flexible auto-merge policy, we can just run this on every PR. - # It doesn't take that long to run, anyway. 
- #paths: - # - '**/*.py' - # - '.github/workflows/mypy.yaml' - # - 'tools/mypy.sh' - # - 'pyproject.toml' - -jobs: - mypy: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.9", "3.10", "3.11", "3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install mypy==1.11.1 - pip install types-setuptools - pip install types-PyYAML - pip install types-requests - pip install types-setuptools - - name: Mypy - run: | - echo "::add-matcher::.github/workflows/matchers/mypy.json" - tools/mypy.sh 1 ${{ matrix.python-version }} diff --git a/.github/workflows/png-lint.yml b/.github/workflows/png-lint.yml deleted file mode 100644 index 4932af943a07b..0000000000000 --- a/.github/workflows/png-lint.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Lint PNG exports from excalidraw -on: - push: - branches: - - "main" - paths: - - '*.excalidraw.png' - - '.github/workflows/png-lint.yml' - pull_request: - branches: - - "main" - paths: - - '*.excalidraw.png' - - '.github/workflows/png-lint.yml' - -env: - LC_ALL: en_US.UTF-8 - -defaults: - run: - shell: bash - -permissions: - contents: read - -jobs: - actionlint: - runs-on: ubuntu-latest - steps: - - name: "Checkout" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: "Run png-lint.sh to check excalidraw exported images" - run: | - tools/png-lint.sh diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000000000..8c72a709cf330 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,17 @@ +name: pre-commit + +on: + pull_request: + push: + branches: [main] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: "3.12" + - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" + - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml deleted file mode 100644 index 7266cc378cfb0..0000000000000 --- a/.github/workflows/ruff.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: ruff - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - "**/*.py" - - pyproject.toml - - requirements-lint.txt - - .github/workflows/matchers/ruff.json - - .github/workflows/ruff.yml - pull_request: - branches: - - main - # This workflow is only relevant when one of the following files changes. - # However, we have github configured to expect and require this workflow - # to run and pass before github with auto-merge a pull request. Until github - # allows more flexible auto-merge policy, we can just run this on every PR. - # It doesn't take that long to run, anyway. 
- #paths: - # - "**/*.py" - # - pyproject.toml - # - requirements-lint.txt - # - .github/workflows/matchers/ruff.json - # - .github/workflows/ruff.yml - -jobs: - ruff: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Analysing the code with ruff - run: | - echo "::add-matcher::.github/workflows/matchers/ruff.json" - ruff check --output-format github . - - name: Run isort - run: | - isort . --check-only diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml deleted file mode 100644 index 4b1587e373e17..0000000000000 --- a/.github/workflows/shellcheck.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Lint shell scripts -on: - push: - branches: - - "main" - paths: - - '**/*.sh' - - '.github/workflows/shellcheck.yml' - pull_request: - branches: - - "main" - paths: - - '**/*.sh' - - '.github/workflows/shellcheck.yml' - -env: - LC_ALL: en_US.UTF-8 - -defaults: - run: - shell: bash - -permissions: - contents: read - -jobs: - shellcheck: - runs-on: ubuntu-latest - steps: - - name: "Checkout" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: "Check shell scripts" - run: | - tools/shellcheck.sh diff --git a/.github/workflows/sphinx-lint.yml b/.github/workflows/sphinx-lint.yml deleted file mode 100644 index e0bb24276a653..0000000000000 --- a/.github/workflows/sphinx-lint.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: Lint documentation - -on: - push: - branches: - - main - paths: - - "docs/**" - pull_request: - branches: - - main - paths: - - "docs/**" - -jobs: - sphinx-lint: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Linting docs - run: tools/sphinx-lint.sh diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml deleted file mode 100644 index ff441f94435ad..0000000000000 --- a/.github/workflows/yapf.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: yapf - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - "**/*.py" - - .github/workflows/yapf.yml - pull_request: - branches: - - main - paths: - - "**/*.py" - - .github/workflows/yapf.yml - -jobs: - yapf: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install yapf==0.32.0 - pip install toml==0.10.2 - - name: Running yapf - run: | - yapf --diff --recursive . 
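The standalone lint workflows deleted above (actionlint, clang-format, codespell, mypy, png-lint, ruff, shellcheck, sphinx-lint, yapf) are consolidated into the pre-commit configuration added below. For reference, a typical local invocation with the standard pre-commit CLI (the hook IDs come from the new `.pre-commit-config.yaml`):

```bash
pip install pre-commit
pre-commit install            # run the hooks automatically on each git commit
pre-commit run --all-files    # or lint the whole repo once, as the CI job does
```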
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000000..8ea0f37885d9f --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,73 @@ +repos: +- repo: https://github.com/google/yapf + rev: v0.32.0 + hooks: + - id: yapf + args: [--in-place, --verbose] + additional_dependencies: [toml] # TODO: Remove when yapf is upgraded +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.5 + hooks: + - id: ruff + args: [--output-format, github] +- repo: https://github.com/codespell-project/codespell + rev: v2.3.0 + hooks: + - id: codespell + exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*' +- repo: https://github.com/PyCQA/isort + rev: 5.13.2 + hooks: + - id: isort +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v18.1.5 + hooks: + - id: clang-format + exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))' + types_or: [c++, cuda] + args: [--style=file, --verbose] +- repo: https://github.com/jackdewinter/pymarkdown + rev: v0.9.27 + hooks: + - id: pymarkdown + files: docs/.* +- repo: local + hooks: + - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.9 + entry: tools/mypy.sh 1 "3.9" + language: python + types: [python] + additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests] + - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.10 + entry: tools/mypy.sh 1 "3.10" + language: python + types: [python] + additional_dependencies: *mypy_deps + - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.11 + entry: tools/mypy.sh 1 "3.11" + language: python + types: [python] + additional_dependencies: *mypy_deps + - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.12 + entry: tools/mypy.sh 1 "3.12" + language: python + types: [python] + additional_dependencies: *mypy_deps + - id: shellcheck + name: Lint shell scripts + entry: tools/shellcheck.sh + language: script + types: [shell] + - id: png-lint + name: Lint PNG exports from excalidraw + entry: tools/png-lint.sh + language: script + types: [png] +- repo: https://github.com/rhysd/actionlint + rev: v1.7.6 + hooks: + - id: actionlint diff --git a/Dockerfile.cpu b/Dockerfile.cpu index f163edc27cba8..ebe226cf6d148 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -26,10 +26,10 @@ RUN pip install intel_extension_for_pytorch==2.5.0 WORKDIR /workspace -COPY requirements-build.txt requirements-build.txt ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ pip install --upgrade pip && \ pip install -r requirements-build.txt @@ -37,9 +37,9 @@ FROM cpu-test-1 AS build WORKDIR /workspace/vllm -COPY requirements-common.txt requirements-common.txt -COPY requirements-cpu.txt requirements-cpu.txt RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \ + --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \ pip install -v -r requirements-cpu.txt COPY . . 
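The Dockerfile.cpu change above swaps `COPY` for BuildKit bind mounts, so the requirements files are visible during `pip install` without ever being baked into an image layer, and editing unrelated files no longer invalidates that layer's cache. Note that `RUN --mount=...` requires BuildKit; a sketch of the build invocation (the tag name is illustrative):

```bash
# BuildKit is required for RUN --mount=type=bind / --mount=type=cache.
DOCKER_BUILDKIT=1 docker build -f Dockerfile.cpu -t vllm-cpu:dev .
```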
diff --git a/Dockerfile.hpu b/Dockerfile.hpu index 87e0c1a6a934e..66cf68c32f2ca 100644 --- a/Dockerfile.hpu +++ b/Dockerfile.hpu @@ -1,4 +1,4 @@ -FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest COPY ./ /workspace/vllm diff --git a/Dockerfile.rocm b/Dockerfile.rocm index e733994f8c33e..e922cb207b899 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -51,10 +51,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \ *"rocm-6.2"*) \ python3 -m pip uninstall -y torch torchvision \ && python3 -m pip install --pre \ - torch==2.6.0.dev20241113+rocm6.2 \ + torch \ 'setuptools-scm>=8' \ - torchvision==0.20.0.dev20241113+rocm6.2 \ - --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \ + torchvision \ + --extra-index-url https://download.pytorch.org/whl/rocm6.2;; \ *) ;; esac ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer diff --git a/README.md b/README.md index 253a0bb913e37..658b9fb6edd8c 100644 --- a/README.md +++ b/README.md @@ -38,10 +38,12 @@ The first vLLM meetup in 2025 is happening on January 22nd, Wednesday, with Goog ## About vLLM is a fast and easy-to-use library for LLM inference and serving. +Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry. + vLLM is fast with: - State-of-the-art serving throughput -- Efficient management of attention key and value memory with **PagedAttention** +- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) - Continuous batching of incoming requests - Fast model execution with CUDA/HIP graph - Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8. @@ -72,16 +74,16 @@ Find the full list of supported models [here](https://docs.vllm.ai/en/latest/mod ## Getting Started -Install vLLM with `pip` or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source): +Install vLLM with `pip` or [from source](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/index.html#build-wheel-from-source): ```bash pip install vllm ``` -Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more. -- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html) -- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html) -- [List of Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html) +Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more. +- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html) +- [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html) +- [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html) ## Contributing diff --git a/SECURITY.md b/SECURITY.md index de0032d26c87b..47196a1f1221e 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -4,7 +4,7 @@ If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
-Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/contributing/vulnerability_management/). +Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). --- diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index b67849038cf0d..a9ab4fc9b621e 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -22,6 +22,7 @@ class RequestFuncInput: prompt_len: int output_len: int model: str + model_name: Optional[str] = None best_of: int = 1 logprobs: Optional[int] = None extra_body: Optional[dict] = None @@ -78,7 +79,7 @@ async def async_request_tgi( continue chunk_bytes = chunk_bytes.decode("utf-8") - #NOTE: Sometimes TGI returns a ping response without + # NOTE: Sometimes TGI returns a ping response without # any data, we should skip it. if chunk_bytes.startswith(":"): continue @@ -235,7 +236,8 @@ async def async_request_openai_completions( async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: payload = { - "model": request_func_input.model, + "model": request_func_input.model_name \ + if request_func_input.model_name else request_func_input.model, "prompt": request_func_input.prompt, "temperature": 0.0, "best_of": request_func_input.best_of, @@ -328,7 +330,8 @@ async def async_request_openai_chat_completions( if request_func_input.multi_modal_content: content.append(request_func_input.multi_modal_content) payload = { - "model": request_func_input.model, + "model": request_func_input.model_name \ + if request_func_input.model_name else request_func_input.model, "messages": [ { "role": "user", @@ -417,14 +420,35 @@ def get_model(pretrained_model_name_or_path: str) -> str: def get_tokenizer( - pretrained_model_name_or_path: str, trust_remote_code: bool + pretrained_model_name_or_path: str, + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, + **kwargs, ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: if pretrained_model_name_or_path is not None and not os.path.exists( pretrained_model_name_or_path): pretrained_model_name_or_path = get_model( pretrained_model_name_or_path) - return AutoTokenizer.from_pretrained(pretrained_model_name_or_path, - trust_remote_code=trust_remote_code) + if tokenizer_mode == "slow": + if kwargs.get("use_fast", False): + raise ValueError( + "Cannot use the fast tokenizer in slow tokenizer mode.") + kwargs["use_fast"] = False + if tokenizer_mode == "mistral": + try: + from vllm.transformers_utils.tokenizer import MistralTokenizer + except ImportError as e: + raise ImportError("MistralTokenizer requires vllm package.\n" + "Please install it with `pip install vllm` " + "to use mistral tokenizer mode.") from e + return MistralTokenizer.from_pretrained( + str(pretrained_model_name_or_path)) + else: + return AutoTokenizer.from_pretrained( + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + **kwargs, + ) ASYNC_REQUEST_FUNCS = { diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py index 13477ef535e86..0b8fba38156f1 100644 --- a/benchmarks/benchmark_long_document_qa_throughput.py +++ 
b/benchmarks/benchmark_long_document_qa_throughput.py @@ -2,8 +2,7 @@ Offline benchmark to test the long document QA throughput. Example usage: - # This command run the vllm with 50GB CPU memory for offloading - # The workload samples 8 different prompts with a default input + # This workload samples 8 different prompts with a default input # length of 20000 tokens, then replicates each prompt 2 times # in random order. python benchmark_long_document_qa_throughput.py \ diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 5e9381f712e10..3ab421a89c935 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -10,7 +10,8 @@ Fixed example usage: --model meta-llama/Llama-2-7b-chat-hf \ --enable-prefix-caching \ --num-prompts 1 \ - --repeat-count 100 + --repeat-count 100 \ + --input-length-range 128:256 ShareGPT example usage: # This command samples 20 prompts with input lengths diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 4eb0e1f8ac903..53186e10c5452 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -525,6 +525,7 @@ async def benchmark( api_url: str, base_url: str, model_id: str, + model_name: str, tokenizer: PreTrainedTokenizerBase, input_requests: List[Tuple[str, int, int]], logprobs: Optional[int], @@ -553,6 +554,7 @@ async def benchmark( "Multi-modal content is only supported on 'openai-chat' backend.") test_input = RequestFuncInput( model=model_id, + model_name=model_name, prompt=test_prompt, api_url=api_url, prompt_len=test_prompt_len, @@ -573,6 +575,7 @@ async def benchmark( if profile: print("Starting profiler...") profile_input = RequestFuncInput(model=model_id, + model_name=model_name, prompt=test_prompt, api_url=base_url + "/start_profile", prompt_len=test_prompt_len, @@ -616,6 +619,7 @@ async def benchmark( async for request in get_request(input_requests, request_rate, burstiness): prompt, prompt_len, output_len, mm_content = request request_func_input = RequestFuncInput(model=model_id, + model_name=model_name, prompt=prompt, api_url=api_url, prompt_len=prompt_len, @@ -780,6 +784,7 @@ def main(args: argparse.Namespace): backend = args.backend model_id = args.model + model_name = args.served_model_name tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model tokenizer_mode = args.tokenizer_mode @@ -877,6 +882,7 @@ def main(args: argparse.Namespace): api_url=api_url, base_url=base_url, model_id=model_id, + model_name=model_name, tokenizer=tokenizer, input_requests=input_requests, logprobs=args.logprobs, @@ -1222,5 +1228,12 @@ if __name__ == "__main__": 'always use the slow tokenizer. \n* ' '"mistral" will always use the `mistral_common` tokenizer.') + parser.add_argument("--served-model-name", + type=str, + default=None, + help="The model name used in the API. " + "If not specified, the model name will be the " + "same as the ``--model`` argument. 
") + args = parser.parse_args() main(args) diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py new file mode 100644 index 0000000000000..e1f613e1da509 --- /dev/null +++ b/benchmarks/kernels/benchmark_lora.py @@ -0,0 +1,1147 @@ +import argparse +import copy +import json +import pickle +import time +from dataclasses import dataclass +from enum import Enum, auto +from itertools import product +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Tuple + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from utils import ArgPool, Bench, CudaGraphBenchParams +from weight_shapes import WEIGHT_SHAPES + +from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand +from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice +from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink +from vllm.lora.ops.triton_ops.sgmv_expand import sgmv_expand +from vllm.lora.ops.triton_ops.sgmv_shrink import sgmv_shrink +from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) +DEFAULT_TP_SIZES = [1] +DEFAULT_BATCH_SIZES = [ + 1, 16, 32, 64, 128, 192, 256, 320, 384, 448, 512, 640, 768, 896, 1024, + 2048, 3072, 4096, 5120, 6144, 7168, 8192 +] +DEFAULT_HIDDEN_SIZES = [1024, 2048, 4096, 8192, 16384] +DEFAULT_LORA_RANKS = [16] +DEFAULT_NUM_LORAS = [1, 2, 3, 4] +DEFAULT_SORT_BY_LORA_IDS = [False, True] +DEFAULT_SEQ_LENGTHS = [1] +DEFAULT_EXPAND_FN_ADD_INPUTS = [True, False] + + +# Utilities +def dtype_to_str(dtype: torch.dtype): + if dtype == torch.float16: + return "f16" + if dtype == torch.bfloat16: + return "bf16" + if dtype == torch.float32: + return "f32" + raise ValueError(f"Unsupported dtype {dtype}") + + +def make_rand_lora_weight_tensor(k: int, + n: int, + num_loras: int, + dtype: torch.dtype, + device: str = "cuda") -> torch.Tensor: + + # LoRA weights column major + return torch.rand((num_loras, n, k), dtype=dtype).to(device) + + +def make_rand_tensors( + a_shape: Tuple[int], + b_shape: Tuple[int], + c_shape: Tuple[int], + a_dtype: torch.dtype, + b_dtype: torch.dtype, + c_dtype: torch.dtype, + num_slices: int, + device: str = "cuda", +) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]: + """ + Make LoRA input/output matrices. + """ + A = torch.rand(a_shape, dtype=a_dtype).to(device) + + # LoRA weights column major + Bs = [ + torch.rand(b_shape, dtype=b_dtype).to(device) + for _ in range(num_slices) + ] + + C = torch.zeros(c_shape, dtype=c_dtype).to(device) + return A, Bs, C + + +def make_prompt_lora_mapping(num_prompts: int, num_active_loras: int, + sort_by_lora_id: bool, + device: str) -> torch.Tensor: + """ + All prompts are mapped to a Lora ID in range [0, num_active_loras). + where 0 refers to first lora, 1 refers to second lora and so on. + """ + assert num_active_loras > 0 + + if not sort_by_lora_id: + return torch.randint(0, + num_active_loras, (num_prompts, ), + dtype=torch.long) + + # Divide LoRAs equally and in order. 
+ part_size = num_prompts // num_active_loras + part_size = max(part_size, 1) + + lora_id = 0 + prompt_lora_mapping = [] + while len(prompt_lora_mapping) < num_prompts: + prompt_lora_mapping.extend([lora_id] * part_size) + lora_id = lora_id + 1 if lora_id + 1 < num_active_loras else lora_id + return torch.tensor(prompt_lora_mapping[:num_prompts], + dtype=torch.long, + device=device) + + +def make_token_lora_mapping(num_tokens: int, num_prompts: int, + prompt_lora_mapping: torch.Tensor, + seq_len_tensor: torch.Tensor, device: str): + """ + Make token_lora_mapping from prompt_lora_mapping and seq_lens_tensor + """ + assert prompt_lora_mapping.shape[0] == num_prompts + + # token to lora index mapping + token_lora_mapping = [0] * num_tokens + current_offset = 0 + for b_id in range(num_prompts): + lora_index = prompt_lora_mapping[b_id].item() + s = current_offset + e = s + seq_len_tensor[b_id].item() + token_lora_mapping[s:e] = [lora_index] * (e - s) + current_offset += seq_len_tensor[b_id].item() + + return torch.tensor(token_lora_mapping, dtype=torch.long, device=device) + + +def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor, + lora_weights: List[torch.Tensor], + seq_lens_cpu: torch.Tensor, + prompt_lora_mapping_cpu: torch.Tensor, scaling: float, + add_inputs: Optional[bool]): + """ + Torch group gemm reference implementation to test correctness of + benchmarking operations. + """ + batches = seq_lens_cpu.size(0) + out_list = [] + current_offset = 0 + for lora_index, b_length in zip(range(batches), seq_lens_cpu): + x = input[current_offset:b_length + current_offset, :] + current_offset += b_length + w = lora_weights[prompt_lora_mapping_cpu[lora_index]] + result = torch.nn.functional.linear(x, w) + result *= scaling + out_list.append(result) + + cat_result = torch.cat(out_list, dim=0) + + if add_inputs: + ref_out += cat_result + else: + ref_out.copy_(cat_result) + + +class OpType(Enum): + """ + LoRA Ops to benchmark and their properties.
+ """ + SGMV_SHRINK = auto() + BGMV_SHRINK = auto() + SGMV_EXPAND = auto() + BGMV_EXPAND = auto() + BGMV_EXPAND_SLICE = auto() + + @staticmethod + def from_str(s: str) -> "OpType": + if s.lower() == 'sgmv_shrink': + return OpType.SGMV_SHRINK + if s.lower() == 'sgmv_expand': + return OpType.SGMV_EXPAND + if s.lower() == 'bgmv_shrink': + return OpType.BGMV_SHRINK + if s.lower() == 'bgmv_expand': + return OpType.BGMV_EXPAND + if s.lower() == "bgmv_expand_slice": + return OpType.BGMV_EXPAND_SLICE + raise ValueError(f"Unrecognized str {s} to convert to OpType") + + def is_shrink_fn(self) -> bool: + return self in [OpType.SGMV_SHRINK, OpType.BGMV_SHRINK] + + def is_expand_fn(self) -> bool: + return self in [OpType.SGMV_EXPAND, OpType.BGMV_EXPAND] + + def is_prefill_op(self) -> bool: + return self in [OpType.SGMV_SHRINK, OpType.SGMV_EXPAND] + + def is_decode_op(self) -> bool: + return self in [ + OpType.BGMV_SHRINK, OpType.BGMV_EXPAND, OpType.BGMV_EXPAND_SLICE + ] + + def is_expand_slice_fn(self) -> bool: + return self in [OpType.BGMV_EXPAND_SLICE] + + def num_slices(self) -> List[int]: + if self in [OpType.SGMV_EXPAND, OpType.SGMV_SHRINK]: + # SGMV kernels supports slices + return [1, 2, 3] + if self in [OpType.BGMV_SHRINK, OpType.BGMV_EXPAND]: + return [1] + if self in [OpType.BGMV_EXPAND_SLICE]: + return [2, 3] + raise ValueError(f"Unrecognized OpType {self}") + + def mkn(self, batch_size: int, seq_length: int, hidden_size: int, + lora_rank: int) -> Tuple[int, int, int]: + num_tokens = batch_size * seq_length + if self.is_shrink_fn(): + m = num_tokens + k = hidden_size + n = lora_rank + else: + assert self.is_expand_fn() or self.is_expand_slice_fn() + m = num_tokens + k = lora_rank + n = hidden_size + return m, k, n + + def matmul_dtypes( + self, op_dtype: torch.dtype + ) -> Tuple[torch.dtype, torch.dtype, torch.dtype]: + """ + return a type, b type and c type for A x B = C + """ + if self.is_shrink_fn(): + return op_dtype, op_dtype, torch.float32 + else: + assert self.is_expand_fn() or self.is_expand_slice_fn() + return torch.float32, op_dtype, op_dtype + + def matmul_shapes( + self, batch_size: int, seq_length: int, hidden_size: int, + lora_rank: int, num_loras: int, + num_slices: int) -> Tuple[Tuple[int], Tuple[int], Tuple[int]]: + """ + Given num_slices, return the shapes of the A, B, and C matrices + in A x B = C, for the op_type + """ + m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank) + + b_shape = (num_loras, n, k) # col-major + if self == OpType.SGMV_SHRINK: + # SGMV shrink supports num_slices inherently in the kernel + return ((m, k), b_shape, (num_slices, m, n)) + if self == OpType.SGMV_EXPAND: + # SGMV expand supports num_slices inherently in the kernel + return ((num_slices, m, k), b_shape, (m, n * num_slices)) + if self == OpType.BGMV_SHRINK: + return ((m, k), b_shape, (m, n)) + if self == OpType.BGMV_EXPAND: + return ((m, k), b_shape, (m, n)) + if self == OpType.BGMV_EXPAND_SLICE: + return ((num_slices, m, k), b_shape, (m, n * num_slices)) + + raise ValueError(f"Unrecognized op_type {self}") + + def bench_fn(self) -> Callable: + + def emulate_bgmv_expand_slice(kwargs_list: List[Dict[str, Any]]): + for x in kwargs_list: + bgmv_expand_slice(**x) + + if self == OpType.SGMV_SHRINK: + return sgmv_shrink + if self == OpType.SGMV_EXPAND: + return sgmv_expand + if self == OpType.BGMV_SHRINK: + return bgmv_shrink + if self == OpType.BGMV_EXPAND: + return bgmv_expand + if self == OpType.BGMV_EXPAND_SLICE: + return emulate_bgmv_expand_slice + raise 
ValueError(f"Unrecognized optype {self}") + + def run_ref_group_gemm(self, output: torch.Tensor, input: torch.Tensor, + lora_weights: List[torch.Tensor], + **kwargs) -> Callable: + """Each benchmark operation expected the input, lora_weights and outputs + in a slightly different format. Refer to self.matmul_shapes(). + run_ref_group_gemm accounts for those differences in executing a + reference group gemm for correctness testing. + """ + w_dtype = lora_weights[0].dtype + num_slices = len(lora_weights) + if self == OpType.SGMV_SHRINK: + for slice_idx in range(num_slices): + ref_group_gemm(ref_out=output[slice_idx, :], + input=input, + lora_weights=lora_weights[slice_idx], + **kwargs) + if self == OpType.SGMV_EXPAND: + hidden_size = lora_weights[0].shape[1] + for slice_idx in range(num_slices): + slice_offset = slice_idx * hidden_size + ref_group_gemm( + ref_out=output[:, slice_offset:slice_offset + hidden_size], + input=input[slice_idx].clone().to(dtype=w_dtype), + lora_weights=lora_weights[slice_idx], + **kwargs) + if self == OpType.BGMV_SHRINK: + assert num_slices == 1 + ref_group_gemm(ref_out=output, + input=input, + lora_weights=lora_weights[0], + **kwargs) + if self == OpType.BGMV_EXPAND: + assert num_slices == 1 + ref_group_gemm(ref_out=output, + input=input.clone().to(dtype=w_dtype), + lora_weights=lora_weights[0], + **kwargs) + if self == OpType.BGMV_EXPAND_SLICE: + hidden_size = lora_weights[0].shape[1] + for slice_idx in range(num_slices): + slice_offset = slice_idx * hidden_size + ref_group_gemm( + ref_out=output[:, slice_offset:slice_offset + hidden_size], + input=input[slice_idx].clone().to(dtype=w_dtype), + lora_weights=lora_weights[slice_idx], + **kwargs) + raise ValueError(f"Unrecognized optype {self}") + + +@dataclass +class BenchmarkContext: + """ + LoRA benchmark context + """ + batch_size: int + hidden_size: int + num_loras: int + num_active_loras: int + lora_rank: int + sort_by_lora_id: bool + dtype: torch.dtype + seq_length: Optional[int] = None + num_slices: Optional[int] = None # num_slices for slice based ops + + def with_seq_length(self, seq_length: int) -> "BenchmarkContext": + ctx = copy.copy(self) + ctx.seq_length = seq_length + return ctx + + def with_num_slices(self, num_slices: int) -> "BenchmarkContext": + ctx = copy.copy(self) + ctx.num_slices = num_slices + return ctx + + def bench_label(self) -> str: + return f"lora-{self.dtype}" + + def bench_sublabel(self, op_type: OpType) -> str: + m, k, n = op_type.mkn(self.batch_size, self.seq_length, + self.hidden_size, self.lora_rank) + desc = { + 'bs': self.batch_size, + 'sl': self.seq_length, + 'm': m, + 'k': k, + 'n': n, + 'num_loras': self.num_loras, + 'sort_by_lora': self.sort_by_lora_id, + 'num_slices': self.num_slices, + } + return json.dumps(desc) + + +@dataclass +class BenchmarkTensors: + """ + Input/Output tensors used for benchmarks + """ + # matmul tensors + input: torch.Tensor + lora_weights_lst: List[torch.Tensor] + output: torch.Tensor + # metadata tensors + seq_lens: torch.Tensor + seq_start_loc: torch.Tensor + prompt_lora_mapping: torch.Tensor + token_lora_mapping: torch.Tensor + + def io_types(self) -> str: + return (f"{dtype_to_str(self.input.dtype)}x" + f"{dtype_to_str(self.lora_weights_lst[0].dtype)}=>" + f"{dtype_to_str(self.output.dtype)}") + + @staticmethod + def make(ctx: BenchmarkContext, + op_type: OpType, + device: str = "cuda") -> "BenchmarkTensors": + + # Make input / output matmul tensors. 
+ a_shape, b_shape, c_shape = op_type.matmul_shapes( + ctx.batch_size, ctx.seq_length, ctx.hidden_size, ctx.lora_rank, + ctx.num_loras, ctx.num_slices) + a_type, b_type, c_type = op_type.matmul_dtypes(ctx.dtype) + input_tensor, lora_weights, output_tensor = \ + make_rand_tensors(a_shape, b_shape, c_shape, a_type, b_type, c_type, + num_slices = ctx.num_slices) + + # Make metadata tensors. + # Keep the metadata tensors on the CPU for further processing if needed. + # The tensors get moved to the GPU before benchmarking. + assert ctx.num_active_loras <= ctx.num_loras + total_tokens = ctx.batch_size * ctx.seq_length + + # Prepare seq lens tensor + seq_len_tensor = torch.randint(ctx.seq_length, ctx.seq_length + 1, + (ctx.batch_size, )) + # Prepare seq_start_loc tensor + seq_start_loc_tensor = torch.cumsum(torch.tensor( + [0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), + dim=0) + assert total_tokens == seq_len_tensor.sum() + # Prepare prompt lora indices tensor + prompt_lora_indices_tensor = make_prompt_lora_mapping( + ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu") + # Prepare token lora indices tensor + token_lora_indices_tensor = make_token_lora_mapping( + total_tokens, ctx.batch_size, prompt_lora_indices_tensor, + seq_len_tensor, "cpu") + + return BenchmarkTensors(input_tensor, lora_weights, output_tensor, + seq_len_tensor, seq_start_loc_tensor, + prompt_lora_indices_tensor, + token_lora_indices_tensor) + + def sanity_check(self) -> None: + """ + Fails asserts when non-conformality is detected. + """ + num_tokens = self.input.shape[-2] + # check metadata tensors + assert torch.sum(self.seq_lens) == num_tokens + num_seqs = self.seq_lens.shape[0] + assert self.seq_start_loc.shape[0] == num_seqs + assert self.prompt_lora_mapping.shape[0] == num_seqs + assert self.token_lora_mapping.shape[0] == num_tokens + + def to_device(self, device: str): + """ + Transfer tensors to device if the tensors aren't already on the device + """ + + def to_device(tensor: torch.Tensor): + if tensor.device != device: + tensor = tensor.to(device=device) + return tensor + + self.input = to_device(self.input) + self.output = to_device(self.output) + self.seq_lens = to_device(self.seq_lens) + self.seq_start_loc = to_device(self.seq_start_loc) + self.prompt_lora_mapping = to_device(self.prompt_lora_mapping) + self.token_lora_mapping = to_device(self.token_lora_mapping) + for i in range(len(self.lora_weights_lst)): + self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i]) + + def metadata(self) -> Tuple[int, int, int, int]: + """ + Return num_seqs, num_tokens, max_seq_len and num_slices + """ + num_seqs = self.seq_lens.shape[0] + num_tokens = self.token_lora_mapping.shape[0] + max_seq_len = torch.max(self.seq_lens).item() + num_slices = len(self.lora_weights_lst) + return num_seqs, num_tokens, max_seq_len, num_slices + + def convert_to_sgmv_benchmark_tensors(self): + """ + For sgmv punica kernels, when consecutive sequences have the + same LoRA ID, we just merge them together.
+ This happens in punica.py::compute_metadata + """ + + # Collapse seq_lens and seq_start_loc + _, seq_lens = torch.unique_consecutive(self.token_lora_mapping, + return_counts=True) + cum_result = torch.cumsum(seq_lens, dim=0) + seq_start_loc = torch.zeros_like(seq_lens) + seq_start_loc[1:].copy_(cum_result[:-1]) + + # Collapse prompt mapping + prompt_lora_mapping = torch.unique_consecutive( + self.prompt_lora_mapping) + + assert torch.sum(seq_lens) == torch.sum(self.seq_lens), \ + f"don't match - new {torch.sum(seq_lens)} vs {torch.sum(self.seq_lens)}" + + self.prompt_lora_mapping = prompt_lora_mapping.to( + dtype=self.prompt_lora_mapping.dtype) + self.seq_lens = seq_lens.to(dtype=self.seq_lens.dtype) + self.seq_start_loc = seq_start_loc.to(dtype=self.seq_start_loc.dtype) + + def as_sgmv_shrink_kwargs(self) -> Dict[str, Any]: + self.convert_to_sgmv_benchmark_tensors() + self.sanity_check() + self.to_device(self.input.device) + + num_seqs, num_tokens, max_seq_len, num_slices = self.metadata() + + # Sanity check matrix shapes. + i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ + 0].shape, self.output.shape + # Expected input shape [num_tokens, hidden_size] + assert len(i_shape) == 2 + assert i_shape[0] == num_tokens + hidden_size = i_shape[1] + # Expected lora weight shape [num_loras, lora_rank, hidden_size] + assert len(lw_shape) == 3 + assert lw_shape[2] == hidden_size + lora_rank = lw_shape[1] + # Expected output shape [num_slices, num_tokens, lora_rank] + assert len(o_shape) == 3 + assert o_shape == (num_slices, num_tokens, lora_rank) + + return { + 'inputs': self.input, + 'lora_a_weights': self.lora_weights_lst, + 'output_tensor': self.output, + 'b_seq_start_loc': self.seq_start_loc, + 'seq_len_tensor': self.seq_lens, + 'lora_indices_tensor': self.prompt_lora_mapping, + 'batches': num_seqs, + 'max_seq_length': max_seq_len, + 'token_nums': num_tokens, + 'scaling': 1.0, + } + + def as_sgmv_expand_kwargs(self, add_inputs: bool) -> Dict[str, Any]: + + self.convert_to_sgmv_benchmark_tensors() + self.sanity_check() + self.to_device(self.input.device) + + num_seqs, num_tokens, max_seq_len, num_slices = self.metadata() + + # Sanity check matrix shapes.
+ i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ + 0].shape, self.output.shape + # Expected input shape : [num_slices, num_tokens, lora_rank] + assert len(i_shape) == 3 + assert i_shape[0] == num_slices + assert i_shape[1] == num_tokens + lora_rank = i_shape[2] + # Expected lora weight shape : [num_lora, hidden_size, lora_rank] + assert len(lw_shape) == 3 + assert lw_shape[2] == lora_rank + hidden_size = lw_shape[1] + # Expected output shape : [num_tokens, hidden_size * num_slices] + assert len(o_shape) == 2 + assert o_shape == (num_tokens, hidden_size * num_slices) + + return { + 'inputs': self.input, + 'lora_b_weights': self.lora_weights_lst, + 'output_tensor': self.output, + 'b_seq_start_loc': self.seq_start_loc, + 'seq_len_tensor': self.seq_lens, + 'lora_indices_tensor': self.prompt_lora_mapping, + 'batches': num_seqs, + 'max_seq_length': max_seq_len, + 'token_nums': num_tokens, + 'offset_start': 0, + 'add_inputs': add_inputs, + } + + def as_bgmv_shrink_kwargs(self) -> Dict[str, Any]: + assert len(self.lora_weights_lst) == 1 + self.to_device(self.input.device) + + _, num_tokens, _, _ = self.metadata() + # Sanity check shapes + i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ + 0].shape, self.output.shape + # Expected input shape [num_tokens, hidden_size] + assert len(i_shape) == 2 + assert i_shape[0] == num_tokens + hidden_size = i_shape[1] + # Expected lora weight shape [num_loras, lora_rank, hidden_size] + assert len(lw_shape) == 3 + assert lw_shape[2] == hidden_size + lora_rank = lw_shape[1] + # Expected output shape [num_tokens, lora_rank] + assert len(o_shape) == 2 + assert o_shape == (num_tokens, lora_rank) + + return { + 'inputs': self.input, + 'lora_a_weights': self.lora_weights_lst[0], + 'output_tensor': self.output, + 'lora_indices_tensor': self.token_lora_mapping, + 'scaling': 1.0 + } + + def as_bgmv_expand_kwargs(self, add_inputs: bool): + assert len(self.lora_weights_lst) == 1 + self.to_device(self.input.device) + + _, num_tokens, _, _ = self.metadata() + # Sanity check shapes + i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ + 0].shape, self.output.shape + # Expected input shape [num_tokens, lora_rank] + assert len(i_shape) == 2 + assert i_shape[0] == num_tokens + lora_rank = i_shape[1] + # Expected lora weight shape [num_loras, hidden_size, lora_rank] + assert len(lw_shape) == 3 + assert lw_shape[2] == lora_rank + hidden_size = lw_shape[1] + # Expected output shape [num_tokens, hidden_size] + assert len(o_shape) == 2 + assert o_shape == (num_tokens, hidden_size) + + return { + 'inputs': self.input, + 'lora_b_weights': self.lora_weights_lst[0], + 'output_tensor': self.output, + 'lora_indices_tensor': self.token_lora_mapping, + 'add_inputs': add_inputs + } + + def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> Dict[str, Any]: + + _, num_tokens, _, num_slices = self.metadata() + # Sanity check shapes + i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ + 0].shape, self.output.shape + # Expected input shape [num_slices, num_tokens, lora_rank] + assert len(i_shape) == 3 + assert i_shape[0] == num_slices + assert i_shape[1] == num_tokens + lora_rank = i_shape[2] + # Expected lora weight shape [num_loras, hidden_size, lora_rank] + assert len(lw_shape) == 3 + assert lw_shape[2] == lora_rank + hidden_size = lw_shape[1] + # Expected output shape [num_tokens, hidden_size * num_slices] + assert len(o_shape) == 2 + assert o_shape == (num_tokens, hidden_size * num_slices) + + 
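For orientation, each expand-slice call constructed below writes one slice's result into a contiguous column block of the shared output tensor; a small illustrative sketch with hypothetical sizes (not part of the patch):

import torch

num_slices, num_tokens, hidden_size = 2, 4, 3
out = torch.zeros(num_tokens, num_slices * hidden_size)
for i in range(num_slices):
    # stand-in for one slice's expand result
    slice_result = torch.full((num_tokens, hidden_size), float(i + 1))
    # mirrors slice_offset=i*hidden_size, slice_size=hidden_size below
    out[:, i * hidden_size:(i + 1) * hidden_size] = slice_result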
self.to_device(self.input.device)
+
+        kwargs_list = []
+        for i in range(num_slices):
+            kwargs_list.append({
+                'inputs': self.input[i],
+                'lora_b_weights': self.lora_weights_lst[i],
+                'output_tensor': self.output,
+                'lora_indices_tensor': self.token_lora_mapping,
+                'slice_offset': i * hidden_size,
+                'slice_size': hidden_size,
+                'add_inputs': add_inputs,
+            })
+        return {'kwargs_list': kwargs_list}
+
+    def bench_fn_kwargs(self,
+                        op_type: OpType,
+                        add_inputs: Optional[bool] = None) -> Dict[str, Any]:
+        if op_type.is_shrink_fn():
+            assert add_inputs is None
+        else:
+            assert add_inputs is not None
+
+        if op_type == OpType.SGMV_SHRINK:
+            return self.as_sgmv_shrink_kwargs()
+        if op_type == OpType.SGMV_EXPAND:
+            return self.as_sgmv_expand_kwargs(add_inputs)
+        if op_type == OpType.BGMV_SHRINK:
+            return self.as_bgmv_shrink_kwargs()
+        if op_type == OpType.BGMV_EXPAND:
+            return self.as_bgmv_expand_kwargs(add_inputs)
+        if op_type == OpType.BGMV_EXPAND_SLICE:
+            return self.as_bgmv_expand_slice_kwargs(add_inputs)
+        raise ValueError(f"Unrecognized optype {op_type}")
+
+    def test_correctness(self, op_type: OpType,
+                         expand_fn_add_inputs: Optional[bool]) -> bool:
+        """
+        Test correctness of op_type implementation against a grouped gemm
+        reference implementation.
+        """
+        seq_lens_cpu = self.seq_lens.to(device="cpu")
+        prompt_lora_mapping_cpu = self.prompt_lora_mapping.to(device="cpu")
+        ref_output = self.output.clone()
+
+        self.output.zero_()
+        op_type.bench_fn()(
+            **self.bench_fn_kwargs(op_type, expand_fn_add_inputs))
+
+        op_type.run_ref_group_gemm(
+            ref_output,
+            self.input,
+            self.lora_weights_lst,
+            seq_lens_cpu=seq_lens_cpu,
+            prompt_lora_mapping_cpu=prompt_lora_mapping_cpu,
+            scaling=1.0,
+            add_inputs=expand_fn_add_inputs)
+
+        rtol, atol = {
+            torch.float16: (6e-2, 6e-2),
+            torch.bfloat16: (6e-2, 6e-2),
+            torch.float32: (1e-2, 1e-2),
+        }[self.output.dtype]
+
+        return torch.allclose(ref_output, self.output, rtol=rtol, atol=atol)
+
+
+def bench_optype(ctx: BenchmarkContext,
+                 arg_pool_size: int,
+                 op_type: OpType,
+                 cuda_graph_nops: Optional[int] = None,
+                 expand_fn_add_inputs: Optional[bool] = None,
+                 test_correctness: bool = False) -> TMeasurement:
+
+    assert arg_pool_size >= 1
+    if op_type.is_shrink_fn():
+        assert expand_fn_add_inputs is None
+    else:
+        assert expand_fn_add_inputs is not None
+
+    # BenchmarkContext -> BenchmarkTensors
+    bench_tensors: List[BenchmarkTensors] = \
+        [BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size)]
+    for bt in bench_tensors:
+        bt.sanity_check()
+
+    # Test correctness of our implementation.
+    if test_correctness:
+        assert all([
+            bt.test_correctness(op_type, expand_fn_add_inputs)
+            for bt in bench_tensors
+        ])
+
+    # BenchmarkTensors -> Dict (kwargs)
+    kwargs_list = [
+        bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs)
+        for bt in bench_tensors
+    ]
+
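The merge step a few lines below turns the list of per-run kwargs dicts into a single kwargs dict of ArgPool values, which is effectively a list-of-dicts to dict-of-lists transpose. A tiny self-contained sketch with toy values (not part of the patch):

kwargs_list = [{'x': 1, 'y': 10}, {'x': 2, 'y': 20}]
pooled = {k: [d[k] for d in kwargs_list] for k in kwargs_list[0]}
assert pooled == {'x': [1, 2], 'y': [10, 20]}
# bench_optype wraps each such list in ArgPool(...), and Bench then picks
# a different element on every invocation.

+    # Clear LoRA optimization hash-maps.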
+    _LORA_A_PTR_DICT.clear()
+    _LORA_B_PTR_DICT.clear()
+    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
+    for kwargs in kwargs_list:
+        op_type.bench_fn()(**kwargs)
+    torch.cuda.synchronize()
+
+    # Merge into a single kwargs and qualify arguments as ArgPool
+    kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
+    for _kwargs in kwargs_list:
+        for k, v in _kwargs.items():
+            kwargs[k].values.append(v)
+
+    describe_args = (f"add_inputs={expand_fn_add_inputs}"
+                     if expand_fn_add_inputs is not None else "")
+    description = (
+        f"{op_type.name}({describe_args}) ({bench_tensors[0].io_types()})")
+
+    cuda_graph_params = None
+    if cuda_graph_nops:
+        cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops)
+    timer = None
+    with Bench(cuda_graph_params,
+               ctx.bench_label(), ctx.bench_sublabel(op_type), description,
+               op_type.bench_fn(), **kwargs) as bench:
+        timer = bench.run()
+    return timer
+
+
+def bench_torch_mm(ctx: BenchmarkContext,
+                   arg_pool_size: int,
+                   op_type: OpType,
+                   cuda_graph_nops: Optional[int] = None) -> TMeasurement:
+    """
+    Benchmark basic torch.mm as a roofline.
+
+    When all the input tokens have the same LoRA ID, the LoRA kernels are just
+    a matmul. This torch.mm benchmark serves as a roofline for that case.
+
+    The input op_type is used to determine the m, k, n dimensions for the
+    matmul.
+    """
+
+    batch_size, hidden_size, lora_rank, seq_length, dtype = (ctx.batch_size,
+                                                             ctx.hidden_size,
+                                                             ctx.lora_rank,
+                                                             ctx.seq_length,
+                                                             ctx.dtype)
+
+    m, k, n = op_type.mkn(batch_size, seq_length, hidden_size, lora_rank)
+    # For a fairer comparison.
+    n = n * ctx.num_slices
+
+    # Get matmul input and output tensors for A x B = C
+    As, Bs, Cs = [], [], []
+    for _ in range(arg_pool_size):
+        As.append(torch.rand((m, k), dtype=dtype).to("cuda"))
+        Bs.append(torch.rand((n, k), dtype=dtype).to("cuda").t())
+        Cs.append(torch.rand((m, n), dtype=dtype).to("cuda"))
+
+    # Make torch.mm kwargs
+    mm_kwargs = {'input': ArgPool(As), 'mat2': ArgPool(Bs), 'out': ArgPool(Cs)}
+
+    description = (
+        f"single-lora roofline using torch.mm ({dtype_to_str(dtype)}"
+        f"x{dtype_to_str(dtype)}"
+        f"=>{dtype_to_str(dtype)})")
+    cuda_graph_params = None
+    if cuda_graph_nops:
+        cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops)
+    with Bench(cuda_graph_params, ctx.bench_label(),
+               ctx.bench_sublabel(op_type), description, torch.mm,
+               **mm_kwargs) as bench:
+        return bench.run()
+
+
+# runner
+def use_cuda_graph_recommendation() -> str:
+    return """
+            Triton kernels have a significant launch overhead when
+            launched directly via Python. This overhead is more noticeable
+            for small problem sizes. For these cases, it is recommended
+            to use the script with `--cuda-graph-nops N` to benchmark N
+            consecutive invocations of the benchmarking operations from
+            inside a CUDA Graph. Note that the returned measurement is for N
+            invocations of the operation.
+            """
+
+
+def print_timers(timers: List[TMeasurement],
+                 args: Optional[argparse.Namespace] = None):
+    compare = TBenchmark.Compare(timers)
+    compare.print()
+
+    if args and args.cuda_graph_nops:
+        print(
+            f"Note: The timings reported above are for {args.cuda_graph_nops} "
+            "consecutive invocations of the benchmarking functions. "
+            f"Please divide by {args.cuda_graph_nops} for single-invocation "
+            "timings.")
+
+    print("Note on comparison with torch.mm: The torch.mm numbers are "
+          "benchmark numbers of a simple matmul emulating the single-LoRA "
+          "case. It is provided as a roofline for comparing our LoRA kernel "
+          "implementations. 
It is expected that the LoRA kernels will be " + "slower than torch.mm in cases where num_loras is big. But for " + "small num_loras the goal should be to match the torch.mm numbers.") + + +def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]): + + if args.cuda_graph_nops is not None: + assert args.cuda_graph_nops > 0 + print(f"Benchmarking {args.cuda_graph_nops} invocations inside a CUDA " + "Graph") + else: + print(f"CUDA Graphs not enabled.\n{use_cuda_graph_recommendation()}") + + timers = [] + for bench_ctx in bench_ctxs: + for seq_len in args.seq_lengths: + bench_ops: List[OpType] = [] + if seq_len == 1: + # bench all decode ops + bench_ops = [op for op in args.op_types if op.is_decode_op()] + else: + # bench all prefill ops + bench_ops = [op for op in args.op_types if op.is_prefill_op()] + + seq_len_timers = [] + for bench_op in bench_ops: + for num_slices in bench_op.num_slices(): + _ctx = bench_ctx.with_seq_length(seq_len).with_num_slices( + num_slices) + # Benchmark torch.mm as a roofline + seq_len_timers.append( + bench_torch_mm(_ctx, args.arg_pool_size, bench_op, + args.cuda_graph_nops)) + + # Benchmark bench_op + expand_fn_add_inputs = [ + None + ] if bench_op.is_shrink_fn() else args.expand_fn_add_inputs + for add_input_arg in expand_fn_add_inputs: + seq_len_timers.append( + bench_optype(_ctx, args.arg_pool_size, bench_op, + args.cuda_graph_nops, add_input_arg, + args.test_correctness)) + + print_timers(seq_len_timers) + timers.extend(seq_len_timers) + + # Result stdout dump + print("== All Results ====") + print_timers(timers, args) + + if args.output_directory: + # Result file dump + od = Path(args.output_directory) + if not od.exists(): + od.mkdir() + + timestamp = int(time.time()) + pkl_file = od / f"lora_bench-{timestamp}.pkl" + print(f"Writing benchmarks to {pkl_file}") + with open(pkl_file, "wb") as f: + pickle.dump(timers, f) + + +def as_benchmark_contexts(hidden_sizes: List[int], lora_ranks: List[int], + args: argparse.Namespace) -> List[BenchmarkContext]: + + ctxs: List[BenchmarkContext] = [] + for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product( # noqa + args.batch_sizes, list(hidden_sizes), lora_ranks, args.num_loras, + args.sort_by_lora_id): + ctxs.append( + BenchmarkContext( + batch_size=batch_size, + hidden_size=hidden_size, + lora_rank=lora_rank, + num_loras=num_loras, + num_active_loras=args.num_active_loras + if args.num_active_loras else num_loras, + # To be filled based on the OpType to benchmark + seq_length=None, + sort_by_lora_id=sort_by_lora_id, + dtype=args.dtype, + # To be filled based on the OpType to benchmark + num_slices=None)) + + return ctxs + + +def run_list_bench(args: argparse.Namespace): + print(args) + + print("List bench :\n" + f" Hidden Sizes {args.hidden_sizes}" + f" LoRA Ranks {args.lora_ranks}") + + # Get all benchmarking contexts + bench_contexts: List[BenchmarkContext] = as_benchmark_contexts( + hidden_sizes=args.hidden_sizes, lora_ranks=args.lora_ranks, args=args) + + run(args, bench_contexts) + + +def run_range_bench(args: argparse.Namespace): + print(args) + + hidden_sizes = list( + range(args.hidden_sizes_start, args.hidden_sizes_end + 1, + args.hidden_sizes_increment)) + lora_ranks = list( + range(args.lora_ranks_start, args.lora_ranks_end + 1, + args.lora_ranks_increment)) + + print("Range bench :\n" + f" Hidden Sizes {hidden_sizes}" + f" LoRA Ranks {lora_ranks}") + + # Get all benchmarking contexts + bench_contexts: List[BenchmarkContext] = as_benchmark_contexts( + 
hidden_sizes=hidden_sizes, lora_ranks=lora_ranks, args=args)
+
+    run(args, bench_contexts)
+
+
+def run_model_bench(args: argparse.Namespace):
+    print(args)
+
+    def hidden_sizes_from_model(model: str, tp_size: int) -> set[int]:
+        hidden_sizes = set()
+        for KN, tp_split_dim in WEIGHT_SHAPES[model]:
+            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
+            hidden_sizes.add(KN[1])
+        return hidden_sizes
+
+    # Get all hidden sizes
+    hidden_sizes: set[int] = set()
+    for model_name, tp_size in product(args.models, args.tp_sizes):
+        hidden_sizes = hidden_sizes.union(
+            hidden_sizes_from_model(model_name, tp_size))
+
+    print("Model bench :\n"
+          f" Hidden Sizes {hidden_sizes}"
+          f" LoRA Ranks {args.lora_ranks}")
+
+    # Get all benchmarking contexts
+    bench_contexts: List[BenchmarkContext] = as_benchmark_contexts(
+        hidden_sizes=hidden_sizes, lora_ranks=args.lora_ranks, args=args)
+
+    run(args, bench_contexts)
+
+
+if __name__ == '__main__':
+
+    def to_torch_dtype(dt):
+        if dt == "torch.float16":
+            return torch.float16
+        if dt == "torch.bfloat16":
+            return torch.bfloat16
+        raise ValueError("unsupported dtype")
+
+    def get_bool(s: str) -> bool:
+        return s.lower() in ['true', '1']
+
+    def add_common_command_args(p: argparse.ArgumentParser):
+        p.add_argument(
+            "--dtype",
+            type=to_torch_dtype,
+            required=True,
+            help="Available options are ['torch.float16', 'torch.bfloat16']")
+
+        p.add_argument(
+            "--arg-pool-size",
+            type=int,
+            default=32,
+            help="Run profiles with a pool of input/output/meta tensors instead "
+            "of simply reusing the same tensors for all runs. A bigger arg-pool "
+            "mitigates hardware caching effects during benchmarking.")
+
+        p.add_argument(
+            "--cuda-graph-nops",
+            type=int,
+            help=("When set, profiling is done using a CUDA graph, "
+                  "with the given number of operations in the graph. "
+                  "Note that the measurement returned is the time "
+                  "taken for N consecutive executions of the benchmarking "
+                  "functions, where N is the value of this argument."))
+        p.add_argument("--num-loras",
+                       nargs="+",
+                       type=int,
+                       default=DEFAULT_NUM_LORAS)
+        p.add_argument("--num-active-loras",
+                       type=int,
+                       default=None,
+                       help="Active LoRAs. 
When None, all LoRAs are active")
+        p.add_argument("--sort-by-lora-id",
+                       nargs="+",
+                       type=get_bool,
+                       default=DEFAULT_SORT_BY_LORA_IDS)
+        p.add_argument("--op-types",
+                       nargs="+",
+                       type=OpType.from_str,
+                       default=list(OpType))
+        p.add_argument('--seq-lengths',
+                       nargs="+",
+                       type=int,
+                       default=DEFAULT_SEQ_LENGTHS)
+        p.add_argument("--batch-sizes",
+                       nargs="+",
+                       type=int,
+                       default=DEFAULT_BATCH_SIZES)
+        p.add_argument("--expand-fn-add-inputs",
+                       nargs="+",
+                       type=get_bool,
+                       default=DEFAULT_EXPAND_FN_ADD_INPUTS)
+        p.add_argument(
+            '-o',
+            '--output-directory',
+            type=str,
+            help=("Output directory to store the list of benchmarking "
+                  "TMeasurement objects as a pickle file"))
+
+        p.add_argument(
+            "--test-correctness",
+            action='store_true',
+            help=("When enabled, the benchmarking functions are tested "
+                  "for correctness before the actual benchmarking"))
+
+    parser = FlexibleArgumentParser(
+        description=f"""
+Benchmark LoRA kernels:
+    {use_cuda_graph_recommendation()}
+
+    list_bench example:
+        python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
+
+    model_bench example:
+        python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
+
+    range_bench example:
+        python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8
+            """,  # noqa: E501
+        formatter_class=argparse.RawTextHelpFormatter)
+
+    subparsers = parser.add_subparsers(dest="cmd", required=True)
+
+    list_parser = subparsers.add_parser("list_bench")
+    list_parser.add_argument("--hidden-sizes",
+                             nargs="+",
+                             type=int,
+                             default=DEFAULT_HIDDEN_SIZES)
+    list_parser.add_argument("--lora-ranks",
+                             nargs="+",
+                             type=int,
+                             default=DEFAULT_LORA_RANKS)
+    add_common_command_args(list_parser)
+    list_parser.set_defaults(func=run_list_bench)
+
+    range_parser = subparsers.add_parser("range_bench")
+    range_parser.add_argument("--hidden-sizes-start", type=int, required=True)
+    range_parser.add_argument("--hidden-sizes-end", type=int, required=True)
+    range_parser.add_argument("--hidden-sizes-increment",
+                              type=int,
+                              required=True)
+    range_parser.add_argument("--lora-ranks-start", type=int, required=True)
+    range_parser.add_argument("--lora-ranks-end", type=int, required=True)
+    range_parser.add_argument("--lora-ranks-increment",
+                              type=int,
+                              required=True)
+    add_common_command_args(range_parser)
+    range_parser.set_defaults(func=run_range_bench)
+
+    model_parser = subparsers.add_parser("model_bench")
+    model_parser.add_argument("--models",
+                              nargs="+",
+                              type=str,
+                              default=DEFAULT_MODELS,
+                              choices=WEIGHT_SHAPES.keys())
+    model_parser.add_argument("--tp-sizes",
+                              nargs="+",
+                              type=int,
+                              default=DEFAULT_TP_SIZES)
+    model_parser.add_argument("--lora-ranks",
+                              nargs="+",
+                              type=int,
+
default=DEFAULT_LORA_RANKS) + add_common_command_args(model_parser) + model_parser.set_defaults(func=run_model_bench) + + args = parser.parse_args() + args.func(args) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 8f538c21f7f7e..1d59a01422412 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -1,6 +1,7 @@ import argparse import time from datetime import datetime +from itertools import product from typing import Any, Dict, List, Tuple, TypedDict import ray @@ -11,7 +12,10 @@ from transformers import AutoConfig from vllm.model_executor.layers.fused_moe.fused_moe import * from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils import FlexibleArgumentParser, is_navi + +FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm( +) and not is_navi() else torch.float8_e4m3fn class BenchmarkConfig(TypedDict): @@ -80,8 +84,8 @@ def benchmark_config( a1_scale = torch.randn(1, dtype=torch.float32) a2_scale = torch.randn(1, dtype=torch.float32) - w1 = w1.to(torch.float8_e4m3fn) - w2 = w2.to(torch.float8_e4m3fn) + w1 = w1.to(FP8_DTYPE) + w2 = w2.to(FP8_DTYPE) input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32) @@ -141,28 +145,172 @@ def benchmark_config( return avg -def get_configs_compute_bound() -> List[Dict[str, int]]: - # Reduced search space for faster tuning. - # TODO(woosuk): Increase the search space and use a performance model to - # prune the search space. +def get_rocm_tuning_space(use_fp16): + block_mn_range = [16, 32, 64, 128, 256] + block_k_range = [16, 32, 64, 128, 256] + if not use_fp16: + block_k_range.remove(16) # BLOCK_K=16 not supported for fp8 + num_warps_range = [1, 2, 4, 8] + group_m_range = [1, 4, 8, 16, 32] + num_stage_range = [2] + waves_per_eu_range = [0] + matrix_instr_nonkdim_range = [16, 32] if use_fp16 else [] + kpack_range = [1, 2] if use_fp16 else [] + + param_ranges = { + "BLOCK_SIZE_M": block_mn_range, + "BLOCK_SIZE_N": block_mn_range, + "BLOCK_SIZE_K": block_k_range, + "GROUP_SIZE_M": group_m_range, + "num_warps": num_warps_range, + "num_stages": num_stage_range, + "waves_per_eu": waves_per_eu_range, + } + if use_fp16: + param_ranges["matrix_instr_nonkdim"] = matrix_instr_nonkdim_range + param_ranges["kpack"] = kpack_range + + return param_ranges + + +def get_configs_compute_bound(use_fp16) -> List[Dict[str, int]]: configs: List[BenchmarkConfig] = [] - for num_stages in [2, 3, 4, 5]: - for block_m in [16, 32, 64, 128, 256]: - for block_k in [64, 128, 256]: - for block_n in [32, 64, 128, 256]: - for num_warps in [4, 8]: - for group_size in [1, 16, 32, 64]: - configs.append({ - "BLOCK_SIZE_M": block_m, - "BLOCK_SIZE_N": block_n, - "BLOCK_SIZE_K": block_k, - "GROUP_SIZE_M": group_size, - "num_warps": num_warps, - "num_stages": num_stages, - }) + + if current_platform.is_rocm(): + param_ranges = get_rocm_tuning_space(use_fp16) + else: + # Reduced search space for faster tuning. + # TODO(woosuk): Increase the search space and use a performance model to + # prune the search space. 
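For orientation: both branches below fill a param_ranges dict that get_configs_compute_bound then expands into a full Cartesian grid with itertools.product. A self-contained sketch of that expansion, using toy ranges (not part of the patch):

from itertools import product

param_ranges = {"BLOCK_SIZE_M": [16, 32], "num_warps": [4, 8]}  # toy ranges
keys, values = zip(*param_ranges.items())
configs = [dict(zip(keys, combo)) for combo in product(*values)]
assert configs[0] == {"BLOCK_SIZE_M": 16, "num_warps": 4}
assert len(configs) == 4  # 2 x 2 combinations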
+        block_m_range = [16, 32, 64, 128, 256]
+        block_n_range = [32, 64, 128, 256]
+        block_k_range = [64, 128, 256]
+        num_warps_range = [4, 8]
+        group_m_range = [1, 16, 32, 64]
+        num_stage_range = [2, 3, 4, 5]
+
+        param_ranges = {
+            "BLOCK_SIZE_M": block_m_range,
+            "BLOCK_SIZE_N": block_n_range,
+            "BLOCK_SIZE_K": block_k_range,
+            "GROUP_SIZE_M": group_m_range,
+            "num_warps": num_warps_range,
+            "num_stages": num_stage_range,
+        }
+
+    keys, values = zip(*param_ranges.items())
+    for config_values in product(*values):
+        config = dict(zip(keys, config_values))
+        configs.append(config)
     return configs
 
 
+def prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size,
+                            search_space, is_fp16):
+    N1, K1 = shard_intermediate_size, hidden_size
+    N2, K2 = hidden_size, shard_intermediate_size // 2
+    pruned_space_1 = prune_rocm_configs(num_tokens * 2, N1, K1, search_space,
+                                        is_fp16)
+    pruned_space_2 = prune_rocm_configs(num_tokens * 2, N2, K2, search_space,
+                                        is_fp16)
+    search_space = merge_unique_dicts(pruned_space_1, pruned_space_2)
+    return search_space
+
+
+# The following code is inspired by ROCm/Triton GEMM tuning script:
+# https://github.com/ROCm/triton/blob/triton-mlir/scripts/amd/gemm/tune_gemm.py#L89
+def prune_rocm_configs(M, N, K, configs, is_fp16=True):
+    pruned_configs = []
+    elemBytes_a = 2 if is_fp16 else 1
+    elemBytes_b = 2 if is_fp16 else 1
+
+    mfma = 16 if M < 32 or N < 32 else 32
+
+    # TODO (zhanglx): figure out the boundary between large and small gemms
+    large_gemm = False
+    if M >= 2048 and N >= 2048:
+        large_gemm = True
+
+    for config in configs:
+        BLOCK_SIZE_M = config.get("BLOCK_SIZE_M")
+        BLOCK_SIZE_N = config.get("BLOCK_SIZE_N")
+        BLOCK_SIZE_K = config.get("BLOCK_SIZE_K")
+        num_warps = config.get("num_warps")
+
+        if is_fp16:
+            matrix_instr_nonkdim = config.get("matrix_instr_nonkdim")
+            if matrix_instr_nonkdim > mfma:
+                continue
+        if mfma == 4 and BLOCK_SIZE_K < 64:
+            continue
+        # some layouts do not work properly when the number of
+        # elements per thread is less than 1
+        if BLOCK_SIZE_M * BLOCK_SIZE_N < 64:
+            continue
+        SPLIT_K = config.get("SPLIT_K", 1)
+        GROUP_M = config.get("GROUP_SIZE_M")
+        if is_fp16:
+            if (matrix_instr_nonkdim > BLOCK_SIZE_M
+                    or matrix_instr_nonkdim > BLOCK_SIZE_N):
+                continue
+            if (matrix_instr_nonkdim >= M
+                    and matrix_instr_nonkdim != BLOCK_SIZE_M):
+                continue
+            if (matrix_instr_nonkdim >= N
+                    and matrix_instr_nonkdim != BLOCK_SIZE_N):
+                continue
+        # Skip BLOCK_SIZE that is too large compared to M/N
+        # unless BLOCK_SIZE is already small enough
+        if M * 2 < BLOCK_SIZE_M and BLOCK_SIZE_M != 16:
+            continue
+        if N * 2 < BLOCK_SIZE_N and BLOCK_SIZE_N != 16:
+            continue
+        # skip large split_k when not necessary
+        if SPLIT_K != 1 and not need_split_k(M, N, K):
+            continue
+        # skip split_k that leads to EVEN_K = false
+        leap = SPLIT_K * BLOCK_SIZE_K
+        modv = K % leap
+        if modv != 0:
+            continue
+        # skip large GROUP_M
+        if GROUP_M * BLOCK_SIZE_M > M and GROUP_M != 1:
+            continue
+        # out of shared memory resource
+        # TODO (zhanglx): This does not consider the LDS usage in the epilogue
+        LDS = (BLOCK_SIZE_K * BLOCK_SIZE_M * elemBytes_a +
+               BLOCK_SIZE_K * BLOCK_SIZE_N * elemBytes_b)
+        if LDS > 65536:
+            continue
+        # Skip small block sizes and num_warps for large gemm
+        # For fp16 and fp8, we want to only use BLOCK_SIZE >= 64
+        if large_gemm:
+            if BLOCK_SIZE_M < 64 or BLOCK_SIZE_N < 64:
+                continue
+            if BLOCK_SIZE_K < 64:
+                continue
+            if num_warps < 4:
+                continue
+
+        pruned_configs.append(config)
+
+    return pruned_configs
+
+
+def need_split_k(SIZE_M, SIZE_N,
SIZE_K): + return (SIZE_M < 64 or SIZE_N < 64) and SIZE_K > 1024 + + +def merge_unique_dicts(list1, list2): + result = [] + combined_list = list1.copy() + combined_list.extend(list2) + for dictionary in combined_list: + if dictionary not in result: + result.append(dictionary) + return result + + @ray.remote(num_gpus=1) class BenchmarkWorker: @@ -170,6 +318,10 @@ class BenchmarkWorker: torch.set_default_device("cuda") current_platform.seed_everything(seed) self.seed = seed + # Get the device ID to allocate tensors and kernels + # on the respective GPU. This is required for Ray to work + # correctly with multi-GPU tuning on the ROCm platform. + self.device_id = int(ray.get_gpu_ids()[0]) def benchmark( self, @@ -217,25 +369,33 @@ class BenchmarkWorker: ) -> Dict[str, int]: best_config = None best_time = float("inf") - for config in tqdm(search_space): - try: - kernel_time = benchmark_config(config, - num_tokens, - num_experts, - shard_intermediate_size, - hidden_size, - topk, - dtype, - use_fp8_w8a8, - use_int8_w8a16, - num_iters=10) - except triton.runtime.autotuner.OutOfResources: - # Some configurations may be invalid and fail to compile. - continue + if current_platform.is_rocm(): + is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) + search_space = prune_rocm_search_space(num_tokens, + shard_intermediate_size, + hidden_size, search_space, + is_fp16) - if kernel_time < best_time: - best_time = kernel_time - best_config = config + with torch.cuda.device(self.device_id): + for config in tqdm(search_space): + try: + kernel_time = benchmark_config(config, + num_tokens, + num_experts, + shard_intermediate_size, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + num_iters=20) + except triton.runtime.autotuner.OutOfResources: + # Some configurations may be invalid and fail to compile. 
+                    continue
+
+                if kernel_time < best_time:
+                    best_time = kernel_time
+                    best_config = config
         now = datetime.now()
         print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
         assert best_config is not None
@@ -244,12 +404,27 @@
 
 def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
     return {
-        "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
-        "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
-        "BLOCK_SIZE_K": config["BLOCK_SIZE_K"],
-        "GROUP_SIZE_M": config["GROUP_SIZE_M"],
-        "num_warps": config["num_warps"],
-        "num_stages": config["num_stages"],
+        "BLOCK_SIZE_M":
+        config["BLOCK_SIZE_M"],
+        "BLOCK_SIZE_N":
+        config["BLOCK_SIZE_N"],
+        "BLOCK_SIZE_K":
+        config["BLOCK_SIZE_K"],
+        "GROUP_SIZE_M":
+        config["GROUP_SIZE_M"],
+        "num_warps":
+        config["num_warps"],
+        "num_stages":
+        config["num_stages"],
+        **({
+            "waves_per_eu": config["waves_per_eu"]
+        } if "waves_per_eu" in config else {}),
+        **({
+            "matrix_instr_nonkdim": config["matrix_instr_nonkdim"]
+        } if "matrix_instr_nonkdim" in config else {}),
+        **({
+            "kpack": config["kpack"]
+        } if "kpack" in config else {}),
     }
@@ -294,7 +469,7 @@ def main(args: argparse.Namespace):
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
 
     hidden_size = config.hidden_size
-    dtype = config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
@@ -322,7 +497,8 @@ def main(args: argparse.Namespace):
         return ray.get(outputs)
 
     if args.tune:
-        search_space = get_configs_compute_bound()
+        is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
+        search_space = get_configs_compute_bound(is_fp16)
         print(f"Start tuning over {len(search_space)} configurations...")
 
         start = time.time()
diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py
new file mode 100644
index 0000000000000..fee877b6f76fa
--- /dev/null
+++ b/benchmarks/kernels/utils.py
@@ -0,0 +1,210 @@
+import dataclasses
+from typing import Any, Callable, Iterable, Optional
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+
+
+@dataclasses.dataclass
+class CudaGraphBenchParams:
+    num_ops_in_cuda_graph: int
+
+
+@dataclasses.dataclass
+class ArgPool:
+    """
+    When some argument of the benchmarking function is annotated with this
+    type, the benchmarking class (Bench) will collapse the argument by
+    picking a single value from the given list of values during function
+    invocation. For every invocation during a benchmarking run, it will
+    choose a different value from the list.
+ """ + values: Iterable[Any] + + def __getitem__(self, index): + return self.values[index] + + +class Bench: + + class ArgsIterator: + + def __init__(self, args_list, kwargs_list): + assert len(args_list) == len(kwargs_list) + self.args_list = args_list + self.kwargs_list = kwargs_list + self.n = len(self.args_list) + self.idx = 0 + + def __next__(self): + while True: + yield (self.args_list[self.idx], self.kwargs_list[self.idx]) + self.idx += 1 + self.idx = self.idx % self.n + + def reset(self): + self.idx = 0 + + @property + def n_args(self): + return self.n + + def __init__(self, cuda_graph_params: Optional[CudaGraphBenchParams], + label: str, sub_label: str, description: str, fn: Callable, + *args, **kwargs): + + self.cuda_graph_params = cuda_graph_params + self.use_cuda_graph = self.cuda_graph_params is not None + self.label = label + self.sub_label = sub_label + self.description = description + self.fn = fn + + # Process args + self._args = args + self._kwargs = kwargs + self.args_list, self.kwargs_list = self.collapse_argpool( + *args, **kwargs) + self.args_iterator = self.ArgsIterator(self.args_list, + self.kwargs_list) + + # Cudagraph runner + self.g = None + if self.use_cuda_graph: + self.g = self.get_cuda_graph_runner() + + # benchmark run params + self.min_run_time = 1 + + def collapse_argpool(self, *args, **kwargs): + argpool_args = [arg for arg in args if isinstance(arg, ArgPool)] + [ + arg for arg in kwargs.values() if isinstance(arg, ArgPool) + ] + if len(argpool_args) == 0: + return [args], [kwargs] + + # Make sure all argpools are of the same size + argpool_size = len(argpool_args[0].values) + assert all([argpool_size == len(arg.values) for arg in argpool_args]) + + # create copies of the args + args_list = [] + kwargs_list = [] + for _ in range(argpool_size): + args_list.append(args) + kwargs_list.append(kwargs.copy()) + + for i in range(argpool_size): + # collapse args; Just pick the ith value + args_list[i] = tuple([ + arg[i] if isinstance(arg, ArgPool) else arg + for arg in args_list[i] + ]) + + # collapse kwargs + kwargs_i = kwargs_list[i] + arg_pool_keys = [ + k for k, v in kwargs_i.items() if isinstance(v, ArgPool) + ] + for k in arg_pool_keys: + # again just pick the ith value + kwargs_i[k] = kwargs_i[k][i] + kwargs_list[i] = kwargs_i + + return args_list, kwargs_list + + def get_cuda_graph_runner(self): + assert self.use_cuda_graph + assert self.args_iterator is not None + + num_graph_ops = self.cuda_graph_params.num_ops_in_cuda_graph + + # warmup + args_it = self.args_iterator.__next__() + for _ in range(2): + args, kwargs = next(args_it) + self.fn(*args, **kwargs) + + self.args_iterator.reset() + args_it = self.args_iterator.__next__() + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + for _ in range(num_graph_ops): + args, kwargs = next(args_it) + self.fn(*args, **kwargs) + return g + + def run_cudagrah(self) -> TMeasurement: + assert self.use_cuda_graph + globals = {'g': self.g} + + return TBenchmark.Timer( + stmt="g.replay()", + globals=globals, + label=( + f"{self.label}" + f" | cugraph {self.cuda_graph_params.num_ops_in_cuda_graph} ops" + ), + sub_label=self.sub_label, + description=self.description, + ).blocked_autorange(min_run_time=self.min_run_time) + + def run_eager(self) -> TMeasurement: + setup = None + stmt = None + globals = None + + has_arg_pool = self.args_iterator.n_args > 1 + if has_arg_pool: + setup = ''' + args_iterator.reset() + args_it = args_iterator.__next__() + 
''' + stmt = ''' + args, kwargs = next(args_it) + fn(*args, **kwargs) + ''' + globals = {'fn': self.fn, 'args_iterator': self.args_iterator} + else: + # no arg pool. Just use the args and kwargs directly + self.args_iterator.reset() + args_it = self.args_iterator.__next__() + args, kwargs = next(args_it) + + setup = "" + stmt = ''' + fn(*args, **kwargs) + ''' + globals = {'fn': self.fn, 'args': args, 'kwargs': kwargs} + + return TBenchmark.Timer( + stmt=stmt, + setup=setup, + globals=globals, + label=self.label, + sub_label=self.sub_label, + description=self.description, + ).blocked_autorange(min_run_time=self.min_run_time) + + def run(self) -> TMeasurement: + timer = None + if self.use_cuda_graph: # noqa SIM108 + timer = self.run_cudagrah() + else: + timer = self.run_eager() + if not timer.meets_confidence() or timer.has_warnings: + print("Doesn't meet confidence - re-running bench ...") + return self.run() + return timer + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + if exc_type: + print(f"exc type {exc_type}") + print(f"exc value {exc_value}") + print(f"exc traceback {traceback}") diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 40430dae10c5b..15b09395a889f 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -58,8 +58,8 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) # set(SRCS ${ORIG_SRCS}) set(CXX_SRCS ${ORIG_SRCS}) - list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$") - list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$") + list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)|(hip)$") + list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)|(hip)$") # # Generate ROCm/HIP source file names from CUDA file names. diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 839dc36ba4e29..88275dbdd83a1 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -9,8 +9,16 @@ namespace vllm { +template +__device__ __forceinline__ scalar_t compute(const scalar_t& x, + const scalar_t& y) { + return act_first ? ACT_FN(x) * y : x * ACT_FN(y); +} // Activation and gating kernel template. -template + +template __global__ void act_and_mul_kernel( scalar_t* __restrict__ out, // [..., d] const scalar_t* __restrict__ input, // [..., 2, d] @@ -19,7 +27,7 @@ __global__ void act_and_mul_kernel( for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]); - out[token_idx * d + idx] = ACT_FN(x) * y; + out[token_idx * d + idx] = compute(x, y); } } @@ -55,7 +63,9 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) { } // namespace vllm // Launch activation and gating kernel. -#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \ +// Use ACT_FIRST (bool) indicating whether to apply the activation function +// first. 
+#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, ACT_FIRST) \ int d = input.size(-1) / 2; \ int64_t num_tokens = input.numel() / input.size(-1); \ dim3 grid(num_tokens); \ @@ -64,7 +74,7 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) { const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ VLLM_DISPATCH_FLOATING_TYPES( \ input.scalar_type(), "act_and_mul_kernel", [&] { \ - vllm::act_and_mul_kernel> \ + vllm::act_and_mul_kernel, ACT_FIRST> \ <<>>(out.data_ptr(), \ input.data_ptr(), d); \ }); @@ -72,19 +82,27 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) { void silu_and_mul(torch::Tensor& out, // [..., d] torch::Tensor& input) // [..., 2 * d] { - LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel); + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, true); +} + +void mul_and_silu(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., 2 * d] +{ + // The difference between mul_and_silu and silu_and_mul is that mul_and_silu + // applies the silu to the latter half of the input. + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, false); } void gelu_and_mul(torch::Tensor& out, // [..., d] torch::Tensor& input) // [..., 2 * d] { - LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel); + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel, true); } void gelu_tanh_and_mul(torch::Tensor& out, // [..., d] torch::Tensor& input) // [..., 2 * d] { - LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel); + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel, true); } namespace vllm { diff --git a/csrc/core/scalar_type.hpp b/csrc/core/scalar_type.hpp index 408e736d5bc0f..c2ae554c9f8e8 100644 --- a/csrc/core/scalar_type.hpp +++ b/csrc/core/scalar_type.hpp @@ -32,7 +32,7 @@ class ScalarType { signed_(signed_), bias(bias), finite_values_only(finite_values_only), - nan_repr(nan_repr){}; + nan_repr(nan_repr) {}; static constexpr ScalarType int_(uint8_t size_bits, int32_t bias = 0) { return ScalarType(0, size_bits - 1, true, bias); diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp index 28db0479748bf..a71815106133a 100644 --- a/csrc/cpu/cpu_types.hpp +++ b/csrc/cpu/cpu_types.hpp @@ -2,13 +2,13 @@ #define CPU_TYPES_HPP #if defined(__x86_64__) - //x86 implementation + // x86 implementation #include "cpu_types_x86.hpp" #elif defined(__POWER9_VECTOR__) - //ppc implementation + // ppc implementation #include "cpu_types_vsx.hpp" #elif defined(__aarch64__) - //arm implementation + // arm implementation #include "cpu_types_arm.hpp" #else #warning "unsupported vLLM cpu implementation" diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp index ae062a5b86892..990e99f2fc069 100644 --- a/csrc/cpu/cpu_types_arm.hpp +++ b/csrc/cpu/cpu_types_arm.hpp @@ -1,48 +1,50 @@ #include -#include +#include #include namespace vec_op { #ifdef ARM_BF16_SUPPORT - #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) #else - #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) 
\ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) #endif -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) #ifndef CPU_OP_GUARD -#define CPU_KERNEL_GUARD_IN(NAME) -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) #else -#define CPU_KERNEL_GUARD_IN(NAME) \ - std::cout << #NAME << " invoked." << std::endl; -#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; + #define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; + #define CPU_KERNEL_GUARD_OUT(NAME) \ + std::cout << #NAME << " exit." << std::endl; #endif #define FORCE_INLINE __attribute__((always_inline)) inline namespace { - template - constexpr void unroll_loop_item(std::integer_sequence, F &&f) { - (f(std::integral_constant{}), ...); - }; -}; +template +constexpr void unroll_loop_item(std::integer_sequence, F&& f) { + (f(std::integral_constant{}), ...); +}; +}; // namespace template >> -constexpr void unroll_loop(F &&f) { +constexpr void unroll_loop(F&& f) { unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); } -template struct Vec { +template +struct Vec { constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }; }; @@ -54,127 +56,124 @@ struct FP16Vec8 : public Vec { float16x8_t reg; - explicit FP16Vec8(const void *ptr) - : reg(vld1q_f16(static_cast(ptr))) {}; + explicit FP16Vec8(const void* ptr) + : reg(vld1q_f16(static_cast(ptr))) {}; - explicit FP16Vec8(const FP32Vec8 &); + explicit FP16Vec8(const FP32Vec8&); - void save(void *ptr) const { - vst1q_f16(static_cast<__fp16 *>(ptr), reg); - } + void save(void* ptr) const { vst1q_f16(static_cast<__fp16*>(ptr), reg); } }; struct FP16Vec16 : public Vec { - constexpr static int VEC_ELEM_NUM = 16; - - float16x8x2_t reg; - - explicit FP16Vec16(const void *ptr) { - reg.val[0] = vld1q_f16(reinterpret_cast(ptr)); - reg.val[1] = vld1q_f16(reinterpret_cast(ptr) + 8); - } - - explicit FP16Vec16(const FP32Vec16& vec); - - void save(void *ptr) const { - vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); - vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); - } - - void save(void *ptr, const int elem_num) const { - int full_blocks = elem_num / 8; - int remainder = elem_num % 8; - - if (full_blocks > 0) { - vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); - if (full_blocks > 1) { - vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); - } - } + constexpr static int VEC_ELEM_NUM = 16; - // Note: below is the unrolled version of the following code: - // - // for (int i = 0; i < remainder; ++i) { - // reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = - // vgetq_lane_f16(temp, i); - // } - // - // For macOS build (Clang), the arm/neon intrinsics function - // `vgetq_lane_f16` needs the parameter `i` to be constant at compile - // time. 
- - if (remainder > 0) { - float16x8_t temp = reg.val[full_blocks]; - __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr); - switch (remainder) - { - case 1: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - break; - case 2: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - break; - case 3: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - break; - case 4: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); - break; - case 5: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); - fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); - break; - case 6: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); - fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); - fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); - break; - case 7: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); - fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); - fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); - fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6); - break; - - default: - break; - } - } + float16x8x2_t reg; + + explicit FP16Vec16(const void* ptr) { + reg.val[0] = vld1q_f16(reinterpret_cast(ptr)); + reg.val[1] = vld1q_f16(reinterpret_cast(ptr) + 8); + } + + explicit FP16Vec16(const FP32Vec16& vec); + + void save(void* ptr) const { + vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); + vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + } + + void save(void* ptr, const int elem_num) const { + int full_blocks = elem_num / 8; + int remainder = elem_num % 8; + + if (full_blocks > 0) { + vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); + if (full_blocks > 1) { + vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + } } + + // Note: below is the unrolled version of the following code: + // + // for (int i = 0; i < remainder; ++i) { + // reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = + // vgetq_lane_f16(temp, i); + // } + // + // For macOS build (Clang), the arm/neon intrinsics function + // `vgetq_lane_f16` needs the parameter `i` to be constant at compile + // time. 
+ + if (remainder > 0) { + float16x8_t temp = reg.val[full_blocks]; + __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr); + switch (remainder) { + case 1: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + break; + case 2: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + break; + case 3: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + break; + case 4: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + break; + case 5: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + break; + case 6: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); + break; + case 7: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); + fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6); + break; + + default: + break; + } + } + } }; - #ifdef ARM_BF16_SUPPORT struct BF16Vec8 : public Vec { constexpr static int VEC_ELEM_NUM = 8; bfloat16x8_t reg; - explicit BF16Vec8(const void *ptr) - : reg(*reinterpret_cast(ptr)) {}; + explicit BF16Vec8(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; explicit BF16Vec8(bfloat16x8_t data) : reg(data) {}; - explicit BF16Vec8(const FP32Vec8 &); + explicit BF16Vec8(const FP32Vec8&); - explicit BF16Vec8(float32x4x2_t v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {}; + explicit BF16Vec8(float32x4x2_t v) + : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {}; - void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } }; struct BF16Vec16 : public Vec { @@ -182,19 +181,18 @@ struct BF16Vec16 : public Vec { bfloat16x8x2_t reg; - explicit BF16Vec16(const void *ptr) - : reg(*reinterpret_cast(ptr)) {}; + explicit BF16Vec16(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {}; - explicit BF16Vec16(const FP32Vec16 &); + explicit BF16Vec16(const FP32Vec16&); - explicit BF16Vec16(float32x4x4_t v) : reg({ - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]), - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3]) - }){}; + explicit BF16Vec16(float32x4x4_t v) + : reg({vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]), + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3])}) {}; - void save(void *ptr) const { *reinterpret_cast(ptr) = reg; }; + void save(void* ptr) const { 
*reinterpret_cast(ptr) = reg; }; }; struct BF16Vec32 : public Vec { @@ -202,19 +200,15 @@ struct BF16Vec32 : public Vec { bfloat16x8x4_t reg; - explicit BF16Vec32(const void *ptr) - : reg(*reinterpret_cast(ptr)) {}; + explicit BF16Vec32(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; explicit BF16Vec32(bfloat16x8x4_t data) : reg(data) {}; - explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ - vec8_data.reg, - vec8_data.reg, - vec8_data.reg, - vec8_data.reg - }) {}; + explicit BF16Vec32(const BF16Vec8& vec8_data) + : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {}; - void save(void *ptr) const { *reinterpret_cast(ptr) = reg; }; + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; }; }; #endif @@ -232,11 +226,11 @@ struct FP32Vec4 : public Vec { explicit FP32Vec4() : reg(vdupq_n_f32(0.0f)) {}; - explicit FP32Vec4(const float *ptr) : reg(vld1q_f32(ptr)) {}; + explicit FP32Vec4(const float* ptr) : reg(vld1q_f32(ptr)) {}; explicit FP32Vec4(float32x4_t data) : reg(data) {}; - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}; + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {}; }; struct FP32Vec8 : public Vec { @@ -252,32 +246,37 @@ struct FP32Vec8 : public Vec { explicit FP32Vec8() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {}; - explicit FP32Vec8(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {}; + explicit FP32Vec8(const float* ptr) + : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {}; explicit FP32Vec8(float32x4x2_t data) : reg(data) {}; - explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}; + explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {}; - explicit FP32Vec8(const FP16Vec8 &v) { - reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg)); - reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg)); - }; + explicit FP32Vec8(const FP16Vec8& v) { + reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg)); + reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg)); + }; - explicit FP32Vec8(float16x8_t v) : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {}; + explicit FP32Vec8(float16x8_t v) + : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {}; - #ifdef ARM_BF16_SUPPORT +#ifdef ARM_BF16_SUPPORT - explicit FP32Vec8(bfloat16x8_t v) : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {}; + explicit FP32Vec8(bfloat16x8_t v) + : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {}; - explicit FP32Vec8(const BF16Vec8 &v) : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {}; + explicit FP32Vec8(const BF16Vec8& v) + : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {}; - #endif +#endif float reduce_sum() const { AliasReg ar; ar.reg = reg; float answer = 0; - unroll_loop([&answer, &ar](int i) { answer += ar.values[i]; }); + unroll_loop( + [&answer, &ar](int i) { answer += ar.values[i]; }); return answer; } @@ -324,10 +323,14 @@ struct FP32Vec8 : public Vec { AliasReg ar; ar.reg = reg; - float32x2_t er_vec0 = {static_cast(erf(ar.values[0])), static_cast(erf(ar.values[1]))}; - float32x2_t er_vec1 = {static_cast(erf(ar.values[2])), static_cast(erf(ar.values[3]))}; - float32x2_t er_vec2 = {static_cast(erf(ar.values[4])), static_cast(erf(ar.values[5]))}; - float32x2_t er_vec3 = {static_cast(erf(ar.values[6])), static_cast(erf(ar.values[7]))}; + float32x2_t er_vec0 = {static_cast(erf(ar.values[0])), + static_cast(erf(ar.values[1]))}; + float32x2_t er_vec1 = {static_cast(erf(ar.values[2])), + static_cast(erf(ar.values[3]))}; + float32x2_t er_vec2 = {static_cast(erf(ar.values[4])), + 
static_cast<float>(erf(ar.values[5]))}; + float32x2_t er_vec3 = {static_cast<float>(erf(ar.values[6])), + static_cast<float>(erf(ar.values[7]))}; float32x4_t result0 = vcombine_f32(er_vec0, er_vec1); float32x4_t result1 = vcombine_f32(er_vec2, er_vec3); @@ -337,25 +340,29 @@ struct FP32Vec8 : public Vec<FP32Vec8> { result.val[1] = result1; return FP32Vec8(result); - } - - FP32Vec8 operator*(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]), vmulq_f32(reg.val[1], b.reg.val[1])})); } - FP32Vec8 operator+(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]), vaddq_f32(reg.val[1], b.reg.val[1])})); + FP32Vec8 operator*(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]), + vmulq_f32(reg.val[1], b.reg.val[1])})); } - FP32Vec8 operator-(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]), vsubq_f32(reg.val[1], b.reg.val[1])})); + FP32Vec8 operator+(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]), + vaddq_f32(reg.val[1], b.reg.val[1])})); } - FP32Vec8 operator/(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]), vdivq_f32(reg.val[1], b.reg.val[1])})); + FP32Vec8 operator-(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]), + vsubq_f32(reg.val[1], b.reg.val[1])})); } - void save(float *ptr) const { + FP32Vec8 operator/(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]), + vdivq_f32(reg.val[1], b.reg.val[1])})); + } + + void save(float* ptr) const { vst1q_f32(ptr, reg.val[0]); vst1q_f32(ptr + 4, reg.val[1]); } @@ -370,103 +377,100 @@ struct FP32Vec16 : public Vec<FP32Vec16> { float32x4x4_t reg; - explicit FP32Vec16(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {} + explicit FP32Vec16(float v) + : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {} - explicit FP32Vec16() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {} + explicit FP32Vec16() + : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0), + vmovq_n_f32(0.0)}) {} - explicit FP32Vec16(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), vld1q_f32(ptr + 12)}) {} + explicit FP32Vec16(const float* ptr) + : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), + vld1q_f32(ptr + 12)}) {} explicit FP32Vec16(float32x4x4_t data) : reg(data) {} - explicit FP32Vec16(const FP32Vec8 &data) { - reg.val[0] = data.reg.val[0]; - reg.val[1] = data.reg.val[1]; - reg.val[2] = data.reg.val[0]; - reg.val[3] = data.reg.val[1]; + explicit FP32Vec16(const FP32Vec8& data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[0]; + reg.val[3] = data.reg.val[1]; } - explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {} + explicit FP32Vec16(const FP32Vec16& data) : reg(data.reg) {} - explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v.reg)) {} + explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v.reg)) {} - #ifdef ARM_BF16_SUPPORT - explicit FP32Vec16(bfloat16x8x2_t v) : reg({ - vcvtq_low_f32_bf16(v.val[0]), - vcvtq_high_f32_bf16(v.val[0]), - vcvtq_low_f32_bf16(v.val[1]), - vcvtq_high_f32_bf16(v.val[1]) - }) {}; - #endif +#ifdef ARM_BF16_SUPPORT + explicit FP32Vec16(bfloat16x8x2_t v) + : reg({vcvtq_low_f32_bf16(v.val[0]), vcvtq_high_f32_bf16(v.val[0]), + vcvtq_low_f32_bf16(v.val[1]), vcvtq_high_f32_bf16(v.val[1])}) {}; +#endif - explicit FP32Vec16(const FP32Vec4 &data) { + explicit FP32Vec16(const FP32Vec4& data) { reg.val[0] = data.reg; reg.val[1] = data.reg; reg.val[2] = data.reg; reg.val[3] = data.reg; }; - #ifdef ARM_BF16_SUPPORT - explicit FP32Vec16(const BF16Vec16 &v) : reg({ - vcvtq_low_f32_bf16(v.reg.val[0]), - vcvtq_high_f32_bf16(v.reg.val[0]), - vcvtq_low_f32_bf16(v.reg.val[1]), - vcvtq_high_f32_bf16(v.reg.val[1]) - }) {}; +#ifdef ARM_BF16_SUPPORT + explicit FP32Vec16(const BF16Vec16& v) + : reg({vcvtq_low_f32_bf16(v.reg.val[0]), + vcvtq_high_f32_bf16(v.reg.val[0]), + vcvtq_low_f32_bf16(v.reg.val[1]), + vcvtq_high_f32_bf16(v.reg.val[1])}) {}; - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}; - #endif + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}; +#endif - explicit FP32Vec16(const FP16Vec16 &v) { - reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0])); - reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0])); - reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1])); - reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1])); + explicit FP32Vec16(const FP16Vec16& v) { + reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0])); + reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0])); + reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1])); + reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1])); }; - FP32Vec16 operator+(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vaddq_f32(reg.val[0], b.reg.val[0]), - vaddq_f32(reg.val[1], b.reg.val[1]), - vaddq_f32(reg.val[2], b.reg.val[2]), - vaddq_f32(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator+(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vaddq_f32(reg.val[0], b.reg.val[0]), + vaddq_f32(reg.val[1], b.reg.val[1]), + vaddq_f32(reg.val[2], b.reg.val[2]), + vaddq_f32(reg.val[3], b.reg.val[3])})); }; - FP32Vec16 operator*(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vmulq_f32(reg.val[0], b.reg.val[0]), - vmulq_f32(reg.val[1], b.reg.val[1]), - vmulq_f32(reg.val[2], b.reg.val[2]), - vmulq_f32(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator*(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vmulq_f32(reg.val[0], b.reg.val[0]), + vmulq_f32(reg.val[1], b.reg.val[1]), + vmulq_f32(reg.val[2], b.reg.val[2]), + vmulq_f32(reg.val[3], b.reg.val[3])})); }; - FP32Vec16 operator-(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vsubq_f32(reg.val[0], b.reg.val[0]), - vsubq_f32(reg.val[1], b.reg.val[1]), - vsubq_f32(reg.val[2], b.reg.val[2]), - vsubq_f32(reg.val[3], b.reg.val[3]) - })); + FP32Vec16 operator-(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vsubq_f32(reg.val[0], b.reg.val[0]), + vsubq_f32(reg.val[1], b.reg.val[1]), + vsubq_f32(reg.val[2], b.reg.val[2]), + vsubq_f32(reg.val[3], b.reg.val[3])})); }; - FP32Vec16 operator/(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vdivq_f32(reg.val[0], b.reg.val[0]), - vdivq_f32(reg.val[1], b.reg.val[1]), - vdivq_f32(reg.val[2], b.reg.val[2]), - vdivq_f32(reg.val[3], b.reg.val[3]) - })); + FP32Vec16 operator/(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vdivq_f32(reg.val[0], b.reg.val[0]), + vdivq_f32(reg.val[1], b.reg.val[1]), + vdivq_f32(reg.val[2], b.reg.val[2]), + vdivq_f32(reg.val[3], b.reg.val[3])})); }; float reduce_sum() const { AliasReg ar; ar.reg = reg; float answer = 0; - unroll_loop<int, VEC_ELEM_NUM>([&answer, &ar](int i) { answer += ar.values[i]; }); + unroll_loop<int, VEC_ELEM_NUM>( + [&answer, &ar](int i) { answer += ar.values[i]; }); return answer; }; - template <int group_size> float reduce_sub_sum(int idx) { + template <int group_size> + float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); AliasReg ar; @@ -479,7 +483,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> { return answer; }; - void save(float *ptr) const { + void save(float* ptr) const { vst1q_f32(ptr, reg.val[0]); vst1q_f32(ptr + 4, reg.val[1]); vst1q_f32(ptr + 8, reg.val[2]); @@ -487,43 +491,59 @@ struct FP32Vec16 : public Vec<FP32Vec16> { }; }; -template <typename T> struct VecType { using vec_type = void; }; +template <typename T> +struct VecType { + using vec_type = void; +}; -template <typename T> using vec_t = typename VecType<T>::vec_type; +template <typename T> +using vec_t = typename VecType<T>::vec_type; -template <> struct VecType<float> { using vec_type = FP32Vec8; }; +template <> +struct VecType<float> { + using vec_type = FP32Vec8; +}; -template <> struct VecType<c10::Half> { using vec_type = FP16Vec8; }; +template <> +struct VecType<c10::Half> { + using vec_type = FP16Vec8; +}; #ifdef ARM_BF16_SUPPORT -template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; }; +template <> +struct VecType<c10::BFloat16> { + using vec_type = BF16Vec8; +}; #endif -template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; } - -template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) { - *reinterpret_cast<__fp16 *>(ptr) = v; +template <typename T> +void storeFP32(float v, T* ptr) { + *ptr = v; } -inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) { - float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]); - float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]); - float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]); - float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]); +template <> +inline void storeFP32<c10::Half>(float v, c10::Half* ptr) { + *reinterpret_cast<__fp16*>(ptr) = v; +} - reg.val[0] = vcombine_f16(low_0, high_0); - reg.val[1] = vcombine_f16(low_1, high_1); +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) { + float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]); + float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]); + float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]); + float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]); + + reg.val[0] = vcombine_f16(low_0, high_0); + reg.val[1] = vcombine_f16(low_1, high_1); }; -inline FP16Vec8 :: FP16Vec8(const FP32Vec8 &v) { - float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]); - float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]); +inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) { + float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]); + float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]); - reg = vcombine_f16(lower_half, upper_half); + reg = vcombine_f16(lower_half, upper_half); }; -inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { - +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a.reg.val[0], b.reg.val[0]); acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a.reg.val[1], b.reg.val[1]); acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a.reg.val[2], b.reg.val[2]); @@ -531,8 +551,7 @@ inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { }; #ifdef ARM_BF16_SUPPORT -inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { - +inline void fma(FP32Vec16& acc, BF16Vec32& a, BF16Vec32& b) { float32x4_t a0_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[0])); float32x4_t a0_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[0])); float32x4_t a1_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[1])); @@ -551,22 +570,22 @@ inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { #endif #ifdef ARM_BF16_SUPPORT -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) {}; +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) + : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) { + }; -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) : reg({ - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]), - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), v.reg.val[3]) - }){}; +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) + : reg({vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]), + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), + v.reg.val[3])}) {}; #endif -inline void prefetch(const void *addr) { - __builtin_prefetch(addr, 0, 1); -}; +inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 1); }; #ifdef ARM_BF16_SUPPORT template <> -inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) { - *reinterpret_cast<__bf16 *>(ptr) = vcvth_bf16_f32(v); +inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) { + *reinterpret_cast<__bf16*>(ptr) = vcvth_bf16_f32(v); }; #endif -}; \ No newline at end of file +}; // namespace vec_op \ No newline at end of file diff --git a/csrc/cpu/cpu_types_vsx.hpp b/csrc/cpu/cpu_types_vsx.hpp index b50bdadc5713d..a8e1be37eb418 100644 --- a/csrc/cpu/cpu_types_vsx.hpp +++ b/csrc/cpu/cpu_types_vsx.hpp @@ -9,38 +9,40 @@ namespace vec_op { // FIXME: FP16 is not fully supported in Torch-CPU -#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) #ifndef CPU_OP_GUARD -#define CPU_KERNEL_GUARD_IN(NAME) -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) #else -#define CPU_KERNEL_GUARD_IN(NAME) \ - std::cout << #NAME << " invoked." << std::endl; -#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; + #define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; + #define CPU_KERNEL_GUARD_OUT(NAME) \ + std::cout << #NAME << " exit." << std::endl; #endif #define FORCE_INLINE __attribute__((always_inline)) inline namespace { template <typename T, T... indexes, typename F> -constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) { +constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) { (f(std::integral_constant<T, indexes>{}), ...); } -}; // namespace +}; // namespace template <typename T, T count, typename F, typename = std::enable_if_t<std::is_invocable_v<F, T>>> -constexpr void unroll_loop(F &&f) { +constexpr void unroll_loop(F&& f) { unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f)); } -template <typename T> struct Vec { +template <typename T> +struct Vec { constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } }; @@ -68,12 +70,14 @@ struct BF16Vec8 : public Vec<BF16Vec8> { __vector signed short reg; - explicit BF16Vec8(const void *ptr) - : reg((__vector signed short)vec_xl(0, (__vector signed short *)ptr)) {} + explicit BF16Vec8(const void* ptr) + : reg((__vector signed short)vec_xl(0, (__vector signed short*)ptr)) {} - explicit BF16Vec8(const FP32Vec8 &); + explicit BF16Vec8(const FP32Vec8&); - void save(void *ptr) const { *reinterpret_cast<__vector signed short *>(ptr) = reg; } + void save(void* ptr) const { + *reinterpret_cast<__vector signed short*>(ptr) = reg; + } }; struct BF16Vec16 : public Vec<BF16Vec16> { @@ -81,18 +85,18 @@ struct BF16Vec16 : public Vec<BF16Vec16> { ss16x8x2_t reg; - explicit BF16Vec16(const void *ptr) { + explicit BF16Vec16(const void* ptr) { // Load 256 bits in two parts - reg.val[0] = (__vector signed short)vec_xl(0, (signed short *)ptr); - reg.val[1] = (__vector signed short)vec_xl(16, (signed short *)ptr); + reg.val[0] = (__vector signed short)vec_xl(0, (signed short*)ptr); + reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)ptr); } - explicit BF16Vec16(const FP32Vec16 &); + explicit BF16Vec16(const FP32Vec16&); - void save(void *ptr) const { + void save(void* ptr) const { // Save 256 bits in two parts - vec_xst(reg.val[0], 0, (signed short *)ptr); - vec_xst(reg.val[1], 16, (signed short *)ptr); + vec_xst(reg.val[0], 0, (signed short*)ptr); + vec_xst(reg.val[1], 16, (signed short*)ptr); } }; @@ -102,19 +106,15 @@ struct BF16Vec32 : public Vec<BF16Vec32> { constexpr static int VEC_ELEM_NUM = 32; ss16x8x4_t reg; - explicit BF16Vec32(const void *ptr) - : reg(*reinterpret_cast<const ss16x8x4_t *>(ptr)) {} + explicit BF16Vec32(const void* ptr) + : reg(*reinterpret_cast<const ss16x8x4_t*>(ptr)) {} explicit BF16Vec32(ss16x8x4_t data) : reg(data) {} - explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ - vec8_data.reg, - vec8_data.reg, - vec8_data.reg, - vec8_data.reg - }) {} + explicit BF16Vec32(const BF16Vec8& vec8_data) + : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {} - void save(void *ptr) const { *reinterpret_cast<ss16x8x4_t *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<ss16x8x4_t*>(ptr) = reg; } }; struct FP32Vec4 : public Vec<FP32Vec4> { @@ -130,11 +130,11 @@ struct FP32Vec4 : public Vec<FP32Vec4> { explicit FP32Vec4() : reg(vec_splats(0.0f)) {} - explicit FP32Vec4(const float *ptr) : reg(vec_xl(0, ptr)) {} + explicit FP32Vec4(const float* ptr) : reg(vec_xl(0, ptr)) {} explicit FP32Vec4(__vector float data) : reg(data) {} - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {} }; struct FP32Vec8 : public Vec<FP32Vec8> { @@ -156,19 +156,19 @@ struct FP32Vec8 : public Vec<FP32Vec8> { reg.val[1] = vec_splats(0.0f); } - explicit FP32Vec8(const float *ptr) { + explicit FP32Vec8(const float* ptr) { reg.val[0] = vec_xl(0, ptr); reg.val[1] = vec_xl(16, ptr); } explicit FP32Vec8(f32x4x2_t data) : reg(data) {} - explicit FP32Vec8(const FP32Vec8 &data) { + explicit FP32Vec8(const FP32Vec8& data) { reg.val[0] = data.reg.val[0]; reg.val[1] = data.reg.val[1]; } - explicit FP32Vec8(const BF16Vec8 &v) { + explicit FP32Vec8(const BF16Vec8& v) { reg.val[0] = (__vector float)vec_mergeh(zero, v.reg); reg.val[1] = (__vector float)vec_mergel(zero, v.reg); } @@ -177,7 +177,8 @@ struct FP32Vec8 : public Vec<FP32Vec8> { AliasReg ar; ar.reg = reg; float result = 0; - unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; }); + unroll_loop<int, VEC_ELEM_NUM>( + [&result, &ar](int i) { result += ar.values[i]; }); return result; } @@ -230,23 +231,27 @@ struct FP32Vec8 : public Vec<FP32Vec8> { return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); } - FP32Vec8 operator*(const FP32Vec8 &b) const { - return FP32Vec8({vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator*(const FP32Vec8& b) const { + return FP32Vec8( + {vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); } - FP32Vec8 operator+(const FP32Vec8 &b) const { - return FP32Vec8({vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator+(const FP32Vec8& b) const { + return FP32Vec8( + {vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); } - FP32Vec8 operator-(const FP32Vec8 &b) const { - return FP32Vec8({vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator-(const FP32Vec8& b) const { + return FP32Vec8( + {vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); } - FP32Vec8 operator/(const FP32Vec8 &b) const { - return FP32Vec8({vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator/(const FP32Vec8& b) const { + return FP32Vec8( + {vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); } - void save(float *ptr) const { + void save(float* ptr) const { vec_xst(reg.val[0], 0, ptr); vec_xst(reg.val[1], 16, ptr); } @@ -275,7 +280,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> { reg.val[3] = vec_splats(0.0f); } - explicit FP32Vec16(const float *ptr) { + explicit FP32Vec16(const float* ptr) { reg.val[0] = vec_xl(0, ptr); reg.val[1] = vec_xl(16, ptr); reg.val[2] = vec_xl(32, ptr); @@ -284,78 +289,76 @@ struct FP32Vec16 : public Vec<FP32Vec16> { explicit FP32Vec16(f32x4x4_t data) : reg(data) {} - explicit FP32Vec16(const FP32Vec16 &data) { + explicit FP32Vec16(const FP32Vec16& data) { reg.val[0] = data.reg.val[0]; reg.val[1] = data.reg.val[1]; reg.val[2] = data.reg.val[2]; reg.val[3] = data.reg.val[3]; } - explicit FP32Vec16(const FP32Vec4 &data) { + explicit FP32Vec16(const FP32Vec4& data) { reg.val[0] = data.reg; reg.val[1] = data.reg; reg.val[2] = data.reg; reg.val[3] = data.reg; } - explicit FP32Vec16(const FP32Vec8 &data) { + explicit FP32Vec16(const FP32Vec8& data) { reg.val[0] = data.reg.val[0]; reg.val[1] = data.reg.val[1]; reg.val[2] = data.reg.val[0]; reg.val[3] = data.reg.val[1]; } - explicit FP32Vec16(const BF16Vec16 &v) { + explicit FP32Vec16(const BF16Vec16& v) { reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]); reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]); reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]); reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]); } - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - FP32Vec16 operator*(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_mul(reg.val[0], b.reg.val[0]), - vec_mul(reg.val[1], b.reg.val[1]), - vec_mul(reg.val[2], b.reg.val[2]), - vec_mul(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator*(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_mul(reg.val[0], b.reg.val[0]), + vec_mul(reg.val[1], b.reg.val[1]), + vec_mul(reg.val[2], b.reg.val[2]), + vec_mul(reg.val[3], b.reg.val[3])})); } - FP32Vec16 operator+(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_add(reg.val[0], b.reg.val[0]), - vec_add(reg.val[1], b.reg.val[1]), - vec_add(reg.val[2], b.reg.val[2]), - vec_add(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator+(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_add(reg.val[0], b.reg.val[0]), + vec_add(reg.val[1], b.reg.val[1]), + vec_add(reg.val[2], b.reg.val[2]), + vec_add(reg.val[3], b.reg.val[3])})); } - FP32Vec16 operator-(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_sub(reg.val[0], b.reg.val[0]), - vec_sub(reg.val[1], b.reg.val[1]), - vec_sub(reg.val[2], b.reg.val[2]), - vec_sub(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator-(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_sub(reg.val[0], b.reg.val[0]), + vec_sub(reg.val[1], b.reg.val[1]), + vec_sub(reg.val[2], b.reg.val[2]), + vec_sub(reg.val[3], b.reg.val[3])})); } - FP32Vec16 operator/(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_div(reg.val[0], b.reg.val[0]), - vec_div(reg.val[1], b.reg.val[1]), - vec_div(reg.val[2], b.reg.val[2]), - vec_div(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator/(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_div(reg.val[0], b.reg.val[0]), + vec_div(reg.val[1], b.reg.val[1]), + vec_div(reg.val[2], b.reg.val[2]), + vec_div(reg.val[3], b.reg.val[3])})); } float reduce_sum() const { AliasReg ar; ar.reg = reg; float result = 0; - unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; }); + unroll_loop<int, VEC_ELEM_NUM>( + [&result, &ar](int i) { result += ar.values[i]; }); return result; } - template <int group_size> float reduce_sub_sum(int idx) { + template <int group_size> + float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); AliasReg ar; @@ -368,7 +371,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> { return result; } - void save(float *ptr) const { + void save(float* ptr) const { vec_xst(reg.val[0], 0, ptr); vec_xst(reg.val[1], 16, ptr); vec_xst(reg.val[2], 32, ptr); @@ -376,43 +379,62 @@ struct FP32Vec16 : public Vec<FP32Vec16> { } }; -template <typename T> struct VecType { using vec_type = void; }; +template <typename T> +struct VecType { + using vec_type = void; +}; -template <typename T> using vec_t = typename VecType<T>::vec_type; +template <typename T> +using vec_t = typename VecType<T>::vec_type; -template <> struct VecType<float> { using vec_type = FP32Vec8; }; +template <> +struct VecType<float> { + using vec_type = FP32Vec8; +}; -template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; }; +template <> +struct VecType<c10::BFloat16> { + using vec_type = BF16Vec8; +}; -template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; } +template <typename T> +void storeFP32(float v, T* ptr) { + *ptr = v; +} -inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { acc = acc + a * b; } -template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) { - c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = - reinterpret_cast<c10::BFloat16 *>(&v); +template <> +inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) { + c10::BFloat16 __attribute__((__may_alias__))* v_ptr = + reinterpret_cast<c10::BFloat16*>(&v); *ptr = *(v_ptr + 1); } #ifndef __VEC_CLASS_FP_NAN -#define __VEC_CLASS_FP_NAN (1 << 6) + #define __VEC_CLASS_FP_NAN (1 << 6) #endif -const static __vector unsigned char omask = { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; +const static __vector unsigned char omask = {0, 1, 4, 5, 8, 9, 12, 13, + 16, 17,
20, 21, 24, 25, 28, 29}; #ifndef _ARCH_PWR10 -const static __vector unsigned int bias = { 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff }; -const static __vector unsigned int nan = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 }; -const static __vector unsigned int sh16 = { 16, 16, 16, 16 }; -const static __vector unsigned int one = { 1, 1, 1, 1 }; +const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff, + 0x00007fff}; +const static __vector unsigned int nan = {0x7fc00000, 0x7fc00000, 0x7fc00000, + 0x7fc00000}; +const static __vector unsigned int sh16 = {16, 16, 16, 16}; +const static __vector unsigned int one = {1, 1, 1, 1}; #endif -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) { #ifdef _ARCH_PWR10 __vector signed short ret[2]; - ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); - ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); + ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[0]); + ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[1]); reg = vec_perm(ret[0], ret[1], omask); #elif defined(_ARCH_PWR9) __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); @@ -425,8 +447,10 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { __vector unsigned int rnd1 = vec_add(lsb1, bias); inp0 = vec_add(inp0, rnd0); inp1 = vec_add(inp1, rnd1); - __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); - __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); + __vector __bool int sel0 = + vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); + __vector __bool int sel1 = + vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); inp0 = vec_sel(inp0, nan, sel0); inp1 = vec_sel(inp1, nan, sel1); inp0 = vec_sr(inp0, sh16); @@ -435,13 +459,17 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { #endif } -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { #ifdef _ARCH_PWR10 __vector signed short ret[4]; - ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); - ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); - ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[2]); - ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[3]); + ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[0]); + ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[1]); + ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[2]); + ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[3]); reg.val[0] = vec_perm(ret[0], ret[1], omask); reg.val[1] = vec_perm(ret[2], ret[3], omask); #elif defined(_ARCH_PWR9) @@ -465,10 +493,14 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { inp1 = vec_add(inp1, rnd1); inp2 = vec_add(inp2, rnd2); inp3 = vec_add(inp3, rnd3); - __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); - __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); - __vector __bool int sel2 = vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN); - __vector __bool int sel3 = 
vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN); + __vector __bool int sel0 = + vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); + __vector __bool int sel1 = + vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); + __vector __bool int sel2 = + vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN); + __vector __bool int sel3 = + vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN); inp0 = vec_sel(inp0, nan, sel0); inp1 = vec_sel(inp1, nan, sel1); inp2 = vec_sel(inp2, nan, sel2); @@ -482,10 +514,10 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { #endif } -inline void prefetch(const void *addr) { +inline void prefetch(const void* addr) { __asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory"); } -}; // namespace vec_op +}; // namespace vec_op #endif diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index 4bb4eb0f491ac..a4ef2be2a58ca 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -11,39 +11,40 @@ static_assert(false, "AVX2 must be supported for the current implementation."); namespace vec_op { -#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) #ifndef CPU_OP_GUARD -#define CPU_KERNEL_GUARD_IN(NAME) -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) #else -#define CPU_KERNEL_GUARD_IN(NAME) \ - RECORD_FUNCTION(#NAME, c10::ArrayRef<c10::IValue>({})); -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) \ + RECORD_FUNCTION(#NAME, c10::ArrayRef<c10::IValue>({})); + #define CPU_KERNEL_GUARD_OUT(NAME) #endif #define FORCE_INLINE __attribute__((always_inline)) inline namespace { template <typename T, T... indexes, typename F> -constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) { +constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) { (f(std::integral_constant<T, indexes>{}), ...); } -}; // namespace +}; // namespace template <typename T, T count, typename F, typename = std::enable_if_t<std::is_invocable_v<F, T>>> -constexpr void unroll_loop(F &&f) { +constexpr void unroll_loop(F&& f) { unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f)); } -template <typename T> struct Vec { +template <typename T> +struct Vec { constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } }; @@ -55,12 +56,12 @@ struct FP16Vec8 : public Vec<FP16Vec8> { __m128i reg; - explicit FP16Vec8(const void *ptr) - : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} + explicit FP16Vec8(const void* ptr) + : reg((__m128i)_mm_loadu_si128((__m128i*)ptr)) {} - explicit FP16Vec8(const FP32Vec8 &); + explicit FP16Vec8(const FP32Vec8&); - void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m128i*>(ptr) = reg; } }; struct FP16Vec16 : public Vec<FP16Vec16> { @@ -68,12 +69,12 @@ struct FP16Vec16 : public Vec<FP16Vec16> { __m256i reg; - explicit FP16Vec16(const void *ptr) - : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} + explicit FP16Vec16(const void* ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} - explicit FP16Vec16(const FP32Vec16 &); + explicit FP16Vec16(const FP32Vec16&); - void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } void save(void* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -87,12 +88,12 @@ struct BF16Vec8 : public Vec<BF16Vec8> { __m128i reg; - explicit BF16Vec8(const void *ptr) - : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} + explicit BF16Vec8(const void* ptr) + : reg((__m128i)_mm_loadu_si128((__m128i*)ptr)) {} - explicit BF16Vec8(const FP32Vec8 &); + explicit BF16Vec8(const FP32Vec8&); - void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m128i*>(ptr) = reg; } }; struct BF16Vec16 : public Vec<BF16Vec16> { @@ -100,12 +101,12 @@ struct BF16Vec16 : public Vec<BF16Vec16> { __m256i reg; - explicit BF16Vec16(const void *ptr) - : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} + explicit BF16Vec16(const void* ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} - explicit BF16Vec16(const FP32Vec16 &); + explicit BF16Vec16(const FP32Vec16&); - void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } void save(void* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -120,11 +121,11 @@ struct BF16Vec32 : public Vec<BF16Vec32> { __m512i reg; - explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} + explicit BF16Vec32(const void* ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} explicit BF16Vec32(__m512i data) : reg(data) {} - explicit BF16Vec32(BF16Vec8 &vec8_data) + explicit BF16Vec32(BF16Vec8& vec8_data) : reg((__m512i)_mm512_inserti32x4( _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( (__m128i)vec8_data.reg), @@ -132,7 +133,7 @@ struct BF16Vec32 : public Vec<BF16Vec32> { (__m128i)vec8_data.reg, 2), (__m128i)vec8_data.reg, 3)) {} - void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m512i*>(ptr) = reg; } }; #else struct BF16Vec32 : public Vec<BF16Vec32> { @@ -141,24 +142,24 @@ struct BF16Vec32 : public Vec<BF16Vec32> { __m256i reg_low; __m256i reg_high; - explicit BF16Vec32(const void *ptr) - : reg_low(_mm256_loadu_si256((__m256i const *)ptr)), - reg_high(_mm256_loadu_si256((__m256i const *)ptr + 1)) {} + explicit BF16Vec32(const void* ptr) + : reg_low(_mm256_loadu_si256((__m256i const*)ptr)), + reg_high(_mm256_loadu_si256((__m256i const*)ptr + 1)) {} - explicit BF16Vec32(__m256i low, __m256i high) : reg_low(low), - reg_high(high) {} + explicit BF16Vec32(__m256i low, __m256i high) + : reg_low(low), reg_high(high) {} - explicit BF16Vec32(BF16Vec8 &vec8_data) + explicit BF16Vec32(BF16Vec8& vec8_data) : reg_low((__m256i)_mm256_inserti32x4( - _mm256_castsi128_si256((__m128i)vec8_data.reg), - (__m128i)vec8_data.reg, 1)), + _mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)), reg_high((__m256i)_mm256_inserti32x4( - _mm256_castsi128_si256((__m128i)vec8_data.reg), - (__m128i)vec8_data.reg, 1)) {} + _mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)) {} - void save(void *ptr) const { - *reinterpret_cast<__m256i *>(ptr) = reg_low; - *reinterpret_cast<__m256i *>((__m256i *)ptr + 1) = reg_high; + void save(void* ptr) const { + *reinterpret_cast<__m256i*>(ptr) = reg_low; + *reinterpret_cast<__m256i*>((__m256i*)ptr + 1) = reg_high; } }; #endif @@ -176,11 +177,11 @@ struct FP32Vec4 : public Vec<FP32Vec4> { explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {} - explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {} + explicit FP32Vec4(const float* ptr) : reg(_mm_loadu_ps(ptr)) {} explicit FP32Vec4(__m128 data) : reg(data) {} - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {} }; struct FP32Vec8 : public Vec<FP32Vec8> { @@ -196,15 +197,15 @@ struct FP32Vec8 : public Vec<FP32Vec8> { explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {} - explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {} + explicit FP32Vec8(const float* ptr) : reg(_mm256_loadu_ps(ptr)) {} explicit FP32Vec8(__m256 data) : reg(data) {} - explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} + explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {} - explicit FP32Vec8(const FP16Vec8 &v) : reg(_mm256_cvtph_ps(v.reg)) {} + explicit FP32Vec8(const FP16Vec8& v) : reg(_mm256_cvtph_ps(v.reg)) {} - explicit FP32Vec8(const BF16Vec8 &v) + explicit FP32Vec8(const BF16Vec8& v) : reg(_mm256_castsi256_ps( _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {} @@ -212,7 +213,8 @@ struct FP32Vec8 : public Vec<FP32Vec8> { AliasReg ar; ar.reg = reg; float result = 0; - unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; }); + unroll_loop<int, VEC_ELEM_NUM>( + [&result, &ar](int i) { result += ar.values[i]; }); return result; } @@ -244,27 +246,27 @@ struct FP32Vec8 : public Vec<FP32Vec8> { erf(ar.values[1]), erf(ar.values[0]))); } - FP32Vec8 operator*(const FP32Vec8 &b) const { + FP32Vec8 operator*(const FP32Vec8& b) const { return FP32Vec8(_mm256_mul_ps(reg, b.reg)); } - FP32Vec8 operator+(const FP32Vec8 &b) const { + FP32Vec8 operator+(const FP32Vec8& b) const { return FP32Vec8(_mm256_add_ps(reg, b.reg)); } - FP32Vec8 operator-(const FP32Vec8 &b) const { + FP32Vec8 operator-(const FP32Vec8& b) const { return FP32Vec8(_mm256_sub_ps(reg, b.reg)); } - FP32Vec8 operator/(const FP32Vec8 &b) const { + FP32Vec8 operator/(const FP32Vec8& b) const { return FP32Vec8(_mm256_div_ps(reg, b.reg)); } - void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); } + void save(float* ptr) const { _mm256_storeu_ps(ptr, reg); } }; #ifdef __AVX512F__ -struct INT32Vec16: public Vec<INT32Vec16> { +struct INT32Vec16 : public Vec<INT32Vec16> { constexpr static int VEC_ELEM_NUM = 16; union AliasReg { __m512i reg; @@ -272,12 +274,11 @@ struct INT32Vec16 : public Vec<INT32Vec16> { }; __m512i reg; - - explicit INT32Vec16(const void* data_ptr) : reg(_mm512_loadu_epi32(data_ptr)) {} - void save(int32_t* ptr) const { - _mm512_storeu_epi32(ptr, reg); - } + explicit INT32Vec16(const void* data_ptr) + : reg(_mm512_loadu_epi32(data_ptr)) {} + + void save(int32_t* ptr) const { _mm512_storeu_epi32(ptr, reg); } void save(int32_t* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -301,11 +302,11 @@ struct FP32Vec16 : public Vec<FP32Vec16> { explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {} - explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {} + explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {} explicit FP32Vec16(__m512 data) : reg(data) {} - explicit FP32Vec16(const FP32Vec4 &data) + explicit FP32Vec16(const FP32Vec4& data) : reg((__m512)_mm512_inserti32x4( _mm512_inserti32x4( _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg), @@ -313,36 +314,37 @@ struct FP32Vec16 : public Vec<FP32Vec16> { (__m128i)data.reg, 2), (__m128i)data.reg, 3)) {} - explicit FP32Vec16(const FP32Vec8 &data) + explicit FP32Vec16(const FP32Vec8& data) : reg((__m512)_mm512_inserti32x8( _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {} - explicit FP32Vec16(const BF16Vec16 &v) + explicit FP32Vec16(const BF16Vec16& v) : reg(_mm512_castsi512_ps( _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {} - explicit FP32Vec16(const FP16Vec16 &v) : reg(_mm512_cvtph_ps(v.reg)) {} + explicit FP32Vec16(const FP16Vec16& v) : reg(_mm512_cvtph_ps(v.reg)) {} - explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - explicit FP32Vec16(const INT32Vec16 &v) - : reg(_mm512_cvt_roundepi32_ps(v.reg, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)) {} + explicit FP32Vec16(const INT32Vec16& v) + : reg(_mm512_cvt_roundepi32_ps( + v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} - FP32Vec16 operator*(const FP32Vec16 &b) const { + FP32Vec16 operator*(const FP32Vec16& b) const { return FP32Vec16(_mm512_mul_ps(reg, b.reg)); } - FP32Vec16 operator+(const FP32Vec16 &b) const { + FP32Vec16 operator+(const FP32Vec16& b) const { return FP32Vec16(_mm512_add_ps(reg, b.reg)); } - FP32Vec16 operator-(const FP32Vec16 &b) const { + FP32Vec16 operator-(const FP32Vec16& b) const { return FP32Vec16(_mm512_sub_ps(reg, b.reg)); } - FP32Vec16 operator/(const FP32Vec16 &b) const { + FP32Vec16 operator/(const FP32Vec16& b) const { return FP32Vec16(_mm512_div_ps(reg, b.reg)); } @@ -370,9 +372,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> { return FP32Vec16(_mm512_mask_min_ps(reg, mask, reg, b.reg)); } - FP32Vec16 abs() const { - return FP32Vec16(_mm512_abs_ps(reg)); - } + FP32Vec16 abs() const { return FP32Vec16(_mm512_abs_ps(reg)); } float reduce_sum() const { return _mm512_reduce_add_ps(reg); } @@ -380,14 +380,15 @@ struct FP32Vec16 : public Vec<FP32Vec16> { float reduce_min() const { return _mm512_reduce_min_ps(reg); } - template <int group_size> float reduce_sub_sum(int idx) { + template <int group_size> + float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size)); return _mm512_mask_reduce_add_ps(mask, reg); } - void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); } + void save(float* ptr) const { _mm512_storeu_ps(ptr, reg); } void save(float* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -407,32 +408,30 @@ struct FP32Vec16 : public Vec<FP32Vec16> { __m256 reg_low; __m256 reg_high; - explicit FP32Vec16(float v) : reg_low(_mm256_set1_ps(v)), - reg_high(_mm256_set1_ps(v)) {} + explicit FP32Vec16(float v) + : reg_low(_mm256_set1_ps(v)), reg_high(_mm256_set1_ps(v)) {} - explicit FP32Vec16() : reg_low(_mm256_set1_ps(0.0)), - reg_high(_mm256_set1_ps(0.0)) {} + explicit FP32Vec16() + : reg_low(_mm256_set1_ps(0.0)), reg_high(_mm256_set1_ps(0.0)) {} - explicit FP32Vec16(const float *ptr) : reg_low(_mm256_loadu_ps(ptr)), - reg_high(_mm256_loadu_ps(ptr + 8)) {} + explicit FP32Vec16(const float* ptr) + : reg_low(_mm256_loadu_ps(ptr)), reg_high(_mm256_loadu_ps(ptr + 8)) {} explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {} - explicit FP32Vec16(const FP32Vec16 &data) : reg_low(data.reg_low), - reg_high(data.reg_high) {} + explicit FP32Vec16(const FP32Vec16& data) + : reg_low(data.reg_low), reg_high(data.reg_high) {} - explicit FP32Vec16(const FP32Vec4 &data) + explicit FP32Vec16(const FP32Vec4& data) : reg_low((__m256)_mm256_inserti128_si256( - _mm256_castsi128_si256((__m128i)data.reg), - (__m128i)data.reg, 1)), + _mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg, 1)), reg_high((__m256)_mm256_inserti128_si256( + _mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg, 1)) {} - explicit FP32Vec16(const FP32Vec8 &data) + explicit FP32Vec16(const FP32Vec8& data) : reg_low(data.reg), reg_high(data.reg) {} - explicit FP32Vec16(const FP16Vec16 &v) { + explicit FP32Vec16(const FP16Vec16& v) { __m128i low = _mm256_extractf128_si256(v.reg, 0); __m128i high = _mm256_extractf128_si256(v.reg, 1); @@ -440,9 +439,9 @@ struct FP32Vec16 : public Vec<FP32Vec16> { reg_high = _mm256_cvtph_ps(high); } - explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - explicit FP32Vec16(const BF16Vec16 &v) { + explicit FP32Vec16(const BF16Vec16& v) { __m128i low = _mm256_extractf128_si256(v.reg, 0); __m128i high = _mm256_extractf128_si256(v.reg, 1); @@ -456,24 +455,24 @@ struct FP32Vec16 : public Vec<FP32Vec16> { reg_high = _mm256_castsi256_ps(v_high_shifted); } - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - FP32Vec16 operator*(const FP32Vec16 &b) const { + FP32Vec16 operator*(const FP32Vec16& b) const { return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low), _mm256_mul_ps(reg_high, b.reg_high)); } - FP32Vec16 operator+(const FP32Vec16 &b) const { + FP32Vec16 operator+(const FP32Vec16& b) const { return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low), _mm256_add_ps(reg_high, b.reg_high)); } - FP32Vec16 operator-(const FP32Vec16 &b) const { + FP32Vec16 operator-(const FP32Vec16& b) const { return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low), _mm256_sub_ps(reg_high, b.reg_high)); } - FP32Vec16 operator/(const FP32Vec16 &b) const { + FP32Vec16 operator/(const FP32Vec16& b) const { return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low), _mm256_div_ps(reg_high, b.reg_high)); } @@ -484,7 +483,8 @@ struct FP32Vec16 : public Vec<FP32Vec16> { return low.reduce_sum() + high.reduce_sum(); } - template <int group_size> float reduce_sub_sum(int idx) { + template <int group_size> + float reduce_sub_sum(int idx) { float sum = 0.0; static_assert(VEC_ELEM_NUM % group_size == 0); constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); @@ -507,7 +507,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> { return sum; } - void save(float *ptr) const { + void save(float* ptr) const { _mm256_storeu_ps(ptr, reg_low); _mm256_storeu_ps(ptr + 8, reg_high); } @@ -515,7 +515,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> { #endif #ifdef __AVX512F__ -struct INT8Vec16: public Vec<INT8Vec16> { +struct INT8Vec16 : public Vec<INT8Vec16> { constexpr static int VEC_ELEM_NUM = 16; union AliasReg { __m128i reg; @@ -523,14 +523,12 @@ struct INT8Vec16 : public Vec<INT8Vec16> { }; __m128i reg; - - explicit INT8Vec16(const FP32Vec16& vec) : reg( - _mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32(vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) - ) {} - void save(int8_t* ptr) const { - _mm_storeu_epi8(ptr, reg); - } + explicit INT8Vec16(const FP32Vec16& vec) + : reg(_mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32( + vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))) {} + + void save(int8_t* ptr) const { _mm_storeu_epi8(ptr, reg); } void save(int8_t* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -540,71 +538,92 @@ struct INT8Vec16 : public Vec<INT8Vec16> { }; #endif -template <typename T> struct VecType { using vec_type = void; }; +template <typename T> +struct VecType { + using vec_type = void; +}; -template <typename T> using vec_t = typename VecType<T>::vec_type; +template <typename T> +using vec_t = typename VecType<T>::vec_type; -template <> struct VecType<float> { using vec_type = FP32Vec8; }; +template <> +struct VecType<float> { + using vec_type = FP32Vec8; +}; -template <> struct VecType<c10::Half> { using vec_type = FP16Vec8; }; +template <> +struct VecType<c10::Half> { + using vec_type = FP16Vec8; +}; -template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; }; +template <> +struct VecType<c10::BFloat16> { + using vec_type = BF16Vec8; +}; -template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; } +template <typename T> +void storeFP32(float v, T* ptr) { + *ptr = v; +} -inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { acc = acc + a * b; } -template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) { - *reinterpret_cast<unsigned short *>(ptr) = +template <> +inline void storeFP32<c10::Half>(float v, c10::Half* ptr) { + *reinterpret_cast<unsigned short*>(ptr) = _cvtss_sh(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } -inline FP16Vec8::FP16Vec8(const FP32Vec8 &v) +inline FP16Vec8::FP16Vec8(const FP32Vec8& v) : reg(_mm256_cvtps_ph(v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} #ifdef __AVX512F__ -inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) : reg(_mm512_cvtps_ph(v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} #else -inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) - : reg(_mm256_insertf128_si256(_mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg), FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {} +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) + : reg(_mm256_insertf128_si256( + _mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg), + FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {} #endif #ifdef __AVX512BF16__ -template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) { - *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v); +template <> +inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) { + *reinterpret_cast<__bfloat16*>(ptr) = _mm_cvtness_sbh(v); } -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {} -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {} -inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { +inline void fma(FP32Vec16& acc, BF16Vec32& a, BF16Vec32& b) { acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg); } #else -template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) { - c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = - reinterpret_cast<c10::BFloat16 *>(&v); +template <> +inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) { + c10::BFloat16 __attribute__((__may_alias__))* v_ptr = + reinterpret_cast<c10::BFloat16*>(&v); *ptr = *(v_ptr + 1); } -#ifdef __AVX512F__ -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) + #ifdef __AVX512F__ +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg(_mm256_cvtepi32_epi16( _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {} -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) : reg(_mm512_cvtepi32_epi16( _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {} -#else -namespace{ + #else +namespace { __m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) { __m256i ai = _mm256_castps_si256(a); ai = _mm256_srli_epi32(ai, 16); @@ -612,21 +631,21 @@ __m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) { ai = _mm256_permute4x64_epi64(ai, 0b00111001); return _mm256_extracti128_si256(ai, 0); } -} +} // namespace -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {} -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { BF16Vec8 low
= BF16Vec8(FP32Vec8(v.reg_low)); BF16Vec8 high = BF16Vec8(FP32Vec8(v.reg_high)); reg = _mm256_insertf128_si256(_mm256_castsi128_si256(low.reg), high.reg, 1); } -#endif // __AVX512F__ -#endif // __AVX512BF16__ + #endif // __AVX512F__ +#endif // __AVX512BF16__ -inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); } +inline void prefetch(const void* addr) { _mm_prefetch(addr, _MM_HINT_T1); } -}; // namespace vec_op +}; // namespace vec_op #endif diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp index 85e359aa57113..07c9e46c27b06 100644 --- a/csrc/cutlass_extensions/common.hpp +++ b/csrc/cutlass_extensions/common.hpp @@ -27,8 +27,7 @@ inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { int max_shared_mem_per_block_opt_in = 0; cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, - cudaDevAttrMaxSharedMemoryPerBlockOptin, - device); + cudaDevAttrMaxSharedMemoryPerBlockOptin, device); return max_shared_mem_per_block_opt_in; } diff --git a/csrc/ops.h b/csrc/ops.h index 9efd9b0c24700..5a194a0dd3654 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -86,6 +86,8 @@ void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query, void silu_and_mul(torch::Tensor& out, torch::Tensor& input); +void mul_and_silu(torch::Tensor& out, torch::Tensor& input); + void gelu_and_mul(torch::Tensor& out, torch::Tensor& input); void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input); diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu index bd184ee22682e..c3902f4c2a163 100644 --- a/csrc/prepare_inputs/advance_step.cu +++ b/csrc/prepare_inputs/advance_step.cu @@ -95,6 +95,16 @@ __global__ void advance_step_flashinfer_kernel( long* input_positions_ptr, int* seq_lens_ptr, long* slot_mapping_ptr, int const* block_tables_ptr, int64_t const block_tables_stride, int* paged_kv_last_page_len_ptr, int* block_table_bound_ptr) { + int const n_pad = num_seqs - num_queries; + if (n_pad && blockIdx.x == 0) { + // Handle cuda graph padding + int const offset = num_queries; + for (int i = threadIdx.x; i < n_pad; i += blockDim.x) { + input_tokens_ptr[offset + i] = 0; + input_positions_ptr[offset + i] = 0; + slot_mapping_ptr[offset + i] = -1; + } + } int num_query_blocks = div_ceil(num_queries, num_threads); if (blockIdx.x < num_query_blocks) { diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 956258c1001d3..fb53d122487d3 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -55,6 +55,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()"); ops.impl("silu_and_mul", torch::kCUDA, &silu_and_mul); + ops.def("mul_and_silu(Tensor! out, Tensor input) -> ()"); + ops.impl("mul_and_silu", torch::kCUDA, &mul_and_silu); + // Activation function used in GeGLU with `none` approximation. ops.def("gelu_and_mul(Tensor! out, Tensor input) -> ()"); ops.impl("gelu_and_mul", torch::kCUDA, &gelu_and_mul); diff --git a/docs/README.md b/docs/README.md index 46488c9bb0b92..1a44c1341f4fb 100644 --- a/docs/README.md +++ b/docs/README.md @@ -16,4 +16,5 @@ make html ```bash python -m http.server -d build/html/ ``` + Launch your browser and open localhost:8000. 
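Editor's note on the new `mul_and_silu` op: the hunks above only add its declaration in `csrc/ops.h` and its registration in `csrc/torch_bindings.cpp`; the CUDA kernel itself is elsewhere in the PR. As a reading aid, here is a minimal PyTorch reference of what the op is presumably expected to compute, assuming it mirrors `silu_and_mul`'s `(..., 2 * d) -> (..., d)` layout but applies the activation to the second half of the input instead of the first. The operand order is an assumption, not something shown in this diff.

```python
import torch
import torch.nn.functional as F

def mul_and_silu_ref(x: torch.Tensor) -> torch.Tensor:
    # Assumed semantics: out = x[..., :d] * silu(x[..., d:]), the mirror
    # image of silu_and_mul, which computes silu(x[..., :d]) * x[..., d:].
    d = x.shape[-1] // 2
    return x[..., :d] * F.silu(x[..., d:])

# The registered op writes into a preallocated output, matching the
# "mul_and_silu(Tensor! out, Tensor input) -> ()" schema bound above:
#   out = torch.empty(x.shape[:-1] + (x.shape[-1] // 2,),
#                     device=x.device, dtype=x.dtype)
#   torch.ops._C.mul_and_silu(out, x)
```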
diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 64cf6ef8fc19d..8217bc3ba3ded 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -3,6 +3,7 @@ sphinx-book-theme==1.0.1 sphinx-copybutton==0.5.2 myst-parser==3.0.1 sphinx-argparse==0.4.0 +sphinx-design==0.6.1 sphinx-togglebutton==0.3.2 msgspec cloudpickle diff --git a/docs/source/api/model/index.md b/docs/source/api/model/index.md index b8437e3c3517a..113792147be7c 100644 --- a/docs/source/api/model/index.md +++ b/docs/source/api/model/index.md @@ -9,4 +9,3 @@ interfaces_base interfaces adapters ``` - diff --git a/docs/source/api/multimodal/index.md b/docs/source/api/multimodal/index.md index 51e24795a34cf..14efdb506d76f 100644 --- a/docs/source/api/multimodal/index.md +++ b/docs/source/api/multimodal/index.md @@ -7,7 +7,7 @@ vLLM provides experimental support for multi-modal models through the {mod}`vllm Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models) via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. -Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs). +Looking to add your own multi-modal model? Please follow the instructions listed [here](#supports-multimodal). ## Module Contents diff --git a/docs/source/api/multimodal/inputs.md b/docs/source/api/multimodal/inputs.md index 3d89666113229..76b2fb95a5009 100644 --- a/docs/source/api/multimodal/inputs.md +++ b/docs/source/api/multimodal/inputs.md @@ -3,7 +3,7 @@ ## User-facing inputs ```{eval-rst} -.. autodata:: vllm.multimodal.MultiModalDataDict +.. autodata:: vllm.multimodal.inputs.MultiModalDataDict ``` ## Internal data structures diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md index 9d2af4c13b088..fb93e65673dff 100644 --- a/docs/source/community/sponsors.md +++ b/docs/source/community/sponsors.md @@ -6,6 +6,7 @@ vLLM is a community project. Our compute resources for development and testing a Cash Donations: + - a16z - Dropbox - Sequoia Capital @@ -13,6 +14,7 @@ Cash Donations: - ZhenFund Compute Resources: + - AMD - Anyscale - AWS diff --git a/docs/source/conf.py b/docs/source/conf.py index 1ce11fe057071..7aa52db092e36 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -43,6 +43,7 @@ extensions = [ "sphinx.ext.autosummary", "myst_parser", "sphinxarg.ext", + "sphinx_design", "sphinx_togglebutton", ] myst_enable_extensions = [ @@ -55,7 +56,7 @@ templates_path = ['_templates'] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns: List[str] = ["**/*.template.md"] +exclude_patterns: List[str] = ["**/*.template.md", "**/*.inc.md"] # Exclude the prompt "$" when copying code copybutton_prompt_text = r"\$ " diff --git a/docs/source/contributing/model/basic.md b/docs/source/contributing/model/basic.md index 002808ac5fbbd..b9b92fd027f6e 100644 --- a/docs/source/contributing/model/basic.md +++ b/docs/source/contributing/model/basic.md @@ -1,6 +1,6 @@ (new-model-basic)= -# Basic Implementation +# Implementing a Basic Model This guide walks you through the steps to implement a basic vLLM model. @@ -57,7 +57,17 @@ class MyModelForCausalLM(nn.Module): ### Computation Code -Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. 
Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. +- Add a `get_input_embeddings` method inside `MyModel` module that returns the text embeddings given `input_ids`. This is equivalent to directly calling the text embedding layer, but provides a unified interface in case `MyModel` is used within a composite multimodal model. + +```python +class MyModel(nn.Module): + ... + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + ... +``` + +- Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. ```python def forward( diff --git a/docs/source/contributing/model/index.md b/docs/source/contributing/model/index.md index a2d601c83cf47..fe018b61b08cf 100644 --- a/docs/source/contributing/model/index.md +++ b/docs/source/contributing/model/index.md @@ -2,7 +2,7 @@ # Adding a New Model -This section provides more information on how to integrate a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM. +This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM. ```{toctree} :caption: Contents @@ -10,6 +10,7 @@ This section provides more information on how to integrate a [HuggingFace Transf basic registration +tests multimodal ``` diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md index e5dcd1223b361..e5fd9a2877ceb 100644 --- a/docs/source/contributing/model/multimodal.md +++ b/docs/source/contributing/model/multimodal.md @@ -1,6 +1,6 @@ -(enabling-multimodal-inputs)= +(supports-multimodal)= -# Enabling Multimodal Inputs +# Multi-Modal Support This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs](#multimodal-inputs). @@ -9,7 +9,78 @@ This document walks you through the steps to extend a basic model so that it acc It is assumed that you have already implemented the model in vLLM according to [these steps](#new-model-basic). Further update the model as follows: -- Implement the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. +- Reserve a keyword parameter in {meth}`~torch.nn.Module.forward` for each input tensor that corresponds to a multi-modal input, as shown in the following example: + + ```diff + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + + pixel_values: torch.Tensor, + ) -> SamplerOutput: + ``` + + More conveniently, you can simply pass `**kwargs` to the {meth}`~torch.nn.Module.forward` method and retrieve the keyword parameters for multimodal inputs from it. + +- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings` that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs. + + ```python + class YourModelForImage2Seq(nn.Module): + ... 
+ + def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor: + + assert self.vision_encoder is not None + image_features = self.vision_encoder(image_input) + return self.multi_modal_projector(image_features) + + def get_multimodal_embeddings(self, **kwargs: object) -> Optional[NestedTensors]: + + # Validate the multimodal input keyword arguments + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + + # Run multimodal inputs through encoder and projector + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + ``` + + ```{important} + The returned `multimodal_embeddings` must be either a **3D {class}`torch.Tensor`** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D {class}`torch.Tensor`'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g., image) of the request. + ``` + +- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings` to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings. + + ```python + from .utils import merge_multimodal_embeddings + + class YourModelForImage2Seq(nn.Module): + ... + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + + # `get_input_embeddings` should already be implemented for the language + # model as one of the requirements of basic vLLM model implementation. + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + multimodal_embeddings=multimodal_embeddings, + placeholder_token_id=self.config.image_token_index) + + return inputs_embeds + ``` + +- Once the above steps are done, update the model class with the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. ```diff + from vllm.model_executor.models.interfaces import SupportsMultiModal @@ -23,117 +94,359 @@ Further update the model as follows: Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples. ``` -- If you haven't already done so, reserve a keyword parameter in {meth}`~torch.nn.Module.forward` - for each input tensor that corresponds to a multi-modal input, as shown in the following example: +## 2. Specify processing information - ```diff - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - + pixel_values: torch.Tensor, - ) -> SamplerOutput: - ``` +Next, create a subclass of {class}`~vllm.multimodal.processing.BaseProcessingInfo` +to provide basic information related to HF processing. -## 2. Register input mappers +### Maximum number of input items -For each modality type that the model accepts as input, decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_input_mapper <vllm.multimodal.MultiModalRegistry.register_input_mapper>`. -This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in {meth}`~torch.nn.Module.forward`.
+You need to override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_supported_mm_limits` +to return the maximum number of input items for each modality supported by the model. + +For example, if the model supports any number of images but only one video per prompt: + +```python +def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": 1} +``` + +### Maximum number of placeholder feature tokens + +Also, override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item` +to return the maximum number of placeholder feature tokens per input item for each modality. + +When calling the model, the output embeddings from the visual encoder are assigned to the input positions +containing placeholder feature tokens. Therefore, the number of placeholder feature tokens should be equal +to the size of the output embeddings. + +::::{tab-set} +:::{tab-item} Basic example: LLaVA +:sync: llava + +Looking at the code of HF's `LlavaForConditionalGeneration`: + +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544 +n_image_tokens = (input_ids == self.config.image_token_index).sum().item() +n_image_features = image_features.shape[0] * image_features.shape[1] + +if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) +special_image_mask = ( + (input_ids == self.config.image_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) +) +image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) +inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) +``` + +The number of placeholder feature tokens per image is `image_features.shape[1]`. +`image_features` is calculated inside the `get_image_features` method: + +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300 +image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) + +selected_image_feature = image_outputs.hidden_states[vision_feature_layer] +if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] +elif vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature +else: + raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") +image_features = self.multi_modal_projector(selected_image_feature) +return image_features +``` + +We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower +(`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model). +Moreover, we only need the sequence length (the second dimension of the tensor) to get `image_features.shape[1]`. +The sequence length is determined by the initial hidden states in `CLIPVisionTransformer` since the attention +mechanism doesn't change the sequence length of the output hidden states. 
+
+```python
+# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L1094-L1102
+hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+hidden_states = self.pre_layrnorm(hidden_states)
+
+encoder_outputs = self.encoder(
+    inputs_embeds=hidden_states,
+    output_attentions=output_attentions,
+    output_hidden_states=output_hidden_states,
+    return_dict=return_dict,
+)
+```
+
+To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`:
+
+```python
+# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257
+target_dtype = self.patch_embedding.weight.dtype
+patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
+patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+
+class_embeds = self.class_embedding.expand(batch_size, 1, -1)
+embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+if interpolate_pos_encoding:
+    embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+else:
+    embeddings = embeddings + self.position_embedding(self.position_ids)
+return embeddings
+```
+
+We can infer that `embeddings.shape[1] == self.num_positions`, where
+
+```python
+# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L195-L196
+self.num_patches = (self.image_size // self.patch_size) ** 2
+self.num_positions = self.num_patches + 1
+```
+
+Overall, the number of placeholder feature tokens for an image can be calculated as:
+
+```python
+def get_num_image_tokens(
+    self,
+    *,
+    image_width: int,
+    image_height: int,
+) -> int:
+    hf_config = self.get_hf_config()
+    hf_processor = self.get_hf_processor()
+
+    image_size = hf_config.vision_config.image_size
+    patch_size = hf_config.vision_config.patch_size
+
+    num_image_tokens = (image_size // patch_size) ** 2 + 1
+    if hf_processor.vision_feature_select_strategy == "default":
+        num_image_tokens -= 1
+
+    return num_image_tokens
+```
+
+Notice that the number of image tokens doesn't depend on the image width and height.
+So, we can calculate the maximum number of image tokens using any image size:
+
+```python
+def get_image_size_with_most_features(self) -> ImageSize:
+    hf_config = self.get_hf_config()
+    width = height = hf_config.vision_config.image_size
+    return ImageSize(width=width, height=height)
+
+def get_max_image_tokens(self) -> int:
+    target_width, target_height = self.get_image_size_with_most_features()
+
+    return self.get_num_image_tokens(
+        image_width=target_width,
+        image_height=target_height,
+    )
+```
+
+And thus, we can override the method as:
+
+```python
+def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+    return {"image": self.get_max_image_tokens()}
+```
+
+```{note}
+Our [actual code](gh-file:vllm/model_executor/models/llava.py) is more abstracted to support vision encoders other than CLIP.
+```
+
+:::
+::::
+
+## 3. Specify dummy inputs
+
+Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for
+HF processing as well as memory profiling.
+
+### For memory profiling
+
+Override the abstract method {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs`
+to construct dummy inputs for memory profiling. This dummy input should result in the worst-case memory usage of
+the model so that vLLM can reserve the correct amount of memory for it.
+
+Assuming that the memory usage increases with the number of tokens, the dummy input can be constructed based
+on the code for {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item`.
+
+::::{tab-set}
+:::{tab-item} Basic example: LLaVA
+:sync: llava
+Making use of the `get_image_size_with_most_features` method implemented in the previous section:
+
+```python
+def get_dummy_processor_inputs(
+    self,
+    seq_len: int,
+    mm_counts: Mapping[str, int],
+) -> ProcessorInputs:
+    num_images = mm_counts.get("image", 0)
+
+    processor = self.info.get_hf_processor()
+    image_token = processor.image_token
+
+    target_width, target_height = self.info.get_image_size_with_most_features()
+
+    mm_data = {
+        "image":
+        self._get_dummy_images(width=target_width,
+                               height=target_height,
+                               num_images=num_images)
+    }
+
+    return ProcessorInputs(
+        prompt_text=image_token * num_images,
+        mm_data=mm_data,
+    )
+```
+
+:::
+::::
+
+## 4. Specify processing details
+
+Afterwards, create a subclass of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor`
+to fill in the missing details about HF processing.
+
+```{seealso}
+[Multi-Modal Data Processing](#mm-processing)
+```
+
+### Multi-modal fields
+
+Override {class}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` to
+return a schema of the tensors outputted by the HF processor that are related to the input multi-modal items.
+
+::::{tab-set}
+:::{tab-item} Basic example: LLaVA
+:sync: llava
+
+Looking at the model's `forward` method:
+
+```python
+# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L387-L404
+def forward(
+    self,
+    input_ids: torch.LongTensor = None,
+    pixel_values: torch.FloatTensor = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_values: Optional[List[torch.FloatTensor]] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    vision_feature_layer: Optional[int] = None,
+    vision_feature_select_strategy: Optional[str] = None,
+    labels: Optional[torch.LongTensor] = None,
+    use_cache: Optional[bool] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+    cache_position: Optional[torch.LongTensor] = None,
+    num_logits_to_keep: int = 0,
+) -> Union[Tuple, LlavaCausalLMOutputWithPast]:
+```
+
+The only related keyword argument is `pixel_values`, which directly corresponds to input images.
+The shape of `pixel_values` is `(N, C, H, W)` where `N` is the number of images.
+So, we override the method as follows:
+
+```python
+def _get_mm_fields_config(
+    self,
+    hf_inputs: BatchFeature,
+    hf_processor_mm_kwargs: Mapping[str, object],
+) -> Mapping[str, MultiModalFieldConfig]:
+    return dict(
+        pixel_values=MultiModalFieldConfig.batched("image"),
+    )
+```
+
+```{note}
+Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports
+pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument.
+```
+
+:::
+::::
+
+### Prompt replacements
+
+Override {class}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements` to
+return a list of {class}`~vllm.multimodal.processing.PromptReplacement` instances.
+
+Each {class}`~vllm.multimodal.processing.PromptReplacement` instance specifies a find-and-replace
+operation performed by the HF processor.
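+
+Before looking at a concrete model, it may help to see what such a find-and-replace does to a
+token sequence. The following standalone sketch is illustrative only (the token ID and feature
+size are made up); the real logic lives inside the processor:
+
+```python
+IMAGE_TOKEN_ID = 32000     # hypothetical placeholder token ID
+NUM_FEATURE_TOKENS = 576   # hypothetical feature size per image
+
+def expand_placeholders(prompt_token_ids: list[int]) -> list[int]:
+    """Replace each placeholder token with its feature placeholder tokens."""
+    new_ids: list[int] = []
+    for tok in prompt_token_ids:
+        if tok == IMAGE_TOKEN_ID:
+            # target=[IMAGE_TOKEN_ID] is found; the replacement expands it
+            new_ids.extend([IMAGE_TOKEN_ID] * NUM_FEATURE_TOKENS)
+        else:
+            new_ids.append(tok)
+    return new_ids
+```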
+ +::::{tab-set} +:::{tab-item} Basic example: LLaVA +:sync: llava + +Looking at HF's `LlavaProcessor`: + +```python +# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/processing_llava.py#L167-L170 +prompt_strings = [] +for sample in text: + sample = sample.replace(self.image_token, self.image_token * num_image_tokens) + prompt_strings.append(sample) +``` + +It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`). +Based on this, we override the method as follows: + +```python +def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, +) -> list[PromptReplacement]: + hf_config = self.info.get_hf_config() + image_token_id = hf_config.image_token_index + + def get_replacement(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + + image_size = images.get_image_size(item_idx) + num_image_tokens = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + ) + + return [image_token_id] * num_image_tokens + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=get_replacement, + ), + ] +``` + +::: +:::: + +## 5. Register processor-related classes + +After you have defined {class}`~vllm.multimodal.processing.BaseProcessingInfo` (Step 2), +{class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` (Step 3), +and {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` (Step 4), +decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_processor ` +to register them to the multi-modal registry: ```diff from vllm.model_executor.models.interfaces import SupportsMultiModal + from vllm.multimodal import MULTIMODAL_REGISTRY -+ @MULTIMODAL_REGISTRY.register_image_input_mapper() ++ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor, ++ info=YourProcessingInfo, ++ dummy_inputs=YourDummyInputsBuilder) class YourModelForImage2Seq(nn.Module, SupportsMultiModal): ``` - -A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function. - -```{seealso} -[Input Processing Pipeline](#input-processing-pipeline) -``` - -## 3. Register maximum number of multi-modal tokens - -For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data item -and register it via {meth}`INPUT_REGISTRY.register_dummy_data `. - -```diff - from vllm.inputs import INPUT_REGISTRY - from vllm.model_executor.models.interfaces import SupportsMultiModal - from vllm.multimodal import MULTIMODAL_REGISTRY - - @MULTIMODAL_REGISTRY.register_image_input_mapper() -+ @MULTIMODAL_REGISTRY.register_max_image_tokens() - @INPUT_REGISTRY.register_dummy_data() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): -``` - -Here are some examples: - -- Image inputs (static feature size): [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py) -- Image inputs (dynamic feature size): [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py) - -```{seealso} -[Input Processing Pipeline](#input-processing-pipeline) -``` - -## 4. (Optional) Register dummy data - -During startup, dummy data is passed to the vLLM model to allocate memory. This only consists of text input by default, which may not be applicable to multi-modal models. 
-In such cases, you can define your own dummy data by registering a factory method via {meth}`INPUT_REGISTRY.register_dummy_data `. - -```diff - from vllm.inputs import INPUT_REGISTRY - from vllm.model_executor.models.interfaces import SupportsMultiModal - from vllm.multimodal import MULTIMODAL_REGISTRY - - @MULTIMODAL_REGISTRY.register_image_input_mapper() - @MULTIMODAL_REGISTRY.register_max_image_tokens() -+ @INPUT_REGISTRY.register_dummy_data() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): -``` - -```{note} -The dummy data should have the maximum possible number of multi-modal tokens, as described in the previous step. -``` - -Here are some examples: - -- Image inputs (static feature size): [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py) -- Image inputs (dynamic feature size): [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py) - -```{seealso} -[Input Processing Pipeline](#input-processing-pipeline) -``` - -## 5. (Optional) Register input processor - -Sometimes, there is a need to process inputs at the {class}`~vllm.LLMEngine` level before they are passed to the model executor. -This is often due to the fact that unlike implementations in HuggingFace Transformers, the reshaping and/or expansion of multi-modal embeddings needs to take place outside model's {meth}`~torch.nn.Module.forward` call. -You can register input processors via {meth}`INPUT_REGISTRY.register_input_processor `. - -```diff - from vllm.inputs import INPUT_REGISTRY - from vllm.model_executor.models.interfaces import SupportsMultiModal - from vllm.multimodal import MULTIMODAL_REGISTRY - - @MULTIMODAL_REGISTRY.register_image_input_mapper() - @MULTIMODAL_REGISTRY.register_max_image_tokens() - @INPUT_REGISTRY.register_dummy_data() -+ @INPUT_REGISTRY.register_input_processor() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): -``` - -A common use case of input processors is inserting placeholder tokens to leverage the vLLM framework for attention mask generation. -Here are some examples: - -- Insert static number of image tokens: [LLaVA-1.5 Model](gh-file:vllm/model_executor/models/llava.py) -- Insert dynamic number of image tokens: [LLaVA-NeXT Model](gh-file:vllm/model_executor/models/llava_next.py) - -```{seealso} -[Input Processing Pipeline](#input-processing-pipeline) -``` diff --git a/docs/source/contributing/model/registration.md b/docs/source/contributing/model/registration.md index fe5aa94c52896..d6c9e4181dfee 100644 --- a/docs/source/contributing/model/registration.md +++ b/docs/source/contributing/model/registration.md @@ -1,6 +1,6 @@ (new-model-registration)= -# Model Registration +# Registering a Model to vLLM vLLM relies on a model registry to determine how to run each model. A list of pre-registered architectures can be found [here](#supported-models). @@ -15,7 +15,6 @@ This gives you the ability to modify the codebase and test your model. After you have implemented your model (see [tutorial](#new-model-basic)), put it into the directory. Then, add your model class to `_VLLM_MODELS` in so that it is automatically registered upon importing vLLM. -You should also include an example HuggingFace repository for this model in to run the unit tests. Finally, update our [list of supported models](#supported-models) to promote your model! 
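+
+For illustration, entries in `_VLLM_MODELS` map an architecture name (as it appears in the
+HF config's `architectures` field) to the module and class implementing it. A sketch of such
+an entry, with hypothetical names:
+
+```python
+# Sketch only; the module and class names here are placeholders.
+_VLLM_MODELS = {
+    # architecture name -> (module name, model class name)
+    "YourModelForCausalLM": ("your_model", "YourModelForCausalLM"),
+}
+```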
```{important}
@@ -48,7 +47,7 @@ ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM")
 
 ```{important}
 If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
-Read more about that [here](#enabling-multimodal-inputs).
+Read more about that [here](#supports-multimodal).
 ```
 
 ```{note}
diff --git a/docs/source/contributing/model/tests.md b/docs/source/contributing/model/tests.md
new file mode 100644
index 0000000000000..74c933b2f45da
--- /dev/null
+++ b/docs/source/contributing/model/tests.md
@@ -0,0 +1,63 @@
+(new-model-tests)=
+
+# Writing Unit Tests
+
+This page explains how to write unit tests to verify the implementation of your model.
+
+## Required Tests
+
+These tests are necessary to get your PR merged into the vLLM library.
+Without them, the CI for your PR will fail.
+
+### Model loading
+
+Include an example HuggingFace repository for your model in .
+This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM.
+
+```{important}
+The list of models in each section should be maintained in alphabetical order.
+```
+
+```{tip}
+If your model requires a development version of HF Transformers, you can set
+`min_transformers_version` to skip the test in CI until the model is released.
+```
+
+## Optional Tests
+
+These tests are optional to get your PR merged into the vLLM library.
+Passing these tests provides more confidence that your implementation is correct, and helps avoid future regressions.
+
+### Model correctness
+
+These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of .
+
+#### Generative models
+
+For [generative models](#generative-models), there are two levels of correctness tests, as defined in :
+
+- Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF.
+- Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa.
+
+#### Pooling models
+
+For [pooling models](#pooling-models), we simply check the cosine similarity, as defined in .
+
+(mm-processing-tests)=
+
+### Multi-modal processing
+
+#### Common tests
+
+Adding your model to  verifies that the following input combinations result in the same outputs:
+
+- Text + multi-modal data
+- Tokens + multi-modal data
+- Text + cached multi-modal data
+- Tokens + cached multi-modal data
+
+#### Model-specific tests
+
+You can add a new file under  to run tests that only apply to your model.
+
+For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in .
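+
+As a rough illustration of what a correctness check does, the snippet below compares greedy
+completions from HF Transformers and vLLM directly. It is a sketch only; the real tests use
+the shared runner fixtures and the comparison helpers mentioned above:
+
+```python
+# Sketch: compare greedy outputs of HF Transformers and vLLM on one prompt.
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from vllm import LLM, SamplingParams
+
+MODEL_ID = "facebook/opt-125m"  # illustrative small model
+PROMPT = "Hello, my name is"
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+hf_model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
+hf_ids = tokenizer(PROMPT, return_tensors="pt").input_ids
+hf_out = hf_model.generate(hf_ids, max_new_tokens=8, do_sample=False)
+hf_text = tokenizer.decode(hf_out[0][hf_ids.shape[1]:], skip_special_tokens=True)
+
+vllm_model = LLM(model=MODEL_ID)
+vllm_out = vllm_model.generate([PROMPT], SamplingParams(temperature=0.0, max_tokens=8))
+vllm_text = vllm_out[0].outputs[0].text
+
+# Exact-match level; logprob-based checks are more lenient.
+assert hf_text == vllm_text
+```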
diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md
index c960790f47a13..36cf8e7440eca 100644
--- a/docs/source/contributing/overview.md
+++ b/docs/source/contributing/overview.md
@@ -25,10 +25,12 @@ Check out the [building from source](#build-from-source) documentation for detai
 ```bash
 pip install -r requirements-dev.txt
 
-# linting and formatting
-bash format.sh
-# Static type checking
-mypy
+# Linting, formatting and static type checking
+pre-commit install
+
+# You can manually run pre-commit with
+pre-commit run --all-files
+
 # Unit tests
 pytest tests/
 ```
@@ -37,8 +39,6 @@ pytest tests/
 Currently, the repository is not fully checked by `mypy`.
 ```
 
-# Contribution Guidelines
-
 ## Issues
 
 If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
@@ -90,7 +90,8 @@ If the PR spans more than one category, please include all relevant prefixes.
 The PR needs to meet the following code quality standards:
 
 - We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html).
-- Pass all linter checks. Please use  to format your code.
+- Pass all linter checks. Please use `pre-commit` to format your code. See
+  the `pre-commit` documentation if `pre-commit` is new to you.
 - The code needs to be well-documented to ensure future contributors can easily understand the code.
 - Include sufficient tests to ensure the project stays correct and robust. This
diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md
index 97de40ff469f1..001db86bdf555 100644
--- a/docs/source/contributing/profiling/profiling_index.md
+++ b/docs/source/contributing/profiling/profiling_index.md
@@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve
 
 ### Offline Inference
 
-Refer to  for an example.
+Refer to  for an example.
 
 ### OpenAI Server
 
diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md
index 2df1aca27f1e6..438be47316f3b 100644
--- a/docs/source/deployment/docker.md
+++ b/docs/source/deployment/docker.md
@@ -2,6 +2,8 @@
 
 # Using Docker
 
+(deployment-docker-pre-built-image)=
+
 ## Use vLLM's Official Docker Image
 
 vLLM offers an official Docker image for deployment.
@@ -17,25 +19,32 @@ $ docker run --runtime nvidia --gpus all \
     --model mistralai/Mistral-7B-v0.1
 ```
 
+You can add any other engine arguments you need after the image tag (`vllm/vllm-openai:latest`).
+
 ```{note}
 You can either use the `ipc=host` flag or `--shm-size` flag to allow the
 container to access the host's shared memory. vLLM uses PyTorch, which uses shared
 memory to share data between processes under the hood, particularly for tensor parallel inference.
 ```
 
+(deployment-docker-build-image-from-source)=
+
 ## Building vLLM's Docker Image from Source
 
 You can build and run vLLM from source via the provided . To build vLLM:
 
 ```console
-$ # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
-$ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai
+# optionally specify: --build-arg max_jobs=8 --build-arg nvcc_threads=2
+DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai
 ```
 
 ```{note}
 By default vLLM will build for all GPU types for widest distribution. If you are just building for the
 current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""`
 for vLLM to find the current GPU type and build for that.
+
+If you are using Podman instead of Docker, you might need to disable SELinux labeling by
+adding `--security-opt label=disable` when running the `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184).
 ```
 
 ## Building for Arm64/aarch64
 
diff --git a/docs/source/deployment/frameworks/cerebrium.md b/docs/source/deployment/frameworks/cerebrium.md
index be018dfb75d7a..5787c4a407bfb 100644
--- a/docs/source/deployment/frameworks/cerebrium.md
+++ b/docs/source/deployment/frameworks/cerebrium.md
@@ -13,14 +13,14 @@ vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebr
 
 To install the Cerebrium client, run:
 
 ```console
-$ pip install cerebrium
-$ cerebrium login
+pip install cerebrium
+cerebrium login
 ```
 
 Next, create your Cerebrium project, run:
 
 ```console
-$ cerebrium init vllm-project
+cerebrium init vllm-project
 ```
 
 Next, to install the required packages, add the following to your cerebrium.toml:
@@ -58,10 +58,10 @@ def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95):
 Then, run the following code to deploy it to the cloud:
 
 ```console
-$ cerebrium deploy
+cerebrium deploy
 ```
 
-If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case` /run`)
+If successful, you should receive a curl command that you can use to call inference. Just remember to end the URL with the function name you are calling (in our case `/run`).
 
 ```python
 curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
diff --git a/docs/source/deployment/frameworks/dstack.md b/docs/source/deployment/frameworks/dstack.md
index 4142c1d9f1f60..b42a34125c6d7 100644
--- a/docs/source/deployment/frameworks/dstack.md
+++ b/docs/source/deployment/frameworks/dstack.md
@@ -13,16 +13,16 @@ vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/),
 
 To install dstack client, run:
 
 ```console
-$ pip install "dstack[all]
-$ dstack server
+pip install "dstack[all]"
+dstack server
 ```
 
 Next, to configure your dstack project, run:
 
 ```console
-$ mkdir -p vllm-dstack
-$ cd vllm-dstack
-$ dstack init
+mkdir -p vllm-dstack
+cd vllm-dstack
+dstack init
 ```
 
 Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:
diff --git a/docs/source/deployment/frameworks/skypilot.md b/docs/source/deployment/frameworks/skypilot.md
index 657e7f2bc72cc..051fc2f2a8d4e 100644
--- a/docs/source/deployment/frameworks/skypilot.md
+++ b/docs/source/deployment/frameworks/skypilot.md
@@ -334,12 +334,12 @@ run: |
 
 1. Start the chat web UI:
 
-```console
-sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm)
-```
+   ```console
+   sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm)
+   ```
 
2. Then, we can access the GUI at the returned gradio link:
 
-```console
-| INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live
-```
+   ```console
+   | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live
+   ```
 
diff --git a/docs/source/deployment/integrations/llamastack.md b/docs/source/deployment/integrations/llamastack.md
index 474d2bdfa9580..a6c3569637abf 100644
--- a/docs/source/deployment/integrations/llamastack.md
+++ b/docs/source/deployment/integrations/llamastack.md
@@ -7,7 +7,7 @@ vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-sta
 
 To install Llama Stack, run
 
 ```console
-$ pip install llama-stack -q
+pip install llama-stack -q
 ```
 
 ## Inference using OpenAI Compatible API
diff --git a/docs/source/deployment/k8s.md b/docs/source/deployment/k8s.md
index 760214e112fba..cbc95c20ff4b3 100644
--- a/docs/source/deployment/k8s.md
+++ b/docs/source/deployment/k8s.md
@@ -14,234 +14,235 @@ Before you begin, ensure that you have the following:
 
 ## Deployment Steps
 
-1. **Create a PVC , Secret and Deployment for vLLM**
+1. Create a PVC, Secret and Deployment for vLLM
 
-PVC is used to store the model cache and it is optional, you can use hostPath or other storage options
+   PVC is used to store the model cache; it is optional, and you can use hostPath or other storage options.
 
-```yaml
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: mistral-7b
-  namespace: default
-spec:
-  accessModes:
-  - ReadWriteOnce
-  resources:
-    requests:
-      storage: 50Gi
-  storageClassName: default
-  volumeMode: Filesystem
-```
-
-Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models
-
-```yaml
-apiVersion: v1
-kind: Secret
-metadata:
-  name: hf-token-secret
-  namespace: default
-type: Opaque
-stringData:
-  token: "REPLACE_WITH_TOKEN"
-```
-
-Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model.
-
-Here are two examples for using NVIDIA GPU and AMD GPU.
-
-- NVIDIA GPU
-
-```yaml
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: mistral-7b
-  namespace: default
-  labels:
-    app: mistral-7b
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: mistral-7b
-  template:
-    metadata:
-      labels:
-        app: mistral-7b
-    spec:
-      volumes:
-      - name: cache-volume
-        persistentVolumeClaim:
-          claimName: mistral-7b
-      # vLLM needs to access the host's shared memory for tensor parallel inference.
- - name: shm - emptyDir: - medium: Memory - sizeLimit: "2Gi" - containers: - - name: mistral-7b - image: vllm/vllm-openai:latest - command: ["/bin/sh", "-c"] - args: [ - "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" - ] - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - ports: - - containerPort: 8000 + ```yaml + apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: mistral-7b + namespace: default + spec: + accessModes: + - ReadWriteOnce resources: - limits: - cpu: "10" - memory: 20G - nvidia.com/gpu: "1" requests: - cpu: "2" - memory: 6G - nvidia.com/gpu: "1" - volumeMounts: - - mountPath: /root/.cache/huggingface - name: cache-volume - - name: shm - mountPath: /dev/shm - livenessProbe: - httpGet: - path: /health - port: 8000 - initialDelaySeconds: 60 - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 8000 - initialDelaySeconds: 60 - periodSeconds: 5 -``` + storage: 50Gi + storageClassName: default + volumeMode: Filesystem + ``` -- AMD GPU + Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models -You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X. + ```yaml + apiVersion: v1 + kind: Secret + metadata: + name: hf-token-secret + namespace: default + type: Opaque + stringData: + token: "REPLACE_WITH_TOKEN" + ``` -```yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: mistral-7b - namespace: default - labels: - app: mistral-7b -spec: - replicas: 1 - selector: - matchLabels: - app: mistral-7b - template: - metadata: - labels: - app: mistral-7b - spec: - volumes: - # PVC - - name: cache-volume - persistentVolumeClaim: - claimName: mistral-7b - # vLLM needs to access the host's shared memory for tensor parallel inference. - - name: shm - emptyDir: - medium: Memory - sizeLimit: "8Gi" - hostNetwork: true - hostIPC: true - containers: - - name: mistral-7b - image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4 - securityContext: - seccompProfile: - type: Unconfined - runAsGroup: 44 - capabilities: - add: - - SYS_PTRACE - command: ["/bin/sh", "-c"] - args: [ - "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" - ] - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token + Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model. + + Here are two examples for using NVIDIA GPU and AMD GPU. + + NVIDIA GPU: + + ```yaml + apiVersion: apps/v1 + kind: Deployment + metadata: + name: mistral-7b + namespace: default + labels: + app: mistral-7b + spec: + replicas: 1 + selector: + matchLabels: + app: mistral-7b + template: + metadata: + labels: + app: mistral-7b + spec: + volumes: + - name: cache-volume + persistentVolumeClaim: + claimName: mistral-7b + # vLLM needs to access the host's shared memory for tensor parallel inference. 
+ - name: shm + emptyDir: + medium: Memory + sizeLimit: "2Gi" + containers: + - name: mistral-7b + image: vllm/vllm-openai:latest + command: ["/bin/sh", "-c"] + args: [ + "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" + ] + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + ports: + - containerPort: 8000 + resources: + limits: + cpu: "10" + memory: 20G + nvidia.com/gpu: "1" + requests: + cpu: "2" + memory: 6G + nvidia.com/gpu: "1" + volumeMounts: + - mountPath: /root/.cache/huggingface + name: cache-volume + - name: shm + mountPath: /dev/shm + livenessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 5 + ``` + + AMD GPU: + + You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X. + + ```yaml + apiVersion: apps/v1 + kind: Deployment + metadata: + name: mistral-7b + namespace: default + labels: + app: mistral-7b + spec: + replicas: 1 + selector: + matchLabels: + app: mistral-7b + template: + metadata: + labels: + app: mistral-7b + spec: + volumes: + # PVC + - name: cache-volume + persistentVolumeClaim: + claimName: mistral-7b + # vLLM needs to access the host's shared memory for tensor parallel inference. + - name: shm + emptyDir: + medium: Memory + sizeLimit: "8Gi" + hostNetwork: true + hostIPC: true + containers: + - name: mistral-7b + image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4 + securityContext: + seccompProfile: + type: Unconfined + runAsGroup: 44 + capabilities: + add: + - SYS_PTRACE + command: ["/bin/sh", "-c"] + args: [ + "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" + ] + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + ports: + - containerPort: 8000 + resources: + limits: + cpu: "10" + memory: 20G + amd.com/gpu: "1" + requests: + cpu: "6" + memory: 6G + amd.com/gpu: "1" + volumeMounts: + - name: cache-volume + mountPath: /root/.cache/huggingface + - name: shm + mountPath: /dev/shm + ``` + + You can get the full example with steps and sample yaml files from . + +2. Create a Kubernetes Service for vLLM + + Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: + + ```yaml + apiVersion: v1 + kind: Service + metadata: + name: mistral-7b + namespace: default + spec: ports: - - containerPort: 8000 - resources: - limits: - cpu: "10" - memory: 20G - amd.com/gpu: "1" - requests: - cpu: "6" - memory: 6G - amd.com/gpu: "1" - volumeMounts: - - name: cache-volume - mountPath: /root/.cache/huggingface - - name: shm - mountPath: /dev/shm -``` -You can get the full example with steps and sample yaml files from . + - name: http-mistral-7b + port: 80 + protocol: TCP + targetPort: 8000 + # The label selector should match the deployment labels & it is useful for prefix caching feature + selector: + app: mistral-7b + sessionAffinity: None + type: ClusterIP + ``` -2. **Create a Kubernetes Service for vLLM** +3. 
Deploy and Test -Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: + Apply the deployment and service configurations using `kubectl apply -f `: -```yaml -apiVersion: v1 -kind: Service -metadata: - name: mistral-7b - namespace: default -spec: - ports: - - name: http-mistral-7b - port: 80 - protocol: TCP - targetPort: 8000 - # The label selector should match the deployment labels & it is useful for prefix caching feature - selector: - app: mistral-7b - sessionAffinity: None - type: ClusterIP -``` + ```console + kubectl apply -f deployment.yaml + kubectl apply -f service.yaml + ``` -3. **Deploy and Test** + To test the deployment, run the following `curl` command: -Apply the deployment and service configurations using `kubectl apply -f `: + ```console + curl http://mistral-7b.default.svc.cluster.local/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "mistralai/Mistral-7B-Instruct-v0.3", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0 + }' + ``` -```console -kubectl apply -f deployment.yaml -kubectl apply -f service.yaml -``` - -To test the deployment, run the following `curl` command: - -```console -curl http://mistral-7b.default.svc.cluster.local/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "mistralai/Mistral-7B-Instruct-v0.3", - "prompt": "San Francisco is a", - "max_tokens": 7, - "temperature": 0 - }' -``` - -If the service is correctly deployed, you should receive a response from the vLLM model. + If the service is correctly deployed, you should receive a response from the vLLM model. ## Conclusion diff --git a/docs/source/design/automatic_prefix_caching.md b/docs/source/design/automatic_prefix_caching.md index 4398536b2b4ad..3928e0c16568b 100644 --- a/docs/source/design/automatic_prefix_caching.md +++ b/docs/source/design/automatic_prefix_caching.md @@ -2,11 +2,11 @@ # Automatic Prefix Caching -The core idea of [PagedAttention](#design-paged-attention) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand. +The core idea of [PagedAttention](https://blog.vllm.ai/2023/06/20/vllm.html) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand. To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block. -``` +```text Block 1 Block 2 Block 3 [A gentle breeze stirred] [the leaves as children] [laughed in the distance] Block 1: |<--- block tokens ---->| @@ -14,19 +14,16 @@ Block 2: |<------- prefix ------>| |<--- block tokens --->| Block 3: |<------------------ prefix -------------------->| |<--- block tokens ---->| ``` - In the example above, the KV cache in the first block can be uniquely identified with the tokens “A gentle breeze stirred”. The third block can be uniquely identified with the tokens in the block “laughed in the distance”, along with the prefix tokens “A gentle breeze stirred the leaves as children”. 
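+
+This identification scheme can be sketched in a few lines of Python (illustrative only;
+vLLM hashes token IDs rather than words, and the real implementation differs):
+
+```python
+def block_hash(prefix_tokens: tuple[str, ...], block_tokens: tuple[str, ...]) -> int:
+    # A block is identified by its own tokens plus all tokens before it.
+    return hash(prefix_tokens + block_tokens)
+
+blocks = [
+    ("A", "gentle", "breeze", "stirred"),
+    ("the", "leaves", "as", "children"),
+    ("laughed", "in", "the", "distance"),
+]
+prefix: tuple[str, ...] = ()
+for block in blocks:
+    print(block_hash(prefix, block))  # equal prefix + block tokens -> equal hash
+    prefix += block
+```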
Therefore, we can build the following one-to-one mapping:
 
-```
+```text
 hash(prefix tokens + block tokens) <--> KV Block
 ```
 
 With this mapping, we can add another indirection in vLLM’s KV cache management. Previously, each sequence in vLLM maintained a mapping from their logical KV blocks to physical blocks. To achieve automatic caching of KV blocks, we map the logical KV blocks to their hash value and maintain a global hash table of all the physical blocks. In this way, all the KV blocks sharing the same hash value (e.g., shared prefix blocks across two requests) can be mapped to the same physical block and share the memory space.
 
-
 This design achieves automatic prefix caching without the need of maintaining a tree structure among the KV blocks. More specifically, all of the blocks are independent of each other and can be allocated and freed by itself, which enables us to manages the KV cache as ordinary caches in operating system.
 
-
 ## Generalized Caching Policy
 
 Keeping all the KV blocks in a hash table enables vLLM to cache KV blocks from earlier requests to save memory and accelerate the computation of future requests. For example, if a new request shares the system prompt with the previous request, the KV cache of the shared prompt can directly be used for the new request without recomputation. However, the total KV cache space is limited and we have to decide which KV blocks to keep or evict when the cache is full.
@@ -41,5 +38,5 @@ Note that this eviction policy effectively implements the exact policy as in [Ra
 
 However, the hash-based KV cache management gives us the flexibility to handle more complicated serving scenarios and implement more complicated eviction policies beyond the policy above:
 
-- Multi-LoRA serving. When serving requests for multiple LoRA adapters, we can simply let the hash of each KV block to also include the LoRA ID the request is querying for to enable caching for all adapters. In this way, we can jointly manage the KV blocks for different adapters, which simplifies the system implementation and improves the global cache hit rate and efficiency.
-- Multi-modal models. When the user input includes more than just discrete tokens, we can use different hashing methods to handle the caching of inputs of different modalities. For example, perceptual hashing for images to cache similar input images.
+* Multi-LoRA serving. When serving requests for multiple LoRA adapters, we can simply let the hash of each KV block also include the LoRA ID the request is querying for to enable caching for all adapters. In this way, we can jointly manage the KV blocks for different adapters, which simplifies the system implementation and improves the global cache hit rate and efficiency.
+* Multi-modal models. When the user input includes more than just discrete tokens, we can use different hashing methods to handle the caching of inputs of different modalities. For example, perceptual hashing for images to cache similar input images.
diff --git a/docs/source/design/input_processing/input_processing_pipeline.md b/docs/source/design/input_processing/input_processing_pipeline.md
deleted file mode 100644
index bb16920e3d0c0..0000000000000
--- a/docs/source/design/input_processing/input_processing_pipeline.md
+++ /dev/null
@@ -1,19 +0,0 @@
-(input-processing-pipeline)=
-
-# Input Processing Pipeline
-
-1. Input data is passed to {class}`~vllm.LLMEngine` (or {class}`~vllm.AsyncLLMEngine`).
-
-2. Tokenize the data if necessary.
-
-3. Process the inputs using {meth}`INPUT_REGISTRY.process_input `.
-
-   - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings.
-
-4. Send the processed inputs to {class}`~vllm.executor.executor_base.ExecutorBase`.
-
-5. Distribute the inputs via {class}`~vllm.worker.worker_base.WorkerBase` to {class}`~vllm.worker.model_runner_base.ModelRunnerBase`.
-
-6. If the data contains multi-modal data, convert it into keyword arguments using {meth}`MULTIMODAL_REGISTRY.map_input `.
-
-   - For example, convert a {class}`PIL.Image.Image` input to its pixel values for a vision model.
diff --git a/docs/source/design/input_processing/model_inputs_index.md b/docs/source/design/input_processing/model_inputs_index.md
deleted file mode 100644
index cb415366e5a66..0000000000000
--- a/docs/source/design/input_processing/model_inputs_index.md
+++ /dev/null
@@ -1,43 +0,0 @@
-(input-processing)=
-
-# Input Processing
-
-```{eval-rst}
-.. currentmodule:: vllm.inputs
-```
-
-Each model can override parts of vLLM's [input processing pipeline](#input-processing-pipeline) via
-{data}`~vllm.inputs.INPUT_REGISTRY` and {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`.
-
-Currently, this mechanism is only utilized in [multi-modal](#multi-modality) models for preprocessing multi-modal input
-data in addition to input prompt, but it can be extended to text-only language models when needed.
-
-## Guides
-
-```{toctree}
-:maxdepth: 1
-
-input_processing_pipeline
-```
-
-## Module Contents
-
-### LLM Engine Inputs
-
-```{eval-rst}
-.. autoclass:: vllm.inputs.DecoderOnlyInputs
-    :members:
-    :show-inheritance:
-```
-
-### Registry
-
-```{eval-rst}
-.. autodata:: vllm.inputs.INPUT_REGISTRY
-```
-
-```{eval-rst}
-.. automodule:: vllm.inputs.registry
-    :members:
-    :show-inheritance:
-```
diff --git a/docs/source/design/mm_processing.md b/docs/source/design/mm_processing.md
new file mode 100644
index 0000000000000..a0d01205e638c
--- /dev/null
+++ b/docs/source/design/mm_processing.md
@@ -0,0 +1,64 @@
+(mm-processing)=
+
+# Multi-Modal Data Processing
+
+To enable various optimizations in vLLM such as [chunked prefill](#chunked-prefill) and [prefix caching](#automatic-prefix-caching), we use {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` to provide the correspondence between placeholder feature tokens (e.g. ``) and multi-modal inputs (e.g. the raw input image) based on the outputs of the HF processor.
+
+Here are the main features of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor`:
+
+## Prompt Replacement Detection
+
+One of the main responsibilities of the HF processor is to replace input placeholder tokens (e.g. `` for a single image) with feature placeholder tokens (e.g. `...`, the number of which is equal to the feature size). The information about which tokens have been replaced is key to finding the correspondence between placeholder feature tokens and multi-modal inputs.
+
+In vLLM, this information is specified using {class}`~vllm.multimodal.processing.PromptReplacement` in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements`. Given this specification, we can automatically detect whether HF has replaced the input placeholder tokens by checking whether the feature placeholder tokens exist in the prompt.
+
+## Tokenized Prompt Inputs
+
+To enable tokenization in a separate process, we support passing input token IDs alongside multi-modal data.
+
+### The problem
+
+Consider that HF processors follow these main steps:
+
+1. Tokenize the text
+2. Process multi-modal inputs
+3. Perform prompt replacement
+
+And we require that:
+
+- For text + multi-modal inputs, apply all steps 1--3.
+- For tokenized + multi-modal inputs, apply only steps 2--3.
+
+How can we achieve this without rewriting HF processors? We can try to call the HF processor several times on different inputs:
+
+- For text + multi-modal inputs, simply call the HF processor directly.
+- For tokenized + multi-modal inputs, call the processor only on the multi-modal inputs.
+
+While HF processors support text + multi-modal inputs natively, this is not so for tokenized + multi-modal inputs: an error is thrown if the number of input placeholder tokens does not correspond to the number of multi-modal inputs.
+
+Moreover, since the tokenized text has not passed through the HF processor, we have to apply Step 3 by ourselves to keep the output tokens and multi-modal data consistent with each other.
+
+(mm-dummy-text)=
+
+### Dummy text
+
+We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs`. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
+
+(mm-automatic-prompt-replacement)=
+
+### Automatic prompt replacement
+
+We address the second issue by implementing model-agnostic code in
+{meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_prompt_replacements` to automatically replace input placeholder tokens with feature placeholder tokens based on the specification outputted by {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements`.
+
+### Summary
+
+With the help of dummy text and automatic prompt replacement, our multi-modal processor can finally accept both text and token prompts with multi-modal data. The detailed logic is shown in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_main`.
+
+## Processor Output Caching
+
+Some HF processors, such as the one for Qwen2-VL, are [very slow](gh-issue:9238). To alleviate this problem, we cache the multi-modal outputs of the HF processor to avoid processing the same multi-modal input (e.g. image) again.
+
+When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache.
+
+Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of multi-modal inputs, so they can't be passed alongside the text prompt to the HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text](#mm-dummy-text) to avoid HF errors. Since this skips HF's prompt replacement code, we apply [automatic prompt replacement](#mm-automatic-prompt-replacement) afterwards to keep the output tokens and multi-modal data consistent with each other.
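+
+In pseudocode, the lookup-and-fill flow described above looks roughly like this
+(a sketch; the real cache is keyed on the processed item and the processor kwargs):
+
+```python
+from collections.abc import Callable, Hashable
+
+def get_processed_items(
+    items: list[Hashable],
+    cache: dict[Hashable, object],
+    hf_process_batch: Callable[[list[Hashable]], list[object]],
+) -> list[object]:
+    # Find the items whose processed outputs are not cached yet.
+    missing = [item for item in items if item not in cache]
+    if missing:
+        # Process all missing items in a single batch (paired with dummy text
+        # so the HF processor does not complain about placeholder counts).
+        for item, out in zip(missing, hf_process_batch(missing)):
+            cache[item] = out
+    # Merge cached and freshly processed outputs, preserving input order.
+    return [cache[item] for item in items]
+```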
diff --git a/docs/source/features/compatibility_matrix.md b/docs/source/features/compatibility_matrix.md index 8d8f7dca2e5b5..86a82eb36df33 100644 --- a/docs/source/features/compatibility_matrix.md +++ b/docs/source/features/compatibility_matrix.md @@ -322,7 +322,9 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar ``` -### Feature x Hardware +(feature-x-hardware)= + +## Feature x Hardware ```{list-table} :header-rows: 1 @@ -359,7 +361,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - - [✗](gh-pr:4830) + - ✅ - ✅ * - prmpt adptr - ✅ diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md index 3679595e3d4d0..404505eb3890e 100644 --- a/docs/source/features/quantization/auto_awq.md +++ b/docs/source/features/quantization/auto_awq.md @@ -15,7 +15,7 @@ The main benefits are lower latency and memory usage. You can quantize your own models by installing AutoAWQ or picking one of the [400+ models on Huggingface](https://huggingface.co/models?sort=trending&search=awq). ```console -$ pip install autoawq +pip install autoawq ``` After installing AutoAWQ, you are ready to quantize a model. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: @@ -47,7 +47,7 @@ print(f'Model is quantized and saved at "{quant_path}"') To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: ```console -$ python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq +python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq ``` AWQ models are also supported directly through the LLM entrypoint: diff --git a/docs/source/features/quantization/bnb.md b/docs/source/features/quantization/bnb.md index f7f41726f3725..7525e8e7866c3 100644 --- a/docs/source/features/quantization/bnb.md +++ b/docs/source/features/quantization/bnb.md @@ -9,7 +9,7 @@ Compared to other quantization methods, BitsAndBytes eliminates the need for cal Below are the steps to utilize BitsAndBytes with vLLM. ```console -$ pip install bitsandbytes>=0.45.0 +pip install bitsandbytes>=0.45.0 ``` vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint. @@ -17,7 +17,7 @@ vLLM reads the model's config file and supports both in-flight quantization and You can find bitsandbytes quantized models on . And usually, these repositories have a config.json file that includes a quantization_config section. -## Read quantized checkpoint. 
+## Read quantized checkpoint ```python from vllm import LLM @@ -37,10 +37,11 @@ model_id = "huggyllama/llama-7b" llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ quantization="bitsandbytes", load_format="bitsandbytes") ``` + ## OpenAI Compatible Server Append the following to your 4bit model arguments: -``` +```console --quantization bitsandbytes --load-format bitsandbytes ``` diff --git a/docs/source/features/quantization/fp8.md b/docs/source/features/quantization/fp8.md index b2eda74fd1e3b..1398e8a324201 100644 --- a/docs/source/features/quantization/fp8.md +++ b/docs/source/features/quantization/fp8.md @@ -41,7 +41,7 @@ Currently, we load the model at original precision before quantizing down to 8-b To produce performant FP8 quantized models with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: ```console -$ pip install llmcompressor +pip install llmcompressor ``` ## Quantization Process @@ -54,16 +54,15 @@ The quantization process involves three main steps: ### 1. Loading the Model -Use `SparseAutoModelForCausalLM`, which wraps `AutoModelForCausalLM`, for saving and loading quantized models: +Load your model and tokenizer using the standard `transformers` AutoModel classes: ```python -from llmcompressor.transformers import SparseAutoModelForCausalLM -from transformers import AutoTokenizer +from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" - -model = SparseAutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto") +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto", +) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` @@ -98,7 +97,7 @@ tokenizer.save_pretrained(SAVE_DIR) Install `vllm` and `lm-evaluation-harness`: ```console -$ pip install vllm lm-eval==0.4.4 +pip install vllm lm-eval==0.4.4 ``` Load and run the model in `vllm`: diff --git a/docs/source/features/quantization/fp8_e4m3_kvcache.md b/docs/source/features/quantization/fp8_e4m3_kvcache.md index 50edaf81fddd3..1cd67cb8fd336 100644 --- a/docs/source/features/quantization/fp8_e4m3_kvcache.md +++ b/docs/source/features/quantization/fp8_e4m3_kvcache.md @@ -17,7 +17,7 @@ unquantized model through a quantizer tool (e.g. AMD quantizer or NVIDIA AMMO). To install AMMO (AlgorithMic Model Optimization): ```console -$ pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo +pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo ``` Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy. The most recent silicon diff --git a/docs/source/features/quantization/gguf.md b/docs/source/features/quantization/gguf.md index eebf11dfc1b2b..640997cf4bc39 100644 --- a/docs/source/features/quantization/gguf.md +++ b/docs/source/features/quantization/gguf.md @@ -13,16 +13,16 @@ Currently, vllm only supports loading single-file GGUF models. If you have a mul To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command: ```console -$ wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf -$ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. 
-$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0
+wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
+# We recommend using the tokenizer from the base model to avoid a lengthy and potentially buggy tokenizer conversion.
+vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0
 ```
 
 You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs:
 
 ```console
-$ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
-$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2
+# We recommend using the tokenizer from the base model to avoid a lengthy and potentially buggy tokenizer conversion.
+vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2
 ```
 
 ```{warning}
diff --git a/docs/source/features/quantization/int8.md b/docs/source/features/quantization/int8.md
index 1ac50ba987dda..592a60d3988b2 100644
--- a/docs/source/features/quantization/int8.md
+++ b/docs/source/features/quantization/int8.md
@@ -16,7 +16,7 @@ INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turi
 
 To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
 
 ```console
-$ pip install llmcompressor
+pip install llmcompressor
 ```
 
 ## Quantization Process
@@ -30,14 +30,13 @@ The quantization process involves four main steps:
 
 ### 1. Loading the Model
 
-Use `SparseAutoModelForCausalLM`, which wraps `AutoModelForCausalLM`, for saving and loading quantized models:
+Load your model and tokenizer using the standard `transformers` AutoModel classes:
 
 ```python
-from llmcompressor.transformers import SparseAutoModelForCausalLM
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, AutoModelForCausalLM
 
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-model = SparseAutoModelForCausalLM.from_pretrained(
+model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID, device_map="auto", torch_dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
diff --git a/docs/source/features/quantization/supported_hardware.md b/docs/source/features/quantization/supported_hardware.md
index 988288a82d9bc..f5c0a95ea426e 100644
--- a/docs/source/features/quantization/supported_hardware.md
+++ b/docs/source/features/quantization/supported_hardware.md
@@ -113,7 +113,7 @@ The table below shows the compatibility of various quantization implementations
   - ✅︎
   - ✅︎
   - ✅︎
-  - ✗
+  - ✅︎
   - ✗
   - ✗
   - ✗
diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md
index 903acadb71426..ab7b2f302bd13 100644
--- a/docs/source/features/spec_decode.md
+++ b/docs/source/features/spec_decode.md
@@ -192,11 +192,11 @@ A few important things to consider when using the EAGLE based draft models:
 
 1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) cannot be used directly with vLLM due to differences in the expected layer names and model definition.
-   To use these models with vLLM, use the [following script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d)
+   To use these models with vLLM, use the [following script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d)
   to convert them.
Note that this script does not modify the model's weights. In the above example, use the script to first convert - the [yuhuili/EAGLE-LLaMA3-Instruct-8B](https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B) model + the [yuhuili/EAGLE-LLaMA3-Instruct-8B](https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B) model and then use the converted checkpoint as the draft model in vLLM. 2. The EAGLE based draft models need to be run without tensor parallelism @@ -207,7 +207,6 @@ A few important things to consider when using the EAGLE based draft models: reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under investigation and tracked here: [https://github.com/vllm-project/vllm/issues/9565](https://github.com/vllm-project/vllm/issues/9565). - A variety of EAGLE draft models are available on the Hugging Face hub: | Base Model | EAGLE on Hugging Face | # EAGLE Parameters | @@ -224,7 +223,6 @@ A variety of EAGLE draft models are available on the Hugging Face hub: | Qwen2-7B-Instruct | yuhuili/EAGLE-Qwen2-7B-Instruct | 0.26B | | Qwen2-72B-Instruct | yuhuili/EAGLE-Qwen2-72B-Instruct | 1.05B | - ## Lossless guarantees of Speculative Decoding In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of @@ -250,8 +248,6 @@ speculative decoding, breaking down the guarantees into three key areas: same request across runs. For more details, see the FAQ section titled *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq). -**Conclusion** - While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding can occur due to following factors: @@ -259,8 +255,6 @@ can occur due to following factors: - **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially due to non-deterministic behavior in batched operations or numerical instability. -**Mitigation Strategies** - For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq). ## Resources for vLLM contributors diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md index ccd9a6a1b1a14..1d77c7339a33f 100644 --- a/docs/source/features/structured_outputs.md +++ b/docs/source/features/structured_outputs.md @@ -5,7 +5,7 @@ vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines), [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer), or [xgrammar](https://github.com/mlc-ai/xgrammar) as backends for the guided decoding. This document shows you some examples of the different options that are available to generate structured outputs. -## Online Inference (OpenAI API) +## Online Serving (OpenAI API) You can generate structured outputs using the OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API. @@ -239,7 +239,7 @@ The main available options inside `GuidedDecodingParams` are: - `backend` - `whitespace_pattern` -These parameters can be used in the same way as the parameters from the Online Inference examples above. +These parameters can be used in the same way as the parameters from the Online Serving examples above. 
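+
+For instance, constraining the output to an ISO-8601 date with the `regex` option might look
+like this (the model name is illustrative):
+
+```python
+from vllm import LLM, SamplingParams
+from vllm.sampling_params import GuidedDecodingParams
+
+llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")
+
+# Force the completion to match a YYYY-MM-DD pattern.
+guided_decoding_params = GuidedDecodingParams(regex=r"\d{4}-\d{2}-\d{2}")
+sampling_params = SamplingParams(guided_decoding=guided_decoding_params, max_tokens=16)
+outputs = llm.generate(
+    prompts="The date of the 2024 leap day in ISO format is: ",
+    sampling_params=sampling_params,
+)
+print(outputs[0].outputs[0].text)
+```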
One example for the usage of the `choices` parameter is shown below: ```python @@ -257,4 +257,4 @@ outputs = llm.generate( print(outputs[0].outputs[0].text) ``` -Full example: +Full example: diff --git a/docs/source/features/tool_calling.md b/docs/source/features/tool_calling.md index 062f2021eb62a..027ddb6d5eda3 100644 --- a/docs/source/features/tool_calling.md +++ b/docs/source/features/tool_calling.md @@ -55,21 +55,24 @@ print(f"Result: {get_weather(**json.loads(tool_call.arguments))}") ``` Example output: -``` + +```text Function called: get_weather Arguments: {"location": "San Francisco, CA", "unit": "fahrenheit"} Result: Getting the weather for San Francisco, CA in fahrenheit... ``` This example demonstrates: -- Setting up the server with tool calling enabled -- Defining an actual function to handle tool calls -- Making a request with `tool_choice="auto"` -- Handling the structured response and executing the corresponding function + +* Setting up the server with tool calling enabled +* Defining an actual function to handle tool calls +* Making a request with `tool_choice="auto"` +* Handling the structured response and executing the corresponding function You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the guided decoding backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests. Remember that it's the callers responsibility to: + 1. Define appropriate tools in the request 2. Include relevant context in the chat messages 3. Handle the tool calls in your application logic @@ -77,20 +80,21 @@ Remember that it's the callers responsibility to: For more advanced usage, including parallel tool calls and different model-specific parsers, see the sections below. ## Named Function Calling + vLLM supports named function calling in the chat completion API by default. It does so using Outlines through guided decoding, so this is enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a high-quality one. -vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. +vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the guided decoding backend. To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request. - ## Automatic Function Calling To enable this feature, you should set the following flags: + * `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. tells vLLM that you want to enable the model to generate its own tool calls when it deems appropriate. * `--tool-call-parser` -- select the tool parser to use (listed below). 
Additional tool parsers @@ -104,28 +108,28 @@ from HuggingFace; and you can find an example of this in a `tokenizer_config.jso If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template! - ### Hermes Models (`hermes`) All Nous Research Hermes-series models newer than Hermes 2 Pro should be supported. + * `NousResearch/Hermes-2-Pro-*` * `NousResearch/Hermes-2-Theta-*` * `NousResearch/Hermes-3-*` - _Note that the Hermes 2 **Theta** models are known to have degraded tool call quality & capabilities due to the merge step in their creation_. Flags: `--tool-call-parser hermes` - ### Mistral Models (`mistral`) Supported models: + * `mistralai/Mistral-7B-Instruct-v0.3` (confirmed) * Additional mistral function-calling models are compatible as well. Known issues: + 1. Mistral 7B struggles to generate parallel tool calls correctly. 2. Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is much shorter than what vLLM generates. Since an exception is thrown when this condition @@ -136,13 +140,12 @@ it works with vLLM's tool call IDs (provided `tool_call_id` fields are truncated * `examples/tool_chat_template_mistral_parallel.jinja` - this is a "better" version that adds a tool-use system prompt when tools are provided, that results in much better reliability when working with parallel tool calling. - Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja` - ### Llama Models (`llama3_json`) Supported models: + * `meta-llama/Meta-Llama-3.1-8B-Instruct` * `meta-llama/Meta-Llama-3.1-70B-Instruct` * `meta-llama/Meta-Llama-3.1-405B-Instruct` @@ -152,6 +155,7 @@ The tool calling that is supported is the [JSON based tool calling](https://llam Other tool calling formats like the built in python tool calling or custom tool calling are not supported. Known issues: + 1. Parallel tool calls are not supported. 2. The model can generate parameters with a wrong format, such as generating an array serialized as string instead of an array. @@ -164,6 +168,7 @@ Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool #### IBM Granite Supported models: + * `ibm-granite/granite-3.0-8b-instruct` Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja` @@ -182,42 +187,45 @@ Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/t `examples/tool_chat_template_granite_20b_fc.jinja`: this is a modified chat template from the original on Huggingface, which is not vLLM compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported. - ### InternLM Models (`internlm`) Supported models: + * `internlm/internlm2_5-7b-chat` (confirmed) * Additional internlm2.5 function-calling models are compatible as well Known issues: + * Although this implementation also supports InternLM2, the tool call results are not stable when testing with the `internlm/internlm2-chat-7b` model. Recommended flags: `--tool-call-parser internlm --chat-template examples/tool_chat_template_internlm2_tool.jinja` - ### Jamba Models (`jamba`) + AI21's Jamba-1.5 models are supported. 
+ * `ai21labs/AI21-Jamba-1.5-Mini` * `ai21labs/AI21-Jamba-1.5-Large` - Flags: `--tool-call-parser jamba` - ### Models with Pythonic Tool Calls (`pythonic`) A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models. As a concrete example, these models may look up the weather in San Francisco and Seattle by generating: + ```python [get_weather(city='San Francisco', metric='celsius'), get_weather(city='Seattle', metric='celsius')] ``` Limitations: + * The model must not generate both text and tool calls in the same generation. This may not be hard to change for a specific model, but the community currently lacks consensus on which tokens to emit when starting and ending tool calls. (In particular, the Llama 3.2 models emit no such tokens.) * Llama's smaller models struggle to use tools effectively. Example supported models: + * `meta-llama/Llama-3.2-1B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) * `meta-llama/Llama-3.2-3B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) * `Team-ACE/ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) @@ -231,7 +239,6 @@ Llama's smaller models frequently fail to emit tool calls in the correct format. --- - ## How to write a tool parser plugin A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py. @@ -284,7 +291,8 @@ class ExampleToolParser(ToolParser): ``` Then you can use this plugin in the command line like this. -``` + +```console --enable-auto-tool-choice \ --tool-parser-plugin --tool-call-parser example \ diff --git a/docs/source/getting_started/faq.md b/docs/source/getting_started/faq.md index fde2954f10c59..4751b325e6fc4 100644 --- a/docs/source/getting_started/faq.md +++ b/docs/source/getting_started/faq.md @@ -30,7 +30,7 @@ changes in batch size, or batch expansion in speculative decoding. These batchin can lead to slightly different logit/logprob values at each step. Such differences can accumulate, potentially resulting in different tokens being sampled. Once a different token is sampled, further divergence is likely. -**Mitigation Strategies** +## Mitigation Strategies - For improved stability and reduced variance, use `float32`. Note that this will require more memory. - If using `bfloat16`, switching to `float16` can also help. diff --git a/docs/source/getting_started/installation/hpu-gaudi.md b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md similarity index 89% rename from docs/source/getting_started/installation/hpu-gaudi.md rename to docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md index 1d50cef3bdc83..b4695d504b601 100644 --- a/docs/source/getting_started/installation/hpu-gaudi.md +++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md @@ -1,10 +1,13 @@ -(installation-gaudi)= +# Installation -# Installation for Intel® Gaudi® +This tab provides instructions on running vLLM with Intel Gaudi devices. -This README provides instructions on running vLLM with Intel Gaudi devices. 
+## Requirements -## Requirements and Installation +- OS: Ubuntu 22.04 LTS +- Python: 3.10 +- Intel Gaudi accelerator +- Intel Gaudi software version 1.18.0 Please follow the instructions provided in the [Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) @@ -12,42 +15,24 @@ to set up the execution environment. To achieve the best performance, please follow the methods outlined in the [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). -### Requirements +## Configure a new environment -- OS: Ubuntu 22.04 LTS -- Python: 3.10 -- Intel Gaudi accelerator -- Intel Gaudi software version 1.18.0 - -### Quick start using Dockerfile - -```console -$ docker build -f Dockerfile.hpu -t vllm-hpu-env . -$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env -``` - -```{tip} -If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered. -``` - -### Build from source - -#### Environment verification +### Environment verification To verify that the Intel Gaudi software was correctly installed, run: ```console -$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible -$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed -$ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed -$ pip list | grep neural # verify that neural_compressor is installed +hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible +apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed +pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed +pip list | grep neural # verify that neural_compressor is installed ``` Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) for more details. -#### Run Docker Image +### Run Docker Image It is highly recommended to use the latest Docker image from Intel Gaudi vault. Refer to the [Intel Gaudi @@ -57,33 +42,58 @@ for more details. 
Use the following commands to run a Docker image: ```console -$ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest -$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest ``` -#### Build and Install vLLM +## Set up using Python + +### Pre-built wheels + +Currently, there are no pre-built Intel Gaudi wheels. + +### Build wheel from source To build and install vLLM from source, run: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ python setup.py develop +git clone https://github.com/vllm-project/vllm.git +cd vllm +python setup.py develop ``` Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: ```console -$ git clone https://github.com/HabanaAI/vllm-fork.git -$ cd vllm-fork -$ git checkout habana_main -$ python setup.py develop +git clone https://github.com/HabanaAI/vllm-fork.git +cd vllm-fork +git checkout habana_main +python setup.py develop ``` -## Supported Features +## Set up using Docker + +### Pre-built images + +Currently, there are no pre-built Intel Gaudi images. + +### Build image from source + +```console +docker build -f Dockerfile.hpu -t vllm-hpu-env . +docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env +``` + +```{tip} +If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered. +``` + +## Extra information + +## Supported features - [Offline inference](#offline-inference) -- Online inference via [OpenAI-Compatible Server](#openai-compatible-server) +- Online serving via [OpenAI-Compatible Server](#openai-compatible-server) - HPU autodetection - no need to manually select device within vLLM - Paged KV cache with algorithms enabled for Intel Gaudi accelerators - Custom Intel Gaudi implementations of Paged Attention, KV cache ops, @@ -94,14 +104,14 @@ $ python setup.py develop for accelerating low-batch latency and throughput - Attention with Linear Biases (ALiBi) -## Unsupported Features +## Unsupported features - Beam search - LoRA adapters - Quantization - Prefill chunking (mixed-batch inferencing) -## Supported Configurations +## Supported configurations The following configurations have been validated to be function with Gaudi2 devices. Configurations that are not listed may or may not work. @@ -137,7 +147,7 @@ Gaudi2 devices. Configurations that are not listed may or may not work. 
- [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -## Performance Tuning +## Performance tuning ### Execution modes @@ -181,7 +191,7 @@ Bucketing allows us to reduce the number of required graphs significantly, but i Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: -``` +```text INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] @@ -192,7 +202,7 @@ INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 1 Example (with ramp-up) -``` +```text min = 2, step = 32, max = 64 => ramp_up = (2, 4, 8, 16) => stable = (32, 64) @@ -201,7 +211,7 @@ min = 2, step = 32, max = 64 Example (without ramp-up) -``` +```text min = 128, step = 128, max = 512 => ramp_up = () => stable = (128, 256, 384, 512) @@ -224,7 +234,7 @@ Bucketing is transparent to a client -- padding in sequence length dimension is Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. 
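Since warmup executes one forward pass per bucket, the number of warmup steps is simply the product of the per-dimension bucket counts. As a rough illustration of the ramp-up/stable expansion described above (a sketch for intuition, not vLLM's actual implementation):

```python
def warmup_values(vmin: int, step: int, vmax: int) -> list[int]:
    # Ramp-up phase: values double from `vmin` until they reach `step`.
    ramp_up, value = [], vmin
    while value < step and value <= vmax:
        ramp_up.append(value)
        value *= 2
    # Stable phase: constant increments of `step` up to `vmax`.
    stable = list(range(step, vmax + 1, step))
    return ramp_up + stable

print(warmup_values(2, 32, 64))      # [2, 4, 8, 16, 32, 64]
print(warmup_values(128, 128, 512))  # [128, 256, 384, 512]
# The prompt bucket count from the startup log above: 3 batch sizes x 8 sequence lengths = 24.
print(len(warmup_values(1, 32, 4)) * len(warmup_values(128, 128, 1024)))  # 24
```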
Each warmup step is logged during vLLM startup: -``` +```text INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB @@ -273,7 +283,7 @@ When there's large amount of requests pending, vLLM scheduler will attempt to fi Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): -``` +```text INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] @@ -349,26 +359,26 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi - Default values: - Prompt: - : - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` - - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` - - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)` - - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` - - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` - - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len` + - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` + - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` + - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)` + - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` + - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len` - Decode: - : - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` - - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` - - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` - - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size` - - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` - - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)` + - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` + - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` + - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` + - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size` + - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)` Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: - `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will be used, `1` is default - `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs -## Troubleshooting: Tweaking HPU Graphs +## Troubleshooting: tweaking HPU graphs If you experience device out-of-memory issues or want to 
attempt inference at higher batch sizes, try tweaking HPU Graphs by following @@ -385,5 +395,5 @@ the below: completely. With HPU Graphs disabled, you are trading latency and throughput at lower batches for potentially higher throughput on higher batches. You can do that by adding `--enforce-eager` flag to - server (for online inference), or by passing `enforce_eager=True` + server (for online serving), or by passing `enforce_eager=True` argument to LLM constructor (for offline inference). diff --git a/docs/source/getting_started/installation/ai_accelerator/index.md b/docs/source/getting_started/installation/ai_accelerator/index.md new file mode 100644 index 0000000000000..a6c4c44305a4c --- /dev/null +++ b/docs/source/getting_started/installation/ai_accelerator/index.md @@ -0,0 +1,375 @@ +# Other AI accelerators + +vLLM is a Python library that supports the following AI accelerators. Select your AI accelerator type to see vendor specific instructions: + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} openvino.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::: + +## Requirements + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "## Requirements" +:end-before: "## Configure a new environment" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "## Requirements" +:end-before: "## Configure a new environment" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "## Requirements" +:end-before: "## Configure a new environment" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} openvino.inc.md +:start-after: "## Requirements" +:end-before: "## Set up using Python" +``` + +::: + +:::: + +## Configure a new environment + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "## Configure a new environment" +:end-before: "## Set up using Python" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "## Configure a new environment" +:end-before: "## Set up using Python" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "## Configure a new environment" +:end-before: "## Set up using Python" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} ../python_env_setup.inc.md +``` + +::: + +:::: + +## Set up using Python + +### Pre-built wheels + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "### Pre-built wheels" +:end-before: "### Build wheel from source" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "### Pre-built wheels" +:end-before: "### Build wheel from source" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "### Pre-built wheels" +:end-before: "### Build wheel from 
source" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} openvino.inc.md +:start-after: "### Pre-built wheels" +:end-before: "### Build wheel from source" +``` + +::: + +:::: + +### Build wheel from source + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} openvino.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::: + +## Set up using Docker + +### Pre-built images + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} openvino.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +``` + +::: + +:::: + +### Build image from source + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "### Build image from source" +:end-before: "## Extra information" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "### Build image from source" +:end-before: "## Extra information" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "### Build image from source" +:end-before: "## Extra information" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} openvino.inc.md +:start-after: "### Build image from source" +:end-before: "## Extra information" +``` + +::: + +:::: + +## Extra information + +::::{tab-set} +:sync-group: device + +:::{tab-item} TPU +:sync: tpu + +```{include} tpu.inc.md +:start-after: "## Extra information" +``` + +::: + +:::{tab-item} Intel Gaudi +:sync: hpu-gaudi + +```{include} hpu-gaudi.inc.md +:start-after: "## Extra information" +``` + +::: + +:::{tab-item} Neuron +:sync: neuron + +```{include} neuron.inc.md +:start-after: "## Extra information" +``` + +::: + +:::{tab-item} OpenVINO +:sync: openvino + +```{include} openvino.inc.md +:start-after: "## Extra information" +``` + +::: + +:::: diff --git a/docs/source/getting_started/installation/neuron.md b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md similarity index 83% rename from docs/source/getting_started/installation/neuron.md rename to docs/source/getting_started/installation/ai_accelerator/neuron.inc.md index 431f90537f543..575a9f9c2e2f0 100644 --- a/docs/source/getting_started/installation/neuron.md +++ b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md @@ -1,6 +1,4 @@ -(installation-neuron)= - -# Installation for Neuron +# Installation vLLM 
0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching. Paged Attention and Chunked Prefill are currently in development and will be available soon. @@ -14,28 +12,9 @@ Data types currently supported in Neuron SDK are FP16 and BF16. - Pytorch 2.0.1/2.1.1 - AWS Neuron SDK 2.16/2.17 (Verified on python 3.8) -Installation steps: +## Configure a new environment -- [Build from source](#build-from-source-neuron) - - - [Step 0. Launch Trn1/Inf2 instances](#launch-instances) - - [Step 1. Install drivers and tools](#install-drivers) - - [Step 2. Install transformers-neuronx and its dependencies](#install-tnx) - - [Step 3. Install vLLM from source](#install-vllm) - -(build-from-source-neuron)= - -```{note} -The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. -``` - -## Build from source - -Following instructions are applicable to Neuron SDK 2.16 and beyond. - -(launch-instances)= - -### Step 0. Launch Trn1/Inf2 instances +### Launch Trn1/Inf2 instances Here are the steps to launch trn1/inf2 instances, in order to install [PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/pytorch/neuronx/ubuntu/torch-neuronx-ubuntu22.html). @@ -45,9 +24,7 @@ Here are the steps to launch trn1/inf2 instances, in order to install [PyTorch N - When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB. - After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance -(install-drivers)= - -### Step 1. Install drivers and tools +### Install drivers and tools The installation of drivers and tools wouldn't be necessary, if [Deep Learning AMI Neuron](https://docs.aws.amazon.com/dlami/latest/devguide/appendix-ami-release-notes.html) is installed. In case the drivers and tools are not installed on the operating system, follow the steps below: @@ -82,9 +59,21 @@ sudo apt-get install aws-neuronx-tools=2.* -y export PATH=/opt/aws/neuron/bin:$PATH ``` -(install-tnx)= +## Set up using Python -### Step 2. Install transformers-neuronx and its dependencies +### Pre-built wheels + +Currently, there are no pre-built Neuron wheels. + +### Build wheel from source + +```{note} +The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. +``` + +Following instructions are applicable to Neuron SDK 2.16 and beyond. + +#### Install transformers-neuronx and its dependencies [transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx) will be the backend to support inference on trn1/inf2 instances. Follow the steps below to install transformer-neuronx package and its dependencies. @@ -116,17 +105,31 @@ python -m pip install awscli python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx ``` -(install-vllm)= - -### Step 3. 
Install vLLM from source +#### Install vLLM from source Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ pip install -U -r requirements-neuron.txt -$ VLLM_TARGET_DEVICE="neuron" pip install . +git clone https://github.com/vllm-project/vllm.git +cd vllm +pip install -U -r requirements-neuron.txt +VLLM_TARGET_DEVICE="neuron" pip install . ``` If neuron packages are detected correctly in the installation process, `vllm-0.3.0+neuron212` will be installed. + +## Set up using Docker + +### Pre-built images + +Currently, there are no pre-built Neuron images. + +### Build image from source + +See for instructions on building the Docker image. + +Make sure to use in place of the default Dockerfile. + +## Extra information + +There is no extra information for this device. diff --git a/docs/source/getting_started/installation/openvino.md b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md similarity index 67% rename from docs/source/getting_started/installation/openvino.md rename to docs/source/getting_started/installation/ai_accelerator/openvino.inc.md index 60f95fd1c4250..a7867472583d6 100644 --- a/docs/source/getting_started/installation/openvino.md +++ b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md @@ -1,63 +1,65 @@ -(installation-openvino)= +# Installation -# Installation for OpenVINO - -vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). OpenVINO vLLM backend supports the following advanced vLLM features: - -- Prefix caching (`--enable-prefix-caching`) -- Chunked prefill (`--enable-chunked-prefill`) - -**Table of contents**: - -- [Requirements](#openvino-backend-requirements) -- [Quick start using Dockerfile](#openvino-backend-quick-start-dockerfile) -- [Build from source](#install-openvino-backend-from-source) -- [Performance tips](#openvino-backend-performance-tips) -- [Limitations](#openvino-backend-limitations) - -(openvino-backend-requirements)= +vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). ## Requirements - OS: Linux - Instruction set architecture (ISA) requirement: at least AVX2. -(openvino-backend-quick-start-dockerfile)= +## Set up using Python -## Quick start using Dockerfile +### Pre-built wheels + +Currently, there are no pre-built OpenVINO wheels. + +### Build wheel from source + +First, install Python. For example, on Ubuntu 22.04, you can run: ```console -$ docker build -f Dockerfile.openvino -t vllm-openvino-env . 
-$ docker run -it --rm vllm-openvino-env
+sudo apt-get update -y
+sudo apt-get install python3
```

-(install-openvino-backend-from-source)=
+Second, install the prerequisites for the vLLM OpenVINO backend installation:

-## Install from source
+```console
+pip install --upgrade pip
+pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
+```

-- First, install Python. For example, on Ubuntu 22.04, you can run:
+Finally, install vLLM with the OpenVINO backend:

- ```console
- $ sudo apt-get update -y
- $ sudo apt-get install python3
- ```
+```console
+PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
+```

-- Second, install prerequisites vLLM OpenVINO backend installation:
+:::{tip}
+To use the vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html).
+:::

- ```console
- $ pip install --upgrade pip
- $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
- ```
+## Set up using Docker

-- Finally, install vLLM with OpenVINO backend:
+### Pre-built images

- ```console
- $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
- ```
+Currently, there are no pre-built OpenVINO images.

-- [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html).
+### Build image from source

-(openvino-backend-performance-tips)=
+```console
+docker build -f Dockerfile.openvino -t vllm-openvino-env .
+docker run -it --rm vllm-openvino-env
+```
+
+## Extra information
+
+## Supported features
+
+OpenVINO vLLM backend supports the following advanced vLLM features:
+
+- Prefix caching (`--enable-prefix-caching`)
+- Chunked prefill (`--enable-chunked-prefill`)

## Performance tips

@@ -95,8 +97,6 @@ $ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json
```

-(openvino-backend-limitations)=

## Limitations

- LoRA serving is not supported.
diff --git a/docs/source/getting_started/installation/tpu.md b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
similarity index 85%
rename from docs/source/getting_started/installation/tpu.md
rename to docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
index bc93c44fead30..6a911cc6b9eba 100644
--- a/docs/source/getting_started/installation/tpu.md
+++ b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
@@ -1,6 +1,4 @@
-(installation-tpu)=
-
-# Installation for TPUs
+# Installation

Tensor Processing Units (TPUs) are Google's custom-developed application-specific
integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs
@@ -54,7 +52,16 @@ In all of the following commands, replace the ALL CAPS parameter names with
appropriate values. See the parameter descriptions table for more information.
``` -## Provision a Cloud TPU with the queued resource API +### Provision Cloud TPUs with GKE + +For more information about using TPUs with GKE, see: +- +- +- + +## Configure a new environment + +### Provision a Cloud TPU with the queued resource API Create a TPU v5e with 4 TPU chips: @@ -102,6 +109,14 @@ Connect to your TPU using SSH: gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE ``` +## Set up using Python + +### Pre-built wheels + +Currently, there are no pre-built TPU wheels. + +### Build wheel from source + Install Miniconda: ```bash @@ -142,28 +157,25 @@ Run the setup script: VLLM_TARGET_DEVICE="tpu" python setup.py develop ``` -## Provision Cloud TPUs with GKE +## Set up using Docker -For more information about using TPUs with GKE, see - - - +### Pre-built images -(build-docker-tpu)= +See for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`. -## Build a docker image with {code}`Dockerfile.tpu` +### Build image from source You can use to build a Docker image with TPU support. ```console -$ docker build -f Dockerfile.tpu -t vllm-tpu . +docker build -f Dockerfile.tpu -t vllm-tpu . ``` Run the Docker image with the following command: ```console -$ # Make sure to add `--privileged --net host --shm-size=16G`. -$ docker run --privileged --net host --shm-size=16G -it vllm-tpu +# Make sure to add `--privileged --net host --shm-size=16G`. +docker run --privileged --net host --shm-size=16G -it vllm-tpu ``` ```{note} @@ -189,3 +201,7 @@ Install OpenBLAS with the following command: $ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev ``` ```` + +## Extra information + +There is no extra information for this device. diff --git a/docs/source/getting_started/installation/cpu-arm.md b/docs/source/getting_started/installation/cpu-arm.md deleted file mode 100644 index e199073ed721f..0000000000000 --- a/docs/source/getting_started/installation/cpu-arm.md +++ /dev/null @@ -1,46 +0,0 @@ -(installation-arm)= - -# Installation for ARM CPUs - -vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM (which also apply to Apple Silicon, see [Installation for macOS](#installation-apple) for more). For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering: - -- CPU backend inference capabilities -- Relevant runtime environment variables -- Performance optimization tips - -ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. -Contents: - -1. [Requirements](#arm-backend-requirements) -2. [Quick Start with Dockerfile](#arm-backend-quick-start-dockerfile) -3. [Building from Source](#build-arm-backend-from-source) - -(arm-backend-requirements)= - -## Requirements - -- **Operating System**: Linux or macOS -- **Compilers**: `gcc/g++ >= 12.3.0` (optional, but recommended) or `Apple Clang >= 15.0.0` for macOS -- **Instruction Set Architecture (ISA)**: NEON support is required - -(arm-backend-quick-start-dockerfile)= - -## Quick Start with Dockerfile - -You can quickly set up vLLM on ARM using Docker: - -```console -$ docker build -f Dockerfile.arm -t vllm-cpu-env --shm-size=4g . 
-$ docker run -it \
- --rm \
- --network=host \
- --cpuset-cpus= \
- --cpuset-mems= \
- vllm-cpu-env
-```
-
-(build-arm-backend-from-source)=
-
-## Building from Source
-
-To build vLLM from source on Ubuntu 22.04 or other Linux distributions, follow a similar process as with x86. Testing has been conducted on AWS Graviton3 instances for compatibility.
diff --git a/docs/source/getting_started/installation/cpu-apple.md b/docs/source/getting_started/installation/cpu/apple.inc.md
similarity index 58%
rename from docs/source/getting_started/installation/cpu-apple.md
rename to docs/source/getting_started/installation/cpu/apple.inc.md
index b55e4384d064d..56545253b1ef7 100644
--- a/docs/source/getting_started/installation/cpu-apple.md
+++ b/docs/source/getting_started/installation/cpu/apple.inc.md
@@ -1,42 +1,40 @@
-(installation-apple)=
+# Installation

-# Installation for macOS
-
-vLLM has experimental support for macOS with Apple Silicon. For now, users shall build from the source vLLM to natively run on macOS. For more details, like running on vLLM in a docker container, see [ARM CPU Documentation](installation-arm)
+vLLM has experimental support for macOS with Apple silicon. For now, users must build vLLM from source to run it natively on macOS.

Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.

## Requirements

-- **Operating System**: `macOS Sonoma` or later
-- **SDK** `XCode 15.4` or later with Command Line Tools
-- **Compilers**: `Apple Clang >= 15.0.0`
+- OS: `macOS Sonoma` or later
+- SDK: `XCode 15.4` or later with Command Line Tools
+- Compiler: `Apple Clang >= 15.0.0`

-
+## Set up using Python

-## Build and installation
+### Pre-built wheels
+
+### Build wheel from source

After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source.

-```
-$ git clone https://github.com/vllm-project/vllm.git
-$ cd vllm
-$ pip install -r requirements-cpu.txt
-$ pip install -e .
+```console
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+pip install -r requirements-cpu.txt
+pip install -e .
```

```{note}
On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device.
```

+#### Troubleshooting

-
-## Troubleshooting
-
-If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your
+If the build fails with errors like the snippet below, where standard C++ headers cannot be found, try removing and reinstalling your
[Command Line Tools for Xcode](https://developer.apple.com/download/all/).

-```
+```text
[...] fatal error: 'map' file not found
1 | #include 
| ^~~~~
@@ -49,3 +47,10 @@ If the build has error like the following snippet where standard C++ headers can
1 error generated.
```

+## Set up using Docker
+
+### Pre-built images
+
+### Build image from source
+
+## Extra information
diff --git a/docs/source/getting_started/installation/cpu/arm.inc.md b/docs/source/getting_started/installation/cpu/arm.inc.md
new file mode 100644
index 0000000000000..08a764e1a25f4
--- /dev/null
+++ b/docs/source/getting_started/installation/cpu/arm.inc.md
@@ -0,0 +1,30 @@
+# Installation
+
+vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform.
+
+The ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
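+
+As a quick smoke test after building, a minimal offline sketch that pins one of the datatypes listed above (a hedged illustration; `facebook/opt-125m` is just a small placeholder model):
+
+```python
+from vllm import LLM
+
+# dtype can be set to "float32", "float16" or "bfloat16" on this backend
+# (assumption: the source build described below completed successfully).
+llm = LLM(model="facebook/opt-125m", dtype="bfloat16")
+print(llm.generate(["Hello, my name is"])[0].outputs[0].text)
+```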
+
+## Requirements
+
+- OS: Linux
+- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended)
+- Instruction Set Architecture (ISA): NEON support is required
+
+## Set up using Python
+
+### Pre-built wheels
+
+### Build wheel from source
+
+:::{include} build.inc.md
+:::
+
+Testing has been conducted on AWS Graviton3 instances for compatibility.
+
+## Set up using Docker
+
+### Pre-built images
+
+### Build image from source
+
+## Extra information
diff --git a/docs/source/getting_started/installation/cpu/build.inc.md b/docs/source/getting_started/installation/cpu/build.inc.md
new file mode 100644
index 0000000000000..f8d1044a0d198
--- /dev/null
+++ b/docs/source/getting_started/installation/cpu/build.inc.md
@@ -0,0 +1,21 @@
+First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run:
+
+```console
+sudo apt-get update -y
+sudo apt-get install -y gcc-12 g++-12 libnuma-dev
+sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+```
+
+Second, install the Python packages required to build the vLLM CPU backend:
+
+```console
+pip install --upgrade pip
+pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy
+pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+```
+
+Finally, build and install the vLLM CPU backend:
+
+```console
+VLLM_TARGET_DEVICE=cpu python setup.py install
+```
diff --git a/docs/source/getting_started/installation/cpu-x86.md b/docs/source/getting_started/installation/cpu/index.md
similarity index 62%
rename from docs/source/getting_started/installation/cpu-x86.md
rename to docs/source/getting_started/installation/cpu/index.md
index bb046dd0fd9dc..4ec907c0e9fda 100644
--- a/docs/source/getting_started/installation/cpu-x86.md
+++ b/docs/source/getting_started/installation/cpu/index.md
@@ -1,35 +1,136 @@
-(installation-x86)=
+# CPU

-# Installation for x86 CPUs
+vLLM is a Python library that supports the following CPU variants. Select your CPU type to see vendor-specific instructions:

-vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features:
+::::{tab-set}
+:sync-group: device

-- Tensor Parallel
-- Model Quantization (`INT8 W8A8, AWQ`)
-- Chunked-prefill
-- Prefix-caching
-- FP8-E5M2 KV-Caching (TODO)
+:::{tab-item} x86
+:sync: x86

-Table of contents:
+```{include} x86.inc.md
+:start-after: "# Installation"
+:end-before: "## Requirements"
+```

-1. [Requirements](#cpu-backend-requirements)
-2. [Quick start using Dockerfile](#cpu-backend-quick-start-dockerfile)
-3. [Build from source](#build-cpu-backend-from-source)
-4. [Related runtime environment variables](#env-intro)
-5. [Intel Extension for PyTorch](#ipex-guidance)
-6. 
[Performance tips](#cpu-backend-performance-tips) +::: -(cpu-backend-requirements)= +:::{tab-item} ARM +:sync: arm + +```{include} arm.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::{tab-item} Apple silicon +:sync: apple + +```{include} apple.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::: ## Requirements -- OS: Linux -- Compiler: `gcc/g++>=12.3.0` (optional, recommended) -- Instruction set architecture (ISA) requirement: AVX512 (optional, recommended) +- Python: 3.9 -- 3.12 -(cpu-backend-quick-start-dockerfile)= +::::{tab-set} +:sync-group: device -## Quick start using Dockerfile +:::{tab-item} x86 +:sync: x86 + +```{include} x86.inc.md +:start-after: "## Requirements" +:end-before: "## Set up using Python" +``` + +::: + +:::{tab-item} ARM +:sync: arm + +```{include} arm.inc.md +:start-after: "## Requirements" +:end-before: "## Set up using Python" +``` + +::: + +:::{tab-item} Apple silicon +:sync: apple + +```{include} apple.inc.md +:start-after: "## Requirements" +:end-before: "## Set up using Python" +``` + +::: + +:::: + +## Set up using Python + +### Create a new Python environment + +```{include} ../python_env_setup.inc.md +``` + +### Pre-built wheels + +Currently, there are no pre-built CPU wheels. + +### Build wheel from source + +::::{tab-set} +:sync-group: device + +:::{tab-item} x86 +:sync: x86 + +```{include} x86.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::{tab-item} ARM +:sync: arm + +```{include} arm.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::{tab-item} Apple silicon +:sync: apple + +```{include} apple.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::: + +## Set up using Docker + +### Pre-built images + +Currently, there are no pre-build CPU images. + +### Build image from source ```console $ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g . @@ -41,69 +142,42 @@ $ docker run -it \ vllm-cpu-env ``` -(build-cpu-backend-from-source)= +:::{tip} +For ARM or Apple silicon, use `Dockerfile.arm` +::: -## Build from source +## Supported features -- First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run: +vLLM CPU backend supports the following vLLM features: -```console -$ sudo apt-get update -y -$ sudo apt-get install -y gcc-12 g++-12 libnuma-dev -$ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 -``` - -- Second, install Python packages for vLLM CPU backend building: - -```console -$ pip install --upgrade pip -$ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy -$ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu -``` - -- Finally, build and install vLLM CPU backend: - -```console -$ VLLM_TARGET_DEVICE=cpu python setup.py install -``` - -```{note} -- AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. 
-- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building. -``` - -(env-intro)= +- Tensor Parallel +- Model Quantization (`INT8 W8A8, AWQ, GPTQ`) +- Chunked-prefill +- Prefix-caching +- FP8-E5M2 KV-Caching (TODO) ## Related runtime environment variables - `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. - `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. -(ipex-guidance)= - -## Intel Extension for PyTorch - -- [Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware. - -(cpu-backend-performance-tips)= - ## Performance tips - We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run: ```console -$ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library -$ find / -name *libtcmalloc* # find the dynamic link library path -$ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD -$ python examples/offline_inference/offline_inference.py # run vLLM +sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library +find / -name *libtcmalloc* # find the dynamic link library path +export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD +python examples/offline_inference/basic.py # run vLLM ``` - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP: ```console -$ export VLLM_CPU_KVCACHE_SPACE=40 -$ export VLLM_CPU_OMP_THREADS_BIND=0-29 -$ vllm serve facebook/opt-125m +export VLLM_CPU_KVCACHE_SPACE=40 +export VLLM_CPU_OMP_THREADS_BIND=0-29 +vllm serve facebook/opt-125m ``` - If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND`. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: @@ -132,23 +206,23 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15 $ export VLLM_CPU_OMP_THREADS_BIND=0-7 -$ python examples/offline_inference/offline_inference.py +$ python examples/offline_inference/basic.py ``` - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access. 
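The same setup can also be driven from Python for offline runs; a minimal sketch (assumption: the environment variables must be set before vLLM is imported, and the values mirror the console examples above):

```python
import os

# Mirror the console examples above: reserve KV-cache space and pin OpenMP threads.
os.environ["VLLM_CPU_KVCACHE_SPACE"] = "40"
os.environ["VLLM_CPU_OMP_THREADS_BIND"] = "0-29"

from vllm import LLM  # import after the environment is configured

llm = LLM(model="facebook/opt-125m")
print(llm.generate(["The capital of France is"])[0].outputs[0].text)
```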
-## CPU Backend Considerations
+## Other considerations

- The CPU backend significantly differs from the GPU backend since the vLLM architecture was originally optimized for GPU use. A number of optimizations are needed to enhance its performance.

- Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance.

-- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.md#non-uniform-memory-access-numa). For NUMA architecture, two optimizations are to recommended: Tensor Parallel or Data Parallel.
+- On a CPU-based setup with NUMA enabled, memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.md#non-uniform-memory-access-numa). For NUMA architectures, two optimizations are recommended: Tensor Parallel or Data Parallel.

  - Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With [TP feature on CPU](gh-pr:6125) merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving:

    ```console
-    $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
+    VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
    ```

-  - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md).
+  - Using Data Parallel for maximum throughput: launch an LLM serving endpoint on each NUMA node, along with one additional load balancer to dispatch requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. The Anyscale Ray project provides this feature for LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is an example of setting up scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md).
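For the Data Parallel option, the dispatcher can be as simple as a round-robin client. A minimal sketch of the idea (the two ports and the model name are assumptions, standing in for a real load balancer such as Nginx or HAProxy):

```python
import itertools
from openai import OpenAI

# One vLLM server per NUMA node (hypothetical ports 8000 and 8001).
endpoints = itertools.cycle([
    OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY"),
    OpenAI(base_url="http://localhost:8001/v1", api_key="EMPTY"),
])

def complete(prompt: str) -> str:
    client = next(endpoints)  # naive round-robin dispatch
    resp = client.completions.create(
        model="meta-llama/Llama-2-7b-chat-hf",
        prompt=prompt,
        max_tokens=32,
    )
    return resp.choices[0].text

print(complete("The capital of France is"))
```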
diff --git a/docs/source/getting_started/installation/cpu/x86.inc.md b/docs/source/getting_started/installation/cpu/x86.inc.md
new file mode 100644
index 0000000000000..e4f99d3cebdf2
--- /dev/null
+++ b/docs/source/getting_started/installation/cpu/x86.inc.md
@@ -0,0 +1,35 @@
+# Installation
+
+vLLM initially supports basic model inferencing and serving on the x86 CPU platform, with data types FP32, FP16 and BF16.
+
+## Requirements
+
+- OS: Linux
+- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended)
+- Instruction Set Architecture (ISA): AVX512 (optional, recommended)
+
+## Set up using Python
+
+### Pre-built wheels
+
+### Build wheel from source
+
+:::{include} build.inc.md
+:::
+
+```{note}
+- AVX512_BF16 is an extension ISA that provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script checks the host CPU flags to determine whether to enable AVX512_BF16.
+- If you want to force enable AVX512_BF16 for cross-compilation, set the environment variable `VLLM_CPU_AVX512BF16=1` before building.
+```
+
+## Set up using Docker
+
+### Pre-built images
+
+### Build image from source
+
+## Extra information
+
+## Intel Extension for PyTorch
+
+- [Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date features and optimizations for an extra performance boost on Intel hardware.
diff --git a/docs/source/getting_started/installation/device.template.md b/docs/source/getting_started/installation/device.template.md
new file mode 100644
index 0000000000000..44f538da93659
--- /dev/null
+++ b/docs/source/getting_started/installation/device.template.md
@@ -0,0 +1,17 @@
+# Installation
+
+## Requirements
+
+## Set up using Python
+
+### Pre-built wheels
+
+### Build wheel from source
+
+## Set up using Docker
+
+### Pre-built images
+
+### Build image from source
+
+## Extra information
diff --git a/docs/source/getting_started/installation/gpu-rocm.md b/docs/source/getting_started/installation/gpu-rocm.md
deleted file mode 100644
index e36b92513e31d..0000000000000
--- a/docs/source/getting_started/installation/gpu-rocm.md
+++ /dev/null
@@ -1,163 +0,0 @@
-(installation-rocm)=
-
-# Installation for ROCm
-
-vLLM supports AMD GPUs with ROCm 6.2.
-
-## Requirements
-
-- OS: Linux
-- Python: 3.9 -- 3.12
-- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
-- ROCm 6.2
-
-Installation options:
-
-1. [Build from source with docker](#build-from-source-docker-rocm)
-2. [Build from source](#build-from-source-rocm)
-
-(build-from-source-docker-rocm)=
-
-## Option 1: Build from source with docker (recommended)
-
-You can build and install vLLM from source.
-
-First, build a docker image from  and launch a docker container from the image.
-It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
-
-```console
-{
-    "features": {
-        "buildkit": true
-    }
-}
-```
-
- uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches.
-It provides flexibility to customize the build of docker image using the following arguments:
-
-- `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image.
-- `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For [Radeon RX 7900 series (gfx1100)](https://rocm.docs.amd.com/projects/radeon/en/latest/index.html), this should be set to 0 before flash-attention supports this target. -- `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942` -- `FA_BRANCH`: specifies the branch used to build the CK flash-attention in [ROCm's flash-attention repo](https://github.com/ROCmSoftwarePlatform/flash-attention). The default is `ae7928c` -- `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1. - -Their values can be passed in when running `docker build` with `--build-arg` options. - -To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default: - -```console -$ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . -``` - -To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below: - -```console -$ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm . -``` - -To run the above docker image `vllm-rocm`, use the below command: - -```console -$ docker run -it \ - --network=host \ - --group-add=video \ - --ipc=host \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --device /dev/kfd \ - --device /dev/dri \ - -v :/app/model \ - vllm-rocm \ - bash -``` - -Where the `` is the location where the model is stored, for example, the weights for llama2 or llama3 models. - -(build-from-source-rocm)= - -## Option 2: Build from source - -0. Install prerequisites (skip if you are already in an environment/docker with the following installed): - -- [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) -- [PyTorch](https://pytorch.org/) - -For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`. - -Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/) - -1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton) - -Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md) - -```console -$ python3 -m pip install ninja cmake wheel pybind11 -$ pip uninstall -y triton -$ git clone https://github.com/OpenAI/triton.git -$ cd triton -$ git checkout e192dba -$ cd python -$ pip3 install . -$ cd ../.. -``` - -```{note} -- If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. -``` - -2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile) - -Install ROCm's flash attention (v2.5.9.post1) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support) -Alternatively, wheels intended for vLLM use can be accessed under the releases. - -For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`. 
- -```console -$ git clone https://github.com/ROCm/flash-attention.git -$ cd flash-attention -$ git checkout 3cea2fb -$ git submodule update --init -$ GPU_ARCHS="gfx90a" python3 setup.py install -$ cd .. -``` - -```{note} -- You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) -``` - -3. Build vLLM. For example, vLLM on ROCM 6.2 can be built with the following steps: - -```bash -$ pip install --upgrade pip - -# Install PyTorch -$ pip uninstall torch -y -$ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2 - -# Build & install AMD SMI -$ pip install /opt/rocm/share/amd_smi - -# Install dependencies -$ pip install --upgrade numba scipy huggingface-hub[cli] -$ pip install "numpy<2" -$ pip install -r requirements-rocm.txt - -# Build vLLM for MI210/MI250/MI300. -$ export PYTORCH_ROCM_ARCH="gfx90a;gfx942" -$ python3 setup.py develop -``` - -This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation. - -```{tip} -- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. -- Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. -- To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. -- The ROCm version of PyTorch, ideally, should match the ROCm driver version. -``` - -```{tip} -- For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level. - For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization). -``` diff --git a/docs/source/getting_started/installation/gpu-cuda.md b/docs/source/getting_started/installation/gpu/cuda.inc.md similarity index 63% rename from docs/source/getting_started/installation/gpu-cuda.md rename to docs/source/getting_started/installation/gpu/cuda.inc.md index 419b8163fc034..4cce65278c069 100644 --- a/docs/source/getting_started/installation/gpu-cuda.md +++ b/docs/source/getting_started/installation/gpu/cuda.inc.md @@ -1,72 +1,52 @@ -(installation-cuda)= +# Installation -# Installation for CUDA - -vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. +vLLM contains pre-compiled C++ and CUDA (12.1) binaries. ## Requirements -- OS: Linux -- Python: 3.9 -- 3.12 - GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) -## Install released versions +## Set up using Python ### Create a new Python environment -You can create a new Python environment using `conda`: - -```console -$ # (Recommended) Create a new conda environment. -$ conda create -n myenv python=3.12 -y -$ conda activate myenv -``` - ```{note} -[PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create Python environment rather than installing packages. In particular, the PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. 
See for more details. -``` - -Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command: - -```console -$ # (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment. -$ uv venv myenv --python 3.12 --seed -$ source myenv/bin/activate +PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See for more details. ``` In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below](#build-from-source) for more details. -### Install vLLM +### Pre-built wheels You can install vLLM using either `pip` or `uv pip`: ```console -$ # Install vLLM with CUDA 12.1. -$ pip install vllm # If you are using pip. -$ uv pip install vllm # If you are using uv. +# Install vLLM with CUDA 12.1. +pip install vllm # If you are using pip. +uv pip install vllm # If you are using uv. ``` As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions: ```console -$ # Install vLLM with CUDA 11.8. -$ export VLLM_VERSION=0.6.1.post1 -$ export PYTHON_VERSION=310 -$ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +# Install vLLM with CUDA 11.8. +export VLLM_VERSION=0.6.1.post1 +export PYTHON_VERSION=310 +pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 ``` (install-the-latest-code)= -## Install the latest code +#### Install the latest code LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`. -### Install the latest code using `pip` +##### Install the latest code using `pip` ```console -$ pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly +pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly ``` `--pre` is required for `pip` to consider pre-released versions. @@ -74,82 +54,65 @@ $ pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly If you want to access the wheels for previous commits (e.g. 
to bisect the behavior change, performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL: ```console -$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -$ pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch +pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl ``` Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. -### Install the latest code using `uv` +##### Install the latest code using `uv` Another way to install the latest code is to use `uv`: ```console -$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly +uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly ``` If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL: ```console -$ export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch -$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} +export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch +uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} ``` The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. -### Install the latest code using `docker` +### Build wheel from source -Another way to access the latest code is to use the docker images: - -```console -$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -$ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} -``` - -These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. - -The latest code can contain bugs and may not be stable. Please use it with caution. - -(build-from-source)= - -## Build from source - -(python-only-build)= - -### Python-only build (without compilation) +#### Set up using Python-only build (without compilation) If you only need to change Python code, you can build and install vLLM without compilation. 
Using `pip`'s [`--editable` flag](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs), changes you make to the code will be reflected when you run vLLM: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ VLLM_USE_PRECOMPILED=1 pip install --editable . +git clone https://github.com/vllm-project/vllm.git +cd vllm +VLLM_USE_PRECOMPILED=1 pip install --editable . ``` -This will download the latest nightly wheel from https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl and use the compiled libraries from there in the installation. +This will download the [latest nightly wheel](https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl) and use the compiled libraries from there in the installation. The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.1.post1 PyPi wheel](https://pypi.org/project/vllm/#files): ```console -$ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl -$ pip install --editable . +export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl +pip install --editable . ``` -You can find more information about vLLM's wheels [above](#install-the-latest-code). +You can find more information about vLLM's wheels in . ```{note} There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. -It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [the section above](#install-the-latest-code) for instructions on how to install a specified wheel. +It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to for instructions on how to install a specified wheel. ``` -### Full build (with compilation) +#### Full build (with compilation) If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ pip install -e . +git clone https://github.com/vllm-project/vllm.git +cd vllm +pip install -e . ``` ```{tip} @@ -162,7 +125,7 @@ As long as `which ccache` command can find the `ccache` binary, it will be used The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`. ``` -#### Use an existing PyTorch installation +##### Use an existing PyTorch installation There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.: @@ -172,32 +135,32 @@ There are scenarios where the PyTorch dependency cannot be easily installed via To build vLLM using an existing PyTorch installation: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ python use_existing_torch.py -$ pip install -r requirements-build.txt -$ pip install -e . 
--no-build-isolation +git clone https://github.com/vllm-project/vllm.git +cd vllm +python use_existing_torch.py +pip install -r requirements-build.txt +pip install -e . --no-build-isolation ``` -#### Use the local cutlass for compilation +##### Use the local cutlass for compilation Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead. To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory. ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . +git clone https://github.com/vllm-project/vllm.git +cd vllm +VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . ``` -#### Troubleshooting +##### Troubleshooting To avoid your system being overloaded, you can limit the number of compilation jobs to be run simultaneously, via the environment variable `MAX_JOBS`. For example: ```console -$ export MAX_JOBS=6 -$ pip install -e . +export MAX_JOBS=6 +pip install -e . ``` This is especially useful when you are building on less powerful machines. For example, when you use WSL it only [assigns 50% of the total memory by default](https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings), so using `export MAX_JOBS=1` can avoid compiling multiple files simultaneously and running out of memory. @@ -206,31 +169,56 @@ A side effect is a much slower build process. Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. ```console -$ # Use `--ipc=host` to make sure the shared memory is large enough. -$ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 +# Use `--ipc=host` to make sure the shared memory is large enough. +docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 ``` If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from [the official website](https://developer.nvidia.com/cuda-toolkit-archive). After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.: ```console -$ export CUDA_HOME=/usr/local/cuda -$ export PATH="${CUDA_HOME}/bin:$PATH" +export CUDA_HOME=/usr/local/cuda +export PATH="${CUDA_HOME}/bin:$PATH" ``` Here is a sanity check to verify that the CUDA Toolkit is correctly installed: ```console -$ nvcc --version # verify that nvcc is in your PATH -$ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME +nvcc --version # verify that nvcc is in your PATH +${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME ``` -### Unsupported OS build +#### Unsupported OS build vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. Simply disable the `VLLM_TARGET_DEVICE` environment variable before installing: ```console -$ export VLLM_TARGET_DEVICE=empty -$ pip install -e . +export VLLM_TARGET_DEVICE=empty +pip install -e . ``` + +## Set up using Docker + +### Pre-built images + +See for instructions on using the official Docker image. 
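+
+For a quick start, a container from the official image can be launched roughly like this (a sketch; the image tag, model name, and cache mount are illustrative):
+
+```console
+# Serve a model with the official OpenAI-compatible image
+docker run --gpus all --ipc=host -p 8000:8000 \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    vllm/vllm-openai:latest --model Qwen/Qwen2.5-1.5B-Instruct
+```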
+ +Another way to access the latest code is to use the docker images: + +```console +export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch +docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} +``` + +These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. + +The latest code can contain bugs and may not be stable. Please use it with caution. + +### Build image from source + +See for instructions on building the Docker image. + +## Supported features + +See compatibility matrix for feature support information. diff --git a/docs/source/getting_started/installation/gpu/index.md b/docs/source/getting_started/installation/gpu/index.md new file mode 100644 index 0000000000000..6c007382b2c3d --- /dev/null +++ b/docs/source/getting_started/installation/gpu/index.md @@ -0,0 +1,300 @@ +# GPU + +vLLM is a Python library that supports the following GPU variants. Select your GPU type to see vendor specific instructions: + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +```{include} rocm.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::{tab-item} XPU +:sync: xpu + +```{include} xpu.inc.md +:start-after: "# Installation" +:end-before: "## Requirements" +``` + +::: + +:::: + +## Requirements + +- OS: Linux +- Python: 3.9 -- 3.12 + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "## Requirements" +:end-before: "## Set up using Python" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +```{include} rocm.inc.md +:start-after: "## Requirements" +:end-before: "## Set up using Python" +``` + +::: + +:::{tab-item} XPU +:sync: xpu + +```{include} xpu.inc.md +:start-after: "## Requirements" +:end-before: "## Set up using Python" +``` + +::: + +:::: + +## Set up using Python + +### Create a new Python environment + +```{include} ../python_env_setup.inc.md +``` + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "## Create a new Python environment" +:end-before: "### Pre-built wheels" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +There is no extra information on creating a new Python environment for this device. + +::: + +:::{tab-item} XPU +:sync: xpu + +There is no extra information on creating a new Python environment for this device. 
+ +::: + +:::: + +### Pre-built wheels + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "### Pre-built wheels" +:end-before: "### Build wheel from source" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +```{include} rocm.inc.md +:start-after: "### Pre-built wheels" +:end-before: "### Build wheel from source" +``` + +::: + +:::{tab-item} XPU +:sync: xpu + +```{include} xpu.inc.md +:start-after: "### Pre-built wheels" +:end-before: "### Build wheel from source" +``` + +::: + +:::: + +(build-from-source)= + +### Build wheel from source + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +```{include} rocm.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::{tab-item} XPU +:sync: xpu + +```{include} xpu.inc.md +:start-after: "### Build wheel from source" +:end-before: "## Set up using Docker" +``` + +::: + +:::: + +## Set up using Docker + +### Pre-built images + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +```{include} rocm.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +``` + +::: + +:::{tab-item} XPU +:sync: xpu + +```{include} xpu.inc.md +:start-after: "### Pre-built images" +:end-before: "### Build image from source" +``` + +::: + +:::: + +### Build image from source + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "### Build image from source" +:end-before: "## Supported features" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +```{include} rocm.inc.md +:start-after: "### Build image from source" +:end-before: "## Supported features" +``` + +::: + +:::{tab-item} XPU +:sync: xpu + +```{include} xpu.inc.md +:start-after: "### Build image from source" +:end-before: "## Supported features" +``` + +::: + +:::: + +## Supported features + +::::{tab-set} +:sync-group: device + +:::{tab-item} CUDA +:sync: cuda + +```{include} cuda.inc.md +:start-after: "## Supported features" +``` + +::: + +:::{tab-item} ROCm +:sync: rocm + +```{include} rocm.inc.md +:start-after: "## Supported features" +``` + +::: + +:::{tab-item} XPU +:sync: xpu + +```{include} xpu.inc.md +:start-after: "## Supported features" +``` + +::: + +:::: diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md new file mode 100644 index 0000000000000..4256027e6c40e --- /dev/null +++ b/docs/source/getting_started/installation/gpu/rocm.inc.md @@ -0,0 +1,166 @@ +# Installation + +vLLM supports AMD GPUs with ROCm 6.2. + +## Requirements + +- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) +- ROCm 6.2 + +## Set up using Python + +### Pre-built wheels + +Currently, there are no pre-built ROCm wheels. + +### Build wheel from source + +0. 
Install prerequisites (skip if you are already in an environment/docker with the following installed):
+
+- [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html)
+- [PyTorch](https://pytorch.org/)
+
+   For installing PyTorch, you can start from a fresh docker image, e.g., `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`.
+
+   Alternatively, you can install PyTorch using PyTorch wheels. You can check the PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/).
+
+1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton)
+
+   Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md)
+
+   ```console
+   python3 -m pip install ninja cmake wheel pybind11
+   pip uninstall -y triton
+   git clone https://github.com/OpenAI/triton.git
+   cd triton
+   git checkout e192dba
+   cd python
+   pip3 install .
+   cd ../..
+   ```
+
+   ```{note}
+   - If you see an HTTP issue related to downloading packages while building Triton, please try again, as the HTTP error is intermittent.
+   ```
+
+2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile)
+
+   Install ROCm's flash attention (v2.5.9.post1) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support).
+   Alternatively, wheels intended for vLLM use can be accessed under the releases.
+
+   For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`.
+
+   ```console
+   git clone https://github.com/ROCm/flash-attention.git
+   cd flash-attention
+   git checkout 3cea2fb
+   git submodule update --init
+   GPU_ARCHS="gfx90a" python3 setup.py install
+   cd ..
+   ```
+
+   ```{note}
+   - You might need to downgrade the "ninja" version to 1.10, as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
+   ```
+
+3. Build vLLM. For example, vLLM on ROCm 6.2 can be built with the following steps:
+
+   ```bash
+   pip install --upgrade pip
+
+   # Install PyTorch
+   pip uninstall torch -y
+   pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/rocm6.2
+
+   # Build & install AMD SMI
+   pip install /opt/rocm/share/amd_smi
+
+   # Install dependencies
+   pip install --upgrade numba scipy huggingface-hub[cli]
+   pip install "numpy<2"
+   pip install -r requirements-rocm.txt
+
+   # Build vLLM for MI210/MI250/MI300.
+   export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
+   python3 setup.py develop
+   ```
+
+   This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
+
+   ```{tip}
+   - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm-up step before collecting perf numbers.
+   - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
+   - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention.
+   - The ROCm version of PyTorch, ideally, should match the ROCm driver version.
+   ```
+
+```{tip}
+- For MI300x (gfx942) users, to achieve optimal performance, please refer to the [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips at the system and workflow level.
+  For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization).
+```
+
+## Set up using Docker
+
+### Pre-built images
+
+Currently, there are no pre-built ROCm images.
+
+### Build image from source
+
+Building the Docker image from source is the recommended way to use vLLM with ROCm.
+
+First, build a docker image from  and launch a docker container from the image.
+It is important that you kick off the docker build using BuildKit: either set `DOCKER_BUILDKIT=1` as an environment variable when calling the `docker build` command, or set up BuildKit in the docker daemon configuration `/etc/docker/daemon.json` as follows and restart the daemon:
+
+```json
+{
+    "features": {
+        "buildkit": true
+    }
+}
+```
+
+ uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches.
+It provides flexibility to customize the build of the docker image using the following arguments:
+
+- `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image.
+- `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For the [Radeon RX 7900 series (gfx1100)](https://rocm.docs.amd.com/projects/radeon/en/latest/index.html), this should be set to 0 until flash-attention supports this target.
+- `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`.
+- `FA_BRANCH`: specifies the branch used to build the CK flash-attention in [ROCm's flash-attention repo](https://github.com/ROCmSoftwarePlatform/flash-attention). The default is `ae7928c`.
+- `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1.
+
+Their values can be passed in when running `docker build` with `--build-arg` options.
+
+To build vLLM on ROCm 6.2 for the MI200 and MI300 series, you can use the default:
+
+```console
+DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
+```
+
+To build vLLM on ROCm 6.2 for the Radeon RX 7900 series (gfx1100), you should specify `BUILD_FA` as below:
+
+```console
+DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
+```
+
+To run the above docker image `vllm-rocm`, use the below command:
+
+```console
+docker run -it \
+   --network=host \
+   --group-add=video \
+   --ipc=host \
+   --cap-add=SYS_PTRACE \
+   --security-opt seccomp=unconfined \
+   --device /dev/kfd \
+   --device /dev/dri \
+   -v <path/to/model>:/app/model \
+   vllm-rocm \
+   bash
+```
+
+Where `<path/to/model>` is the location where the model is stored, for example, the weights for llama2 or llama3 models.
+
+## Supported features
+
+See the compatibility matrix for feature support information.
diff --git a/docs/source/getting_started/installation/xpu.md b/docs/source/getting_started/installation/gpu/xpu.inc.md
similarity index 61%
rename from docs/source/getting_started/installation/xpu.md
rename to docs/source/getting_started/installation/gpu/xpu.inc.md
index c1ab5478eb652..577986eba74fd 100644
--- a/docs/source/getting_started/installation/xpu.md
+++ b/docs/source/getting_started/installation/gpu/xpu.inc.md
@@ -1,26 +1,47 @@
-(installation-xpu)=
-
-# Installation for XPUs
+# Installation

vLLM initially supports basic model inferencing and serving on the Intel GPU platform.

-Table of contents:
-
-1. [Requirements](#xpu-backend-requirements)
-2. [Quick start using Dockerfile](#xpu-backend-quick-start-dockerfile)
-3. [Build from source](#build-xpu-backend-from-source)
-
-(xpu-backend-requirements)=
-
## Requirements

-- OS: Linux
- Supported Hardware: Intel Data Center GPU, Intel ARC GPU
- OneAPI requirements: oneAPI 2024.2

-(xpu-backend-quick-start-dockerfile)=
+## Set up using Python

-## Quick start using Dockerfile
+### Pre-built wheels
+
+Currently, there are no pre-built XPU wheels.
+
+### Build wheel from source
+
+- First, install the required driver and Intel oneAPI 2024.2 or later.
+- Second, install Python packages for vLLM XPU backend building:
+
+```console
+source /opt/intel/oneapi/setvars.sh
+pip install --upgrade pip
+pip install -v -r requirements-xpu.txt
+```
+
+- Finally, build and install vLLM XPU backend:
+
+```console
+VLLM_TARGET_DEVICE=xpu python setup.py install
+```
+
+```{note}
+- FP16 is the default data type in the current XPU backend. The BF16 data
+  type will be supported in the future.
+```
+
+## Set up using Docker
+
+### Pre-built images
+
+Currently, there are no pre-built XPU images.
+
+### Build image from source

```console
$ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
@@ -32,43 +53,19 @@ $ docker run -it \
             vllm-xpu-env
```

-(build-xpu-backend-from-source)=
-
-## Build from source
-
-- First, install required driver and intel OneAPI 2024.2 or later.
-- Second, install Python packages for vLLM XPU backend building:
-
-```console
-$ source /opt/intel/oneapi/setvars.sh
-$ pip install --upgrade pip
-$ pip install -v -r requirements-xpu.txt
-```
-
-- Finally, build and install vLLM XPU backend:
-
-```console
-$ VLLM_TARGET_DEVICE=xpu python setup.py install
-```
-
-```{note}
-- FP16 is the default data type in the current XPU backend. The BF16 data
-  type will be supported in the future.
-```
-
-## Distributed inference and serving
+## Supported features

The XPU platform supports tensor-parallel inference/serving, and also supports pipeline parallel as a beta feature for online serving. Ray is required as the distributed runtime backend. For example, a reference execution looks like the following:

```console
-$ python -m vllm.entrypoints.openai.api_server \
-$ --model=facebook/opt-13b \
-$ --dtype=bfloat16 \
-$ --device=xpu \
-$ --max_model_len=1024 \
-$ --distributed-executor-backend=ray \
-$ --pipeline-parallel-size=2 \
-$ -tp=8
+python -m vllm.entrypoints.openai.api_server \
+  --model=facebook/opt-13b \
+  --dtype=bfloat16 \
+  --device=xpu \
+  --max_model_len=1024 \
+  --distributed-executor-backend=ray \
+  --pipeline-parallel-size=2 \
+  -tp=8
```

By default, a Ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equal to `parallel_config.world_size`. We recommend properly starting a Ray cluster before execution, referring to the helper script; a minimal sketch follows.
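+
+A minimal sketch of starting the Ray cluster beforehand (the head node address and port are illustrative):
+
+```console
+# On the head node
+ray start --head --port=6379
+# On each worker node, pointing at the head node
+ray start --address=192.168.1.10:6379
+```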
diff --git a/docs/source/getting_started/installation/index.md b/docs/source/getting_started/installation/index.md index 0ebadca2ccec9..bc1d268bf0c7e 100644 --- a/docs/source/getting_started/installation/index.md +++ b/docs/source/getting_started/installation/index.md @@ -7,14 +7,7 @@ vLLM supports the following hardware platforms: ```{toctree} :maxdepth: 1 -gpu-cuda -gpu-rocm -cpu-x86 -cpu-arm -cpu-apple -hpu-gaudi -tpu -xpu -openvino -neuron +gpu/index +cpu/index +ai_accelerator/index ``` diff --git a/docs/source/getting_started/installation/python_env_setup.inc.md b/docs/source/getting_started/installation/python_env_setup.inc.md new file mode 100644 index 0000000000000..25cfac5f58aa7 --- /dev/null +++ b/docs/source/getting_started/installation/python_env_setup.inc.md @@ -0,0 +1,19 @@ +You can create a new Python environment using `conda`: + +```console +# (Recommended) Create a new conda environment. +conda create -n myenv python=3.12 -y +conda activate myenv +``` + +```{note} +[PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create Python environment rather than installing packages. +``` + +Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command: + +```console +# (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment. +uv venv myenv --python 3.12 --seed +source myenv/bin/activate +``` diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index ea15d9ef065fa..8ac80e5e5c553 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -5,7 +5,7 @@ This guide will help you quickly get started with vLLM to perform: - [Offline batched inference](#quickstart-offline) -- [Online inference using OpenAI-compatible server](#quickstart-online) +- [Online serving using OpenAI-compatible server](#quickstart-online) ## Prerequisites @@ -19,17 +19,17 @@ If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/ It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands: ```console -$ uv venv myenv --python 3.12 --seed -$ source myenv/bin/activate -$ uv pip install vllm +uv venv myenv --python 3.12 --seed +source myenv/bin/activate +uv pip install vllm ``` You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. ```console -$ conda create -n myenv python=3.12 -y -$ conda activate myenv -$ pip install vllm +conda create -n myenv python=3.12 -y +conda activate myenv +pip install vllm ``` ```{note} @@ -40,7 +40,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in ## Offline Batched Inference -With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). 
See the example script: +With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`: @@ -94,7 +94,7 @@ By default, it starts the server at `http://localhost:8000`. You can specify the Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) model: ```console -$ vllm serve Qwen/Qwen2.5-1.5B-Instruct +vllm serve Qwen/Qwen2.5-1.5B-Instruct ``` ```{note} @@ -105,7 +105,7 @@ You can learn about overriding it [here](#chat-template). This server can be queried in the same format as OpenAI API. For example, to list the models: ```console -$ curl http://localhost:8000/v1/models +curl http://localhost:8000/v1/models ``` You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY` to enable the server to check for API key in the header. @@ -115,14 +115,14 @@ You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY` Once your server is started, you can query the model with input prompts: ```console -$ curl http://localhost:8000/v1/completions \ -$ -H "Content-Type: application/json" \ -$ -d '{ -$ "model": "Qwen/Qwen2.5-1.5B-Instruct", -$ "prompt": "San Francisco is a", -$ "max_tokens": 7, -$ "temperature": 0 -$ }' +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-1.5B-Instruct", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0 + }' ``` Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` Python package: @@ -151,15 +151,15 @@ vLLM is designed to also support the OpenAI Chat Completions API. The chat inter You can use the [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create) endpoint to interact with the model: ```console -$ curl http://localhost:8000/v1/chat/completions \ -$ -H "Content-Type: application/json" \ -$ -d '{ -$ "model": "Qwen/Qwen2.5-1.5B-Instruct", -$ "messages": [ -$ {"role": "system", "content": "You are a helpful assistant."}, -$ {"role": "user", "content": "Who won the world series in 2020?"} -$ ] -$ }' +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-1.5B-Instruct", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Who won the world series in 2020?"} + ] + }' ``` Alternatively, you can use the `openai` Python package: diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md index f5efe0bef7506..1e290d2b4c0bd 100644 --- a/docs/source/getting_started/troubleshooting.md +++ b/docs/source/getting_started/troubleshooting.md @@ -48,6 +48,7 @@ If vLLM crashes and the error trace captures it somewhere around `self.graph.rep To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error. 
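+
+For example, a minimal sketch for online serving (the model name is illustrative; for offline use, pass `enforce_eager=True` to the `LLM` constructor instead):
+
+```console
+# Disable CUDAGraph capture so the failing CUDA operation surfaces directly
+vllm serve Qwen/Qwen2.5-1.5B-Instruct --enforce-eager
+```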
(troubleshooting-incorrect-hardware-driver)=
+
## Incorrect hardware/driver

If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly.

@@ -118,13 +119,13 @@ dist.destroy_process_group()

If you are testing with a single node, adjust `--nproc-per-node` to the number of GPUs you want to use:

```console
-$ NCCL_DEBUG=TRACE torchrun --nproc-per-node=<number-of-GPUs> test.py
+NCCL_DEBUG=TRACE torchrun --nproc-per-node=<number-of-GPUs> test.py
```

If you are testing with multiple nodes, adjust `--nproc-per-node` and `--nnodes` according to your setup and set `MASTER_ADDR` to the correct IP address of the master node, reachable from all nodes. Then, run:

```console
-$ NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py
+NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py
```

If the script runs successfully, you should see the message `sanity check is successful!`.

@@ -141,6 +142,7 @@ Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup
```

(troubleshooting-python-multiprocessing)=
+
## Python multiprocessing

### `RuntimeError` Exception
diff --git a/docs/source/index.md b/docs/source/index.md
index 23e4304fe29d9..d7a1117df9c27 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -1,4 +1,4 @@
-# Welcome to vLLM!
+# Welcome to vLLM

```{figure} ./assets/logos/vllm-logo-text-light.png
:align: center
@@ -23,10 +23,12 @@ vLLM is a fast and easy-to-use library for LLM inference and serving.

+Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
+
vLLM is fast with:

- State-of-the-art serving throughput
-- Efficient management of attention key and value memory with **PagedAttention**
+- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
- Continuous batching of incoming requests
- Fast model execution with CUDA/HIP graph
- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8
@@ -54,6 +56,8 @@ For more information, check out the following:

## Documentation

+% How to start using vLLM?
+
```{toctree}
:caption: Getting Started
:maxdepth: 1

getting_started/troubleshooting
getting_started/faq
```

+% What does vLLM support?
+ ```{toctree} :caption: Models :maxdepth: 1 @@ -75,6 +81,8 @@ models/supported_models models/extensions/index ``` +% Additional capabilities + ```{toctree} :caption: Features :maxdepth: 1 @@ -89,6 +97,8 @@ features/spec_decode features/compatibility_matrix ``` +% Details about running vLLM + ```{toctree} :caption: Inference and Serving :maxdepth: 1 @@ -104,6 +114,8 @@ serving/usage_stats serving/integrations/index ``` +% Scaling up vLLM for production + ```{toctree} :caption: Deployment :maxdepth: 1 @@ -115,6 +127,8 @@ deployment/frameworks/index deployment/integrations/index ``` +% Making the most out of vLLM + ```{toctree} :caption: Performance :maxdepth: 1 @@ -123,28 +137,7 @@ performance/optimization performance/benchmarks ``` -% Community: User community resources - -```{toctree} -:caption: Community -:maxdepth: 1 - -community/meetups -community/sponsors -``` - -```{toctree} -:caption: API Reference -:maxdepth: 2 - -api/offline_inference/index -api/engine/index -api/inference_params -api/multimodal/index -api/model/index -``` - -% Design Documents: Details about vLLM internals +% Explanation of vLLM internals ```{toctree} :caption: Design Documents @@ -154,12 +147,12 @@ design/arch_overview design/huggingface_integration design/plugin_system design/kernel/paged_attention -design/input_processing/model_inputs_index +design/mm_processing design/automatic_prefix_caching design/multiprocessing ``` -% Developer Guide: How to contribute to the vLLM project +% How to contribute to the vLLM project ```{toctree} :caption: Developer Guide @@ -172,7 +165,30 @@ contributing/model/index contributing/vulnerability_management ``` -# Indices and tables +% Technical API specifications + +```{toctree} +:caption: API Reference +:maxdepth: 2 + +api/offline_inference/index +api/engine/index +api/inference_params +api/multimodal/index +api/model/index +``` + +% Latest news and acknowledgements + +```{toctree} +:caption: Community +:maxdepth: 1 + +community/meetups +community/sponsors +``` + +## Indices and tables - {ref}`genindex` - {ref}`modindex` diff --git a/docs/source/models/extensions/runai_model_streamer.md b/docs/source/models/extensions/runai_model_streamer.md index fe2701194a604..75f7a9fcad416 100644 --- a/docs/source/models/extensions/runai_model_streamer.md +++ b/docs/source/models/extensions/runai_model_streamer.md @@ -9,25 +9,25 @@ vLLM supports loading weights in Safetensors format using the Run:ai Model Strea You first need to install vLLM RunAI optional dependency: ```console -$ pip3 install vllm[runai] +pip3 install vllm[runai] ``` To run it as an OpenAI-compatible server, add the `--load-format runai_streamer` flag: ```console -$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer ``` To run model from AWS S3 object store run: ```console -$ vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer +vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer ``` To run model from a S3 compatible object store run: ```console -$ RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer +RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer ``` ## Tunable parameters @@ -38,14 +38,14 @@ You can tune `concurrency` that 
controls the level of concurrency and number of For reading from S3, it will be the number of client instances the host is opening to the S3 server. ```console -$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' ``` You can control the size of the CPU Memory buffer to which tensors are read from the file, and limit this size. You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit). ```console -$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' ``` ```{note} diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index 6228c7c2ac957..e4b4cd03a90d2 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -8,14 +8,14 @@ In vLLM, generative models implement the {class}`~vllm.model_executor.models.Vll Based on the final hidden states of the input, these models output log probabilities of the tokens to generate, which are then passed through {class}`~vllm.model_executor.layers.Sampler` to obtain the final text. +For generative models, the only supported `--task` option is `"generate"`. +Usually, this is automatically inferred so you don't have to specify it. + ## Offline Inference The {class}`~vllm.LLM` class provides various methods for offline inference. See [Engine Arguments](#engine-args) for a list of options when initializing the model. -For generative models, the only supported {code}`task` option is {code}`"generate"`. -Usually, this is automatically inferred so you don't have to specify it. - ### `LLM.generate` The {class}`~vllm.LLM.generate` method is available to all generative models in vLLM. @@ -33,7 +33,7 @@ for output in outputs: ``` You can optionally control the language generation by passing {class}`~vllm.SamplingParams`. 
-For example, you can use greedy sampling by setting {code}`temperature=0`: +For example, you can use greedy sampling by setting `temperature=0`: ```python llm = LLM(model="facebook/opt-125m") @@ -46,7 +46,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -A code example can be found here: +A code example can be found here: ### `LLM.beam_search` @@ -103,7 +103,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -A code example can be found here: +A code example can be found here: If the model doesn't have a chat template or you want to specify another one, you can explicitly pass a chat template: @@ -118,7 +118,7 @@ print("Loaded chat template:", custom_template) outputs = llm.chat(conversation, chat_template=custom_template) ``` -## Online Inference +## Online Serving Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs: diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 3e4407cfdc233..91db694be29a4 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -14,31 +14,54 @@ As shown in the [Compatibility Matrix](#compatibility-matrix), most vLLM feature pooling models as they only work on the generation or decode stage, so performance may not improve as much. ``` +For pooling models, we support the following `--task` options. +The selected option sets the default pooler used to extract the final hidden states: + +```{list-table} +:widths: 50 25 25 25 +:header-rows: 1 + +* - Task + - Pooling Type + - Normalization + - Softmax +* - Embedding (`embed`) + - `LAST` + - ✅︎ + - ✗ +* - Classification (`classify`) + - `LAST` + - ✗ + - ✅︎ +* - Sentence Pair Scoring (`score`) + - \* + - \* + - \* +* - Reward Modeling (`reward`) + - `ALL` + - ✗ + - ✗ +``` + +\*The default pooler is always defined by the model. + +```{note} +If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table. +``` + +When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, +we attempt to override the default pooler based on its Sentence Transformers configuration file (`modules.json`). + +```{tip} +You can customize the model's pooling method via the `--override-pooler-config` option, +which takes priority over both the model's and Sentence Transformers's defaults. +``` + ## Offline Inference The {class}`~vllm.LLM` class provides various methods for offline inference. See [Engine Arguments](#engine-args) for a list of options when initializing the model. -For pooling models, we support the following {code}`task` options: - -- Embedding ({code}`"embed"` / {code}`"embedding"`) -- Classification ({code}`"classify"`) -- Sentence Pair Scoring ({code}`"score"`) -- Reward Modeling ({code}`"reward"`) - -The selected task determines the default {class}`~vllm.model_executor.layers.Pooler` that is used: - -- Embedding: Extract only the hidden states corresponding to the last token, and apply normalization. -- Classification: Extract only the hidden states corresponding to the last token, and apply softmax. -- Sentence Pair Scoring: Extract only the hidden states corresponding to the last token, and apply softmax. -- Reward Modeling: Extract all of the hidden states and return them directly. 
- -When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, -we attempt to override the default pooler based on its Sentence Transformers configuration file ({code}`modules.json`). - -You can customize the model's pooling method via the {code}`override_pooler_config` option, -which takes priority over both the model's and Sentence Transformers's defaults. - ### `LLM.encode` The {class}`~vllm.LLM.encode` method is available to all pooling models in vLLM. @@ -65,7 +88,7 @@ embeds = output.outputs.embedding print(f"Embeddings: {embeds!r} (size={len(embeds)})") ``` -A code example can be found here: +A code example can be found here: ### `LLM.classify` @@ -80,7 +103,7 @@ probs = output.outputs.probs print(f"Class Probabilities: {probs!r} (size={len(probs)})") ``` -A code example can be found here: +A code example can be found here: ### `LLM.score` @@ -102,9 +125,9 @@ score = output.outputs.score print(f"Score: {score}") ``` -A code example can be found here: +A code example can be found here: -## Online Inference +## Online Serving Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs: diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index acbe27a22a679..3da5aaf713c1f 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -45,7 +45,7 @@ Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFace Hub, set an environment variable: ```shell -$ export VLLM_USE_MODELSCOPE=True +export VLLM_USE_MODELSCOPE=True ``` And use with `trust_remote_code=True`. @@ -216,6 +216,11 @@ See [this page](#generative-models) for more information on how to use generativ - `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. - ✅︎ - ✅︎ +* - `InternLM3ForCausalLM` + - InternLM3 + - `internlm/internlm3-8b-instruct`, etc. + - ✅︎ + - ✅︎ * - `JAISLMHeadModel` - Jais - `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. @@ -465,6 +470,11 @@ of the whole prompt are extracted from the normalized hidden state corresponding - `Qwen/Qwen2.5-Math-RM-72B`, etc. - ✅︎ - ✅︎ +* - `Qwen2ForProcessRewardModel` + - Qwen2-based + - `Qwen/Qwen2.5-Math-PRM-7B`, `Qwen/Qwen2.5-Math-PRM-72B`, etc. + - ✅︎ + - ✅︎ ``` If your model is not in the above list, we will try to automatically convert the model using @@ -552,7 +562,7 @@ See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the mod ````{important} To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference) -or `--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt: +or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt: Offline inference: ```python @@ -562,7 +572,7 @@ llm = LLM( ) ``` -Online inference: +Online serving: ```bash vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 ``` @@ -610,6 +620,13 @@ See [this page](#generative-models) for more information on how to use generativ - - ✅︎ - ✅︎ +* - `DeepseekVLV2ForCausalLM` + - DeepSeek-VL2 + - T + I+ + - `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. 
(see note) + - + - ✅︎ + - ✅︎ * - `FuyuForCausalLM` - Fuyu - T + I @@ -742,7 +759,7 @@ See [this page](#generative-models) for more information on how to use generativ - `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. - ✅︎ - ✅︎ - - + - ✅︎ * - `UltravoxModel` - Ultravox - T + AE+ @@ -756,7 +773,11 @@ See [this page](#generative-models) for more information on how to use generativ + Multiple items can be inputted per text prompt for this modality. ```{note} -To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. +To use `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM. +``` + +```{note} +To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. ``` ```{note} @@ -820,19 +841,22 @@ The following table lists those that are tested in vLLM. _________________ -# Model Support Policy +## Model Support Policy At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support: 1. **Community-Driven Support**: We encourage community contributions for adding new models. When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. **Call for contribution:** PRs coming directly from model vendors are greatly appreciated! + 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. -```{tip} -When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. -``` + ```{tip} + When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. + ``` 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. 
Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. + 4. **Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use. + 5. **Selective Focus**: Our resources are primarily directed towards models with significant user interest and impact. Models that are less frequently used may receive less attention, and we rely on the community to play a more active role in their upkeep and improvement. Through this approach, vLLM fosters a collaborative environment where both the core development team and the broader community contribute to the robustness and diversity of the third-party models supported in our ecosystem. diff --git a/docs/source/performance/optimization.md b/docs/source/performance/optimization.md index 4fcde9b03b887..4fbc376e1aa39 100644 --- a/docs/source/performance/optimization.md +++ b/docs/source/performance/optimization.md @@ -8,7 +8,7 @@ Due to the auto-regressive nature of transformer architecture, there are times w The vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes available again. When this occurs, the following warning is printed: -``` +```text WARNING 05-09 00:49:33 scheduler.py:1057 Sequence group 0 is preempted by PreemptionMode.SWAP mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. total_cumulative_preemption_cnt=1 ``` diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md index 4e0a9ef6ecf7d..daf6e2f250416 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/source/serving/distributed_serving.md @@ -35,16 +35,16 @@ output = llm.generate("San Franciso is a") To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: ```console -$ vllm serve facebook/opt-13b \ -$ --tensor-parallel-size 4 + vllm serve facebook/opt-13b \ + --tensor-parallel-size 4 ``` You can also additionally specify `--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism: ```console -$ vllm serve gpt2 \ -$ --tensor-parallel-size 4 \ -$ --pipeline-parallel-size 2 + vllm serve gpt2 \ + --tensor-parallel-size 4 \ + --pipeline-parallel-size 2 ``` ## Running vLLM on multiple nodes @@ -56,21 +56,21 @@ The first step, is to start containers and organize them into a cluster. 
We have provided a helper script, `run_cluster.sh`, to start the cluster. Pick a node as the head node, and run the following command:

```console
-$ bash run_cluster.sh \
-$ vllm/vllm-openai \
-$ ip_of_head_node \
-$ --head \
-$ /path/to/the/huggingface/home/in/this/node
+bash run_cluster.sh \
+    vllm/vllm-openai \
+    ip_of_head_node \
+    --head \
+    /path/to/the/huggingface/home/in/this/node
```

On the rest of the worker nodes, run the following command:

```console
-$ bash run_cluster.sh \
-$ vllm/vllm-openai \
-$ ip_of_head_node \
-$ --worker \
-$ /path/to/the/huggingface/home/in/this/node
+bash run_cluster.sh \
+    vllm/vllm-openai \
+    ip_of_head_node \
+    --worker \
+    /path/to/the/huggingface/home/in/this/node
```

Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct.

@@ -80,16 +80,16 @@ Then, on any node, use `docker exec -it node /bin/bash` to enter the container,
After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:

```console
-$ vllm serve /path/to/the/model/in/the/container \
-$ --tensor-parallel-size 8 \
-$ --pipeline-parallel-size 2
+vllm serve /path/to/the/model/in/the/container \
+    --tensor-parallel-size 8 \
+    --pipeline-parallel-size 2
```

You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 16:

```console
-$ vllm serve /path/to/the/model/in/the/container \
-$ --tensor-parallel-size 16
+vllm serve /path/to/the/model/in/the/container \
+    --tensor-parallel-size 16
```

To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient.
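As a quick sanity check (a sketch only; the exact log format varies across NCCL versions and network setups), you can enable the trace on one of the commands above and filter the output for the transport NCCL selected:

```console
NCCL_DEBUG=TRACE vllm serve /path/to/the/model/in/the/container \
    --tensor-parallel-size 16 2>&1 | grep -E "via NET/(Socket|IB)"
```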
diff --git a/docs/source/serving/integrations/langchain.md b/docs/source/serving/integrations/langchain.md index 49ff6e0c32a72..03142d23b145a 100644 --- a/docs/source/serving/integrations/langchain.md +++ b/docs/source/serving/integrations/langchain.md @@ -7,7 +7,7 @@ vLLM is also available via [LangChain](https://github.com/langchain-ai/langchain To install LangChain, run ```console -$ pip install langchain langchain_community -q +pip install langchain langchain_community -q ``` To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`. diff --git a/docs/source/serving/integrations/llamaindex.md b/docs/source/serving/integrations/llamaindex.md index 9961c181d7e1c..8c72605202cf5 100644 --- a/docs/source/serving/integrations/llamaindex.md +++ b/docs/source/serving/integrations/llamaindex.md @@ -7,7 +7,7 @@ vLLM is also available via [LlamaIndex](https://github.com/run-llama/llama_index To install LlamaIndex, run ```console -$ pip install llama-index-llms-vllm -q +pip install llama-index-llms-vllm -q ``` To run inference on a single or multiple GPUs, use `Vllm` class from `llamaindex`. diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md index e6ded2e6dd465..6c84f6d1350a6 100644 --- a/docs/source/serving/metrics.md +++ b/docs/source/serving/metrics.md @@ -7,7 +7,7 @@ OpenAI compatible API server. You can start the server using Python, or using [Docker](#deployment-docker): ```console -$ vllm serve unsloth/Llama-3.2-1B-Instruct +vllm serve unsloth/Llama-3.2-1B-Instruct ``` Then query the endpoint to get the latest metrics from the server: diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md index 9f5e1b908d786..0213b0a3388ea 100644 --- a/docs/source/serving/multimodal_inputs.md +++ b/docs/source/serving/multimodal_inputs.md @@ -14,7 +14,7 @@ and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/ch To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType`: - `prompt`: The prompt should follow the format that is documented on HuggingFace. -- `multi_modal_data`: This is a dictionary that follows the schema defined in {class}`vllm.multimodal.MultiModalDataDict`. +- `multi_modal_data`: This is a dictionary that follows the schema defined in {class}`vllm.multimodal.inputs.MultiModalDataDict`. ### Image @@ -60,7 +60,7 @@ for o in outputs: print(generated_text) ``` -Full example: +Full example: To substitute multiple images inside the same text prompt, you can pass in a list of images instead: @@ -91,7 +91,7 @@ for o in outputs: print(generated_text) ``` -Full example: +Full example: Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos: @@ -125,13 +125,13 @@ for o in outputs: You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary instead of using multi-image input. -Full example: +Full example: ### Audio You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary. -Full example: +Full example: ### Embedding @@ -199,7 +199,7 @@ for o in outputs: print(generated_text) ``` -## Online Inference +## Online Serving Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). 
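For example (a minimal sketch; the model name matches the server command shown below, and the image URL is a placeholder), an image can be passed as an `image_url` content part:

```bash
curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": "What is in this image?"},
                {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}}
            ]
        }]
    }'
```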
@@ -303,6 +303,7 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model ``` Then, you can use the OpenAI client as follows: + ```python from openai import OpenAI diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md index 79092ab208784..1f5a54f755f13 100644 --- a/docs/source/serving/offline_inference.md +++ b/docs/source/serving/offline_inference.md @@ -31,6 +31,59 @@ Please refer to the above pages for more details about each API. This section lists the most common options for running the vLLM engine. For a full list, refer to the [Engine Arguments](#engine-args) page. +### Model resolution + +vLLM loads HuggingFace-compatible models by inspecting the `architectures` field in `config.json` of the model repository +and finding the corresponding implementation that is registered to vLLM. +Nevertheless, our model resolution may fail for the following reasons: + +- The `config.json` of the model repository lacks the `architectures` field. +- Unofficial repositories refer to a model using alternative names which are not recorded in vLLM. +- The same architecture name is used for multiple models, creating ambiguity as to which model should be loaded. + +In those cases, vLLM may throw an error like: + +```text +Traceback (most recent call last): +... + File "vllm/model_executor/models/registry.py", line xxx, in inspect_model_cls + for arch in architectures: +TypeError: 'NoneType' object is not iterable +``` + +or: + +```text + File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported + raise ValueError( +ValueError: Model architectures [''] are not supported for now. Supported architectures: [...] +``` + +:::{note} +The above error is distinct from the following similar but different error: + +```text + File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported + raise ValueError( +ValueError: Model architectures [''] failed to be inspected. Please check the logs for more details. +``` + +This error means that vLLM failed to import the model file. Usually, it is related to missing dependencies or outdated +binaries in the vLLM build. Please read the logs carefully to determine the real cause of the error. +::: + +To fix this, explicitly specify the model architecture by passing `config.json` overrides to the `hf_overrides` option. +For example: + +```python +model = LLM( + model="cerebras/Cerebras-GPT-1.3B", + hf_overrides={"architectures": ["GPT2LMHeadModel"]}, # GPT-2 +) +``` + +Our [list of supported models](#supported-models) shows the model architectures that are recognized by vLLM. + ### Reducing memory usage Large models might cause your machine to run out of memory (OOM). Here are some options that help alleviate this problem. @@ -64,7 +117,7 @@ Dynamic quantization is also supported via the `quantization` option -- see [her #### Context length and batch size -You can further reduce memory usage by limit the context length of the model (`max_model_len` option) +You can further reduce memory usage by limiting the context length of the model (`max_model_len` option) and the maximum batch size (`max_num_seqs` option). 
```python diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index ec5a367594743..e49bbb06695f8 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -5,11 +5,13 @@ vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more! You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](#deployment-docker): + ```bash vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 ``` To call the server, you can use the [official OpenAI Python client](https://github.com/openai/openai-python), or any other HTTP client. + ```python from openai import OpenAI client = OpenAI( @@ -50,6 +52,7 @@ In addition, we have the following custom APIs: - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`). (chat-template)= + ## Chat Template In order for the language model to support chat protocol, vLLM requires the model to include @@ -71,6 +74,7 @@ vLLM community provides a set of chat templates for popular models. You can find With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies both a `type` and a `text` field. An example is provided below: + ```python completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", @@ -80,7 +84,7 @@ completion = client.chat.completions.create( ) ``` -Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like +Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like `meta-llama/Llama-Guard-3-1B` that expect the content to be formatted according to the OpenAI schema in the request. vLLM provides best-effort support to detect this automatically, which is logged as a string like *"Detected the chat template content format to be..."*, and internally converts incoming requests to match @@ -115,12 +119,12 @@ completion = client.chat.completions.create( ## Extra HTTP Headers Only `X-Request-Id` HTTP request header is supported for now. It can be enabled -with `--enable-request-id-headers`. +with `--enable-request-id-headers`. > Note that enablement of the headers can impact performance significantly at high QPS > rates. We recommend implementing HTTP headers at the router level (e.g. via Istio), > rather than within the vLLM layer for this reason. -> See https://github.com/vllm-project/vllm/pull/11529 for more details. +> See [this PR](https://github.com/vllm-project/vllm/pull/11529) for more details. ```python completion = client.chat.completions.create( @@ -147,6 +151,7 @@ print(completion._request_id) ## CLI Reference (vllm-serve)= + ### `vllm serve` The `vllm serve` command is used to launch the OpenAI-compatible server. @@ -175,7 +180,7 @@ uvicorn-log-level: "info" To use the above config file: ```bash -$ vllm serve SOME_MODEL --config config.yaml +vllm serve SOME_MODEL --config config.yaml ``` ```{note} @@ -186,6 +191,7 @@ The order of priorities is `command line > config file values > defaults`. 
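For example (a sketch; `SOME_MODEL` is a placeholder and `max-model-len` stands in for any key you might set in both places), a flag given on the command line wins over the same key in the config file:

```bash
# Suppose config.yaml contains the line: max-model-len: 2048
# The CLI flag below takes precedence, so the server starts with a
# context length of 4096.
vllm serve SOME_MODEL --config config.yaml --max-model-len 4096
```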
## API Reference (completions-api)= + ### Completions API Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions); @@ -212,6 +218,7 @@ The following extra parameters are supported: ``` (chat-api)= + ### Chat API Our Chat API is compatible with [OpenAI's Chat Completions API](https://platform.openai.com/docs/api-reference/chat); @@ -243,6 +250,7 @@ The following extra parameters are supported: ``` (embeddings-api)= + ### Embeddings API Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings); @@ -284,6 +292,7 @@ For chat-like input (i.e. if `messages` is passed), these extra parameters are s ``` (tokenizer-api)= + ### Tokenizer API Our Tokenizer API is a simple wrapper over [HuggingFace-style tokenizers](https://huggingface.co/docs/transformers/en/main_classes/tokenizer). @@ -293,6 +302,7 @@ It consists of two endpoints: - `/detokenize` corresponds to calling `tokenizer.decode()`. (pooling-api)= + ### Pooling API Our Pooling API encodes input prompts using a [pooling model](../models/pooling_models.md) and returns the corresponding hidden states. @@ -302,6 +312,7 @@ The input format is the same as [Embeddings API](#embeddings-api), but the outpu Code example: (score-api)= + ### Score API Our Score API applies a cross-encoder model to predict scores for sentence pairs. diff --git a/examples/offline_inference/offline_inference_arctic.py b/examples/offline_inference/arctic.py similarity index 100% rename from examples/offline_inference/offline_inference_arctic.py rename to examples/offline_inference/arctic.py diff --git a/examples/offline_inference/offline_inference_audio_language.py b/examples/offline_inference/audio_language.py similarity index 100% rename from examples/offline_inference/offline_inference_audio_language.py rename to examples/offline_inference/audio_language.py diff --git a/examples/offline_inference/offline_inference.py b/examples/offline_inference/basic.py similarity index 100% rename from examples/offline_inference/offline_inference.py rename to examples/offline_inference/basic.py diff --git a/examples/offline_inference/offline_inference_with_default_generation_config.py b/examples/offline_inference/basic_with_model_default_sampling.py similarity index 100% rename from examples/offline_inference/offline_inference_with_default_generation_config.py rename to examples/offline_inference/basic_with_model_default_sampling.py diff --git a/examples/offline_inference/offline_inference_chat.py b/examples/offline_inference/chat.py similarity index 100% rename from examples/offline_inference/offline_inference_chat.py rename to examples/offline_inference/chat.py diff --git a/examples/offline_inference/offline_chat_with_tools.py b/examples/offline_inference/chat_with_tools.py similarity index 100% rename from examples/offline_inference/offline_chat_with_tools.py rename to examples/offline_inference/chat_with_tools.py diff --git a/examples/offline_inference/offline_inference_classification.py b/examples/offline_inference/classification.py similarity index 100% rename from examples/offline_inference/offline_inference_classification.py rename to examples/offline_inference/classification.py diff --git a/examples/offline_inference/offline_inference_cli.py b/examples/offline_inference/cli.py similarity index 100% rename from examples/offline_inference/offline_inference_cli.py rename to examples/offline_inference/cli.py diff --git 
a/examples/offline_inference/offline_inference_distributed.py b/examples/offline_inference/distributed.py similarity index 100% rename from examples/offline_inference/offline_inference_distributed.py rename to examples/offline_inference/distributed.py diff --git a/examples/offline_inference/offline_inference_embedding.py b/examples/offline_inference/embedding.py similarity index 100% rename from examples/offline_inference/offline_inference_embedding.py rename to examples/offline_inference/embedding.py diff --git a/examples/offline_inference/offline_inference_encoder_decoder.py b/examples/offline_inference/encoder_decoder.py similarity index 100% rename from examples/offline_inference/offline_inference_encoder_decoder.py rename to examples/offline_inference/encoder_decoder.py diff --git a/examples/offline_inference/florence2_inference.py b/examples/offline_inference/florence2_inference.py index 49dd2c331db5a..c24096e90004b 100644 --- a/examples/offline_inference/florence2_inference.py +++ b/examples/offline_inference/florence2_inference.py @@ -3,7 +3,7 @@ Demonstrate prompting of text-to-text encoder/decoder models, specifically Florence-2 ''' # TODO(Isotr0py): -# Move to offline_inference/offline_inference_vision_language.py +# Move to offline_inference/vision_language.py # after porting vision backbone from vllm import LLM, SamplingParams diff --git a/examples/offline_inference/gguf_inference.py b/examples/offline_inference/gguf_inference.py index 09a5fcc22e553..aa05c4c0bfaa5 100644 --- a/examples/offline_inference/gguf_inference.py +++ b/examples/offline_inference/gguf_inference.py @@ -3,27 +3,20 @@ from huggingface_hub import hf_hub_download from vllm import LLM, SamplingParams -def run_gguf_inference(model_path): - PROMPT_TEMPLATE = "<|system|>\n{system_message}\n<|user|>\n{prompt}\n<|assistant|>\n" # noqa: E501 - system_message = "You are a friendly chatbot who always responds in the style of a pirate." # noqa: E501 +def run_gguf_inference(model_path, tokenizer): # Sample prompts. prompts = [ "How many helicopters can a human eat in one sitting?", "What's the future of AI?", ] - prompts = [ - PROMPT_TEMPLATE.format(system_message=system_message, prompt=prompt) - for prompt in prompts - ] + prompts = [[{"role": "user", "content": prompt}] for prompt in prompts] # Create a sampling params object. sampling_params = SamplingParams(temperature=0, max_tokens=128) # Create an LLM. - llm = LLM(model=model_path, - tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0", - gpu_memory_utilization=0.95) + llm = LLM(model=model_path, tokenizer=tokenizer) - outputs = llm.generate(prompts, sampling_params) + outputs = llm.chat(prompts, sampling_params) # Print the outputs. 
for output in outputs: prompt = output.prompt @@ -32,7 +25,8 @@ def run_gguf_inference(model_path): if __name__ == "__main__": - repo_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF" - filename = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf" + repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF" + filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf" + tokenizer = "microsoft/Phi-3-medium-4k-instruct" model = hf_hub_download(repo_id, filename=filename) - run_gguf_inference(model) + run_gguf_inference(model, tokenizer) diff --git a/examples/offline_inference/offline_inference_mlpspeculator.py b/examples/offline_inference/mlpspeculator.py similarity index 100% rename from examples/offline_inference/offline_inference_mlpspeculator.py rename to examples/offline_inference/mlpspeculator.py diff --git a/examples/offline_inference/offline_inference_neuron.py b/examples/offline_inference/neuron.py similarity index 100% rename from examples/offline_inference/offline_inference_neuron.py rename to examples/offline_inference/neuron.py diff --git a/examples/offline_inference/offline_inference_neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py similarity index 100% rename from examples/offline_inference/offline_inference_neuron_int8_quantization.py rename to examples/offline_inference/neuron_int8_quantization.py diff --git a/examples/offline_inference/offline_inference_openai/offline_inference_openai.md b/examples/offline_inference/openai/openai_batch.md similarity index 92% rename from examples/offline_inference/offline_inference_openai/offline_inference_openai.md rename to examples/offline_inference/openai/openai_batch.md index 6278a1943fe4a..a4774e57cd9a5 100644 --- a/examples/offline_inference/offline_inference_openai/offline_inference_openai.md +++ b/examples/offline_inference/openai/openai_batch.md @@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format The OpenAI batch file format consists of a series of json objects on new lines. -[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl) +[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai/openai_example_batch.jsonl) Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. @@ -31,13 +31,13 @@ We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints To follow along with this example, you can download the example batch, or create your own batch file in your working directory. 
``` -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this ``` -$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl +$ cat offline_inference/openai/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -49,7 +49,7 @@ The batch running tool is designed to be used from the command line. You can run the batch with the following command, which will write its results to a file called `results.jsonl` ``` -python -m vllm.entrypoints.openai.run_batch -i offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct ``` ### Step 3: Check your results @@ -66,10 +66,10 @@ $ cat results.jsonl The batch runner supports remote input and output urls that are accessible via http/https. -For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl`, you can run +For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl`, you can run ``` -python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct ``` ## Example 3: Integrating with AWS S3 @@ -90,13 +90,13 @@ To integrate with cloud blob storage, we recommend using presigned urls. To follow along with this example, you can download the example batch, or create your own batch file in your working directory. 
```
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl
```

Once you've created your batch file it should look like this

```
-$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
+$ cat offline_inference/openai/openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
```

@@ -104,7 +104,7 @@ $ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
Now upload your batch file to your S3 bucket.

```
-aws s3 cp offline_inference/offline_inference_openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
+aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
```

### Step 2: Generate your presigned urls
diff --git a/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl b/examples/offline_inference/openai/openai_example_batch.jsonl
similarity index 100%
rename from examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
rename to examples/offline_inference/openai/openai_example_batch.jsonl
diff --git a/examples/offline_inference/offline_inference_pixtral.py b/examples/offline_inference/pixtral.py
similarity index 100%
rename from examples/offline_inference/offline_inference_pixtral.py
rename to examples/offline_inference/pixtral.py
diff --git a/examples/offline_inference/offline_inference_with_prefix.py b/examples/offline_inference/prefix_caching.py
similarity index 100%
rename from examples/offline_inference/offline_inference_with_prefix.py
rename to examples/offline_inference/prefix_caching.py
diff --git a/examples/offline_inference/offline_profile.py b/examples/offline_inference/profiling.py
similarity index 99%
rename from examples/offline_inference/offline_profile.py
rename to examples/offline_inference/profiling.py
index 187a05e4d70a2..8a94b5c2a8623 100644
--- a/examples/offline_inference/offline_profile.py
+++ b/examples/offline_inference/profiling.py
@@ -363,7 +363,7 @@ Profile a model example:
```
-    python examples/offline_inference/offline_profile.py \\
+    python examples/offline_inference/profiling.py \\
        --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
        --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
        --enforce-eager run_num_steps -n 2
diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py
new file mode 100644
index 0000000000000..3bc303dad277f
--- /dev/null
+++ b/examples/offline_inference/rlhf.py
@@ -0,0 +1,191 @@
+"""
+A simple demonstration of RLHF with vLLM, inspired by
+the OpenRLHF framework (https://github.com/OpenRLHF/OpenRLHF).
+It follows the design that training processes and inference processes
+are separate, and they live on different GPUs.
+Training processes send prompts to inference processes to generate data, +and also synchronize the weights of the model by broadcasting the weights +from the training process to the inference process. +Note that this is a simple demonstration of one training instance and one +inference instance. In practice, there could be multiple training instances +and multiple inference instances. For the full implementation, please refer +to the OpenRLHF framework. +""" +import os + +import ray +import torch +from ray.util.placement_group import placement_group +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy +from transformers import AutoModelForCausalLM + +from vllm import LLM, SamplingParams, configure_as_vllm_process +from vllm.utils import get_ip, get_open_port +from vllm.worker.worker import Worker + + +def stateless_init_process_group(master_address, master_port, rank, world_size, + device): + """ + vLLM provides `StatelessProcessGroup` to create a process group + without considering the global process group in torch.distributed. + It is recommended to create `StatelessProcessGroup`, and then initialize + the data-plane communication (NCCL) between external (train processes) + and vLLM workers. + """ + from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator + from vllm.distributed.utils import StatelessProcessGroup + pg = StatelessProcessGroup.create(host=master_address, + port=master_port, + rank=rank, + world_size=world_size) + pynccl = PyNcclCommunicator(pg, device=device) + return pynccl + + +class MyWorker(Worker): + """ + The `MyWorker` class inherits from `Worker` to provide custom functions. + For simplicity, we define the `MyWorker` class in this self-contained + script. Normally, we should define the `MyWorker` class in a separate + file and pass the qualified name of the class to the `worker_cls` + parameter. + """ + + def init_weight_update_group(self, master_address, master_port, + rank_offset, world_size): + from vllm.distributed.parallel_state import get_world_group + rank = get_world_group().rank + rank_offset + self.model_update_group = stateless_init_process_group( + master_address, + master_port, + rank, + world_size, + self.device, + ) + + def update_weight(self, name, dtype, shape): + weight = torch.empty(shape, dtype=dtype, device="cuda") + self.model_update_group.broadcast(weight, + src=0, + stream=torch.cuda.current_stream()) + + self.model_runner.model.load_weights(weights=[(name, weight)]) + + del weight + + def check_weights_changed(self): + """ + Check if the weights are updated to 0. + """ + weights_updated = True + for name, p in self.model_runner.model.named_parameters(): + weights_updated = weights_updated and torch.allclose( + p, torch.zeros_like(p)) + return weights_updated + + +class MyLLM(LLM): + + def __init__(self, *args, **kwargs): + # a hack to make the script work. + # stop ray from manipulating CUDA_VISIBLE_DEVICES + # at the top-level + del os.environ["CUDA_VISIBLE_DEVICES"] + super().__init__(*args, **kwargs) + + +""" +Start the training process, here we use huggingface transformers +as an example to hold a model on GPU 0. + +It is important for all the processes outside of vLLM to call +`configure_as_vllm_process` to set some common environment variables +the same as vLLM workers. +""" +configure_as_vllm_process() + +train_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") +train_model.to("cuda:0") +""" +Start the inference process, here we use vLLM to hold a model on GPU 1 and +GPU 2. 
For the details on how to use ray, please refer to the ray +documentation https://docs.ray.io/en/latest/ . +""" +os.environ["CUDA_VISIBLE_DEVICES"] = "1,2" +ray.init() + +pg_inference = placement_group([{"GPU": 1, "CPU": 0}] * 2) +ray.get(pg_inference.ready()) +scheduling_inference = PlacementGroupSchedulingStrategy( + placement_group=pg_inference, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=0, +) +""" +launch the vLLM inference engine. +here we use `enforce_eager` to reduce the start time. +""" +llm = ray.remote( + num_cpus=0, + num_gpus=0, + scheduling_strategy=scheduling_inference, +)(MyLLM).remote( + model="facebook/opt-125m", + enforce_eager=True, + worker_cls=MyWorker, + tensor_parallel_size=2, + distributed_executor_backend="ray", +) + +# Generate texts from the prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +sampling_params = SamplingParams(temperature=0) + +outputs = ray.get(llm.generate.remote(prompts, sampling_params)) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") + +# set up the communication between the training process +# and the inference engine. +master_address = get_ip() +master_port = get_open_port() + +handle = llm.collective_rpc.remote("init_weight_update_group", + args=(master_address, master_port, 1, 3)) +model_update_group = stateless_init_process_group(master_address, master_port, + 0, 3, torch.device("cuda:0")) +ray.get(handle) + +# simulate training, modify the weights of the model. +for name, p in train_model.named_parameters(): + p.data.zero_() + +# sync weight from the training process to the inference engine. +for name, p in train_model.named_parameters(): + handle = llm.collective_rpc.remote("update_weight", + args=(name, p.dtype, p.shape)) + model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream()) + ray.get(handle) + +# check if the weights are updated. +assert all(ray.get(llm.collective_rpc.remote("check_weights_changed"))) + +# use the updated model to generate texts, they will be nonsense +# because the weights are all zeros. 
+outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
+for output in outputs_updated:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, "
+          f"Generated text: {generated_text!r}")
diff --git a/examples/offline_inference/offline_inference_scoring.py b/examples/offline_inference/scoring.py
similarity index 100%
rename from examples/offline_inference/offline_inference_scoring.py
rename to examples/offline_inference/scoring.py
diff --git a/examples/offline_inference/offline_inference_with_profiler.py b/examples/offline_inference/simple_profiling.py
similarity index 100%
rename from examples/offline_inference/offline_inference_with_profiler.py
rename to examples/offline_inference/simple_profiling.py
diff --git a/examples/offline_inference/offline_inference_structured_outputs.py b/examples/offline_inference/structured_outputs.py
similarity index 100%
rename from examples/offline_inference/offline_inference_structured_outputs.py
rename to examples/offline_inference/structured_outputs.py
diff --git a/examples/offline_inference/torchrun_example.py b/examples/offline_inference/torchrun_example.py
new file mode 100644
index 0000000000000..b6de73eb7266e
--- /dev/null
+++ b/examples/offline_inference/torchrun_example.py
@@ -0,0 +1,64 @@
+"""
+Experimental support for tensor-parallel inference with torchrun;
+see https://github.com/vllm-project/vllm/issues/11400 for
+the motivation and use case for this example.
+Run the script with `torchrun --nproc-per-node=2 torchrun_example.py`;
+the argument 2 should match the `tensor_parallel_size` below.
+See `tests/distributed/test_torchrun_example.py` for the unit test.
+"""
+
+from vllm import LLM, SamplingParams
+
+# Create prompts, the same across all ranks
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+
+# Create sampling parameters, the same across all ranks
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+# Use `distributed_executor_backend="external_launcher"` so that
+# this llm engine/instance only creates one worker.
+llm = LLM(
+    model="facebook/opt-125m",
+    tensor_parallel_size=2,
+    distributed_executor_backend="external_launcher",
+)
+
+outputs = llm.generate(prompts, sampling_params)
+
+# All ranks will have the same outputs
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, "
+          f"Generated text: {generated_text!r}")
+"""
+Further tips:
+
+1. To communicate control messages across all ranks, use the CPU group,
+a PyTorch ProcessGroup with the GLOO backend.
+
+```python
+import torch.distributed as dist
+from vllm.distributed.parallel_state import get_world_group
+
+cpu_group = get_world_group().cpu_group
+torch_rank = dist.get_rank(group=cpu_group)
+if torch_rank == 0:
+    # do something for rank 0, e.g. saving the results to disk.
+    ...
+```
+
+2. To communicate data across all ranks, use the model's device group,
+a PyTorch ProcessGroup with the NCCL backend.
+```python
+from vllm.distributed.parallel_state import get_world_group
+device_group = get_world_group().device_group
+```
+
+3. To access the model directly in every rank, use the following code:
+```python
+llm.llm_engine.model_executor.driver_worker.worker.model_runner.model
+```
+"""
diff --git a/examples/offline_inference/offline_inference_tpu.py b/examples/offline_inference/tpu.py
similarity index 100%
rename from examples/offline_inference/offline_inference_tpu.py
rename to examples/offline_inference/tpu.py
diff --git a/examples/offline_inference/offline_inference_vision_language.py b/examples/offline_inference/vision_language.py
similarity index 97%
rename from examples/offline_inference/offline_inference_vision_language.py
rename to examples/offline_inference/vision_language.py
index b51bfae455267..69228bbf22949 100644
--- a/examples/offline_inference/offline_inference_vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -66,6 +66,23 @@ def run_chameleon(question: str, modality: str):
    return llm, prompt, stop_token_ids
+# Deepseek-VL2
+def run_deepseek_vl2(question: str, modality: str):
+    assert modality == "image"
+
+    model_name = "deepseek-ai/deepseek-vl2-tiny"
+
+    llm = LLM(model=model_name,
+              max_model_len=4096,
+              max_num_seqs=2,
+              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+              hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]})
+
+    prompt = f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
# Fuyu
def run_fuyu(question: str, modality: str):
    assert modality == "image"
@@ -308,7 +325,6 @@ def run_mllama(question: str, modality: str):
        model=model_name,
        max_model_len=4096,
        max_num_seqs=16,
-        enforce_eager=True,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
@@ -498,6 +514,7 @@ model_example_map = {
    "aria": run_aria,
    "blip-2": run_blip2,
    "chameleon": run_chameleon,
+    "deepseek_vl_v2": run_deepseek_vl2,
    "fuyu": run_fuyu,
    "glm4v": run_glm4v,
    "h2ovl_chat": run_h2ovl,
diff --git a/examples/offline_inference/offline_inference_vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py
similarity index 100%
rename from examples/offline_inference/offline_inference_vision_language_embedding.py
rename to examples/offline_inference/vision_language_embedding.py
diff --git a/examples/offline_inference/offline_inference_vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
similarity index 95%
rename from examples/offline_inference/offline_inference_vision_language_multi_image.py
rename to examples/offline_inference/vision_language_multi_image.py
index cf2e90a325c6a..43c44fa867e0a 100644
--- a/examples/offline_inference/offline_inference_vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -54,6 +54,28 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData:
    )
+def load_deepseek_vl2(question: str, image_urls: List[str]):
+    model_name = "deepseek-ai/deepseek-vl2-tiny"
+
+    llm = LLM(model=model_name,
+              max_model_len=4096,
+              max_num_seqs=2,
+              hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
+              limit_mm_per_prompt={"image": len(image_urls)})
+
+    placeholder = "".join(f"image_{i}:<image>\n"
+                          for i, _ in enumerate(image_urls, start=1))
+    prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:"
+
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=None,
+        image_data=[fetch_image(url) for url in image_urls],
+        chat_template=None,
+    )
+
+
def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData:
    model_name =
"h2oai/h2ovl-mississippi-2b" @@ -164,7 +186,6 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData: model=model_name, max_model_len=4096, max_num_seqs=16, - enforce_eager=True, limit_mm_per_prompt={"image": len(image_urls)}, ) @@ -372,6 +393,7 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: model_example_map = { "aria": load_aria, + "deepseek_vl_v2": load_deepseek_vl2, "h2ovl_chat": load_h2onvl, "idefics3": load_idefics3, "internvl_chat": load_internvl, diff --git a/examples/offline_inference/offline_inference_whisper.py b/examples/offline_inference/whisper.py similarity index 100% rename from examples/offline_inference/offline_inference_whisper.py rename to examples/offline_inference/whisper.py diff --git a/examples/online_serving/disaggregated_prefill.sh b/examples/online_serving/disaggregated_prefill.sh index 87155273a81d1..2bb2824c6c86f 100644 --- a/examples/online_serving/disaggregated_prefill.sh +++ b/examples/online_serving/disaggregated_prefill.sh @@ -3,6 +3,8 @@ # We will launch 2 vllm instances (1 for prefill and 1 for decode), # and then transfer the KV cache between them. +set -xe + echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧" sleep 1 @@ -69,7 +71,7 @@ wait_for_server 8200 # instance # NOTE: the usage of this API is subject to change --- in the future we will # introduce "vllm connect" to connect between prefill and decode instances -python3 ../benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py & +python3 ../../benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py & sleep 1 # serve two example requests diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py index 213d075542e81..03cc037bb6779 100644 --- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -1,5 +1,5 @@ """An example showing how to use vLLM to serve multimodal models -and run online inference with OpenAI client. +and run online serving with OpenAI client. 
Launch the vLLM server with the following command: @@ -309,7 +309,7 @@ def main(args) -> None: if __name__ == "__main__": parser = FlexibleArgumentParser( - description='Demo on using OpenAI client for online inference with ' + description='Demo on using OpenAI client for online serving with ' 'multimodal language models served with vLLM.') parser.add_argument('--chat-type', '-c', diff --git a/examples/template_deepseek_vl2.jinja b/examples/template_deepseek_vl2.jinja new file mode 100644 index 0000000000000..fbf3d320094d5 --- /dev/null +++ b/examples/template_deepseek_vl2.jinja @@ -0,0 +1,23 @@ +{%- if messages[0]['role'] == 'system' -%} + {%- set system_message = messages[0]['content'] -%} + {%- set messages = messages[1:] -%} +{%- else -%} + {% set system_message = '' -%} +{%- endif -%} + +{{ bos_token + system_message }} +{%- for message in messages -%} + {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} + {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} + {%- endif -%} + + {%- if message['role'] == 'user' -%} + {{ '<|User|>: ' + message['content'] + '\n' }} + {%- elif message['role'] == 'assistant' -%} + {{ '<|Assistant|>: ' + message['content'] + eos_token + '\n' }} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {{ '<|Assistant|>: ' }} +{% endif %} diff --git a/format.sh b/format.sh index 0b196de9d0773..4bcd0be0c96e5 100755 --- a/format.sh +++ b/format.sh @@ -1,321 +1,5 @@ -#!/usr/bin/env bash -# YAPF formatter, adapted from ray and skypilot. -# -# Usage: -# # Do work and commit your work. +#!/bin/bash -# # Format files that differ from origin/main. -# bash format.sh - -# # Commit changed files with message 'Run yapf and ruff' -# -# -# YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase. -# You are encouraged to run this locally before pushing changes for review. - -# Cause the script to exit if a single command fails -set -eo pipefail - -# this stops git rev-parse from failing if we run this from the .git directory -builtin cd "$(dirname "${BASH_SOURCE:-$0}")" -ROOT="$(git rev-parse --show-toplevel)" -builtin cd "$ROOT" || exit 1 - -check_command() { - if ! command -v "$1" &> /dev/null; then - echo "❓❓$1 is not installed, please run \`pip install -r requirements-lint.txt\`" - exit 1 - fi -} - -check_command yapf -check_command ruff -check_command mypy -check_command codespell -check_command isort -check_command clang-format - -YAPF_VERSION=$(yapf --version | awk '{print $2}') -RUFF_VERSION=$(ruff --version | awk '{print $2}') -MYPY_VERSION=$(mypy --version | awk '{print $2}') -CODESPELL_VERSION=$(codespell --version) -ISORT_VERSION=$(isort --vn) -CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') -SPHINX_LINT_VERSION=$(sphinx-lint --version | awk '{print $2}') - -# # params: tool name, tool version, required version -tool_version_check() { - expected=$(grep "$1" requirements-lint.txt | cut -d'=' -f3) - if [[ "$2" != "$expected" ]]; then - echo "❓❓Wrong $1 version installed: $expected is required, not $2." 
- exit 1 - fi -} - -tool_version_check "yapf" "$YAPF_VERSION" -tool_version_check "ruff" "$RUFF_VERSION" -tool_version_check "mypy" "$MYPY_VERSION" -tool_version_check "isort" "$ISORT_VERSION" -tool_version_check "codespell" "$CODESPELL_VERSION" -tool_version_check "clang-format" "$CLANGFORMAT_VERSION" -tool_version_check "sphinx-lint" "$SPHINX_LINT_VERSION" - -YAPF_FLAGS=( - '--recursive' - '--parallel' -) - -YAPF_EXCLUDES=( - '--exclude' 'build/**' -) - -# Format specified files -format() { - yapf --in-place "${YAPF_FLAGS[@]}" "$@" -} - -# Format files that differ from main branch. Ignores dirs that are not slated -# for autoformat yet. -format_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause yapf to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - - if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ - yapf --in-place "${YAPF_EXCLUDES[@]}" "${YAPF_FLAGS[@]}" - fi - -} - -# Format all files -format_all() { - yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" . -} - -## This flag formats individual files. --files *must* be the first command line -## arg to use this option. -if [[ "$1" == '--files' ]]; then - format "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is formatted. -elif [[ "$1" == '--all' ]]; then - format_all -else - # Format only the files that changed in last commit. - format_changed -fi -echo 'vLLM yapf: Done' - -# Run mypy -echo 'vLLM mypy:' -tools/mypy.sh -echo 'vLLM mypy: Done' - - -# If git diff returns a file that is in the skip list, the file may be checked anyway: -# https://github.com/codespell-project/codespell/issues/1915 -# Avoiding the "./" prefix and using "/**" globs for directories appears to solve the problem -CODESPELL_EXCLUDES=( - '--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**' -) - -# check spelling of specified files -spell_check() { - codespell "$@" -} - -spell_check_all(){ - codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" -} - -# Spelling check of files that differ from main branch. -spell_check_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause ruff to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - codespell "${CODESPELL_EXCLUDES[@]}" - fi -} - -# Run Codespell -## This flag runs spell check of individual files. --files *must* be the first command line -## arg to use this option. -if [[ "$1" == '--files' ]]; then - spell_check "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is linted. -elif [[ "$1" == '--all' ]]; then - spell_check_all -else - # Check spelling only of the files that changed in last commit. 
- spell_check_changed -fi -echo 'vLLM codespell: Done' - - -# Lint specified files -lint() { - ruff check "$@" -} - -# Lint files that differ from main branch. Ignores dirs that are not slated -# for autolint yet. -lint_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause ruff to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - - if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - ruff check - fi - -} - -# Run Ruff -### This flag lints individual files. --files *must* be the first command line -### arg to use this option. -if [[ "$1" == '--files' ]]; then - lint "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is linted. -elif [[ "$1" == '--all' ]]; then - lint vllm tests -else - # Format only the files that changed in last commit. - lint_changed -fi -echo 'vLLM ruff: Done' - -# check spelling of specified files -isort_check() { - isort "$@" -} - -isort_check_all(){ - isort . -} - -# Spelling check of files that differ from main branch. -isort_check_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause ruff to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - - if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - isort - fi -} - -# Run Isort -# This flag runs spell check of individual files. --files *must* be the first command line -# arg to use this option. -if [[ "$1" == '--files' ]]; then - isort_check "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is linted. -elif [[ "$1" == '--all' ]]; then - isort_check_all -else - # Check spelling only of the files that changed in last commit. - isort_check_changed -fi -echo 'vLLM isort: Done' - -# Clang-format section -# Exclude some files for formatting because they are vendored -# NOTE: Keep up to date with .github/workflows/clang-format.yml -CLANG_FORMAT_EXCLUDES=( - 'csrc/moe/topk_softmax_kernels.cu' - 'csrc/quantization/gguf/ggml-common.h' - 'csrc/quantization/gguf/dequantize.cuh' - 'csrc/quantization/gguf/vecdotq.cuh' - 'csrc/quantization/gguf/mmq.cuh' - 'csrc/quantization/gguf/mmvq.cuh' -) - -# Format specified files with clang-format -clang_format() { - clang-format -i "$@" -} - -# Format files that differ from main branch with clang-format. -clang_format_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause clang-format to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that - # exist on both branches. 
- MERGEBASE="$(git merge-base origin/main HEAD)"
-
-    # Get the list of changed files, excluding the specified ones
-    changed_files=$(git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.h' '*.cpp' '*.cu' '*.cuh' | (grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") || echo -e))
-    if [ -n "$changed_files" ]; then
-        echo "$changed_files" | xargs -P 5 clang-format -i
-    fi
-}
-
-# Format all files with clang-format
-clang_format_all() {
-    find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
-        | grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") \
-        | xargs clang-format -i
-}
-
-# Run clang-format
-if [[ "$1" == '--files' ]]; then
-    clang_format "${@:2}"
-elif [[ "$1" == '--all' ]]; then
-    clang_format_all
-else
-    clang_format_changed
-fi
-echo 'vLLM clang-format: Done'
-
-echo 'vLLM actionlint:'
-tools/actionlint.sh -color
-echo 'vLLM actionlint: Done'
-
-echo 'vLLM shellcheck:'
-tools/shellcheck.sh
-echo 'vLLM shellcheck: Done'
-
-echo 'excalidraw png check:'
-tools/png-lint.sh
-echo 'excalidraw png check: Done'
-
-if ! git diff --quiet &>/dev/null; then
-    echo
-    echo "🔍🔍There are files changed by the format checker or by you that are not added and committed:"
-    git --no-pager diff --name-only
-    echo "🔍🔍Format checker passed, but please add, commit and push all the files above to include changes made by the format checker."
-
-    exit 1
-else
-    echo "✨🎉 Format check passed! Congratulations! 🎉✨"
-fi
-
-echo 'vLLM sphinx-lint:'
-tools/sphinx-lint.sh
-echo 'vLLM sphinx-lint: Done'
+echo "The vLLM linting system has been moved from format.sh to pre-commit hooks."
+echo "Please run 'pip install -r requirements-lint.txt' and 'pre-commit install' to install the pre-commit hook."
+echo "Then linters will run automatically before each commit."
diff --git a/pyproject.toml b/pyproject.toml
index 0ac3f39ef7a5f..8f2e20d0f5800 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,6 +15,11 @@ build-backend = "setuptools.build_meta"
 [tool.setuptools_scm]
 # version_file = "vllm/_version.py" # currently handled by `setup.py:get_version()`
 
+[tool.yapfignore]
+ignore_patterns = [
+    "build/**",
+]
+
 [tool.ruff]
 # Allow lines to be as long as 80.
line-length = 80 @@ -52,6 +57,9 @@ ignore = [ "B007", # f-string format "UP032", + # Python 3.8 typing + "UP006", "UP035", + ] [tool.mypy] @@ -101,3 +109,9 @@ markers = [ "skip_v1: do not run this test with v1", "optional: optional tests that are automatically skipped, include --optional to run them", ] + +[tool.pymarkdown] +plugins.md013.enabled = false # line-length +plugins.md041.enabled = false # first-line-h1 +plugins.md033.enabled = false # inline-html +plugins.md024.allow_different_nesting = true # no-duplicate-headers diff --git a/requirements-lint.txt b/requirements-lint.txt index 711bb50a0e936..62446f94048df 100644 --- a/requirements-lint.txt +++ b/requirements-lint.txt @@ -1,15 +1,2 @@ # formatting -yapf==0.32.0 -toml==0.10.2 -tomli==2.0.2 -ruff==0.6.5 -codespell==2.3.0 -isort==5.13.2 -clang-format==18.1.5 -sphinx-lint==1.0.0 - -# type checking -mypy==1.11.1 -types-PyYAML -types-requests -types-setuptools +pre-commit==4.0.1 diff --git a/requirements-test.in b/requirements-test.in index 4b4dc376d1fa5..bc76a91ad5356 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -29,4 +29,7 @@ lm-eval[api]==0.4.4 # required for model evaluation test bitsandbytes>=0.45.0 buildkite-test-collector==0.1.9 +genai_perf==0.0.8 +tritonclient==2.51.0 + numpy < 2.0.0 diff --git a/requirements-test.txt b/requirements-test.txt index f576e42afcbbf..09e009c2e21f4 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -37,7 +37,7 @@ audioread==3.0.1 # via librosa awscli==1.35.23 # via -r requirements-test.in -bitsandbytes>=0.45.0 +bitsandbytes==0.45.0 # via -r requirements-test.in black==24.10.0 # via datamodel-code-generator @@ -75,6 +75,8 @@ colorama==0.4.6 # tqdm-multiprocess contourpy==1.3.0 # via matplotlib +cramjam==2.9.0 + # via fastparquet cupy-cuda12x==13.3.0 # via ray cycler==0.12.1 @@ -109,6 +111,8 @@ email-validator==2.2.0 # via pydantic evaluate==0.4.3 # via lm-eval +fastparquet==2024.11.0 + # via genai-perf fastrlock==0.8.2 # via cupy-cuda12x filelock==3.16.1 @@ -130,8 +134,11 @@ fsspec[http]==2024.9.0 # via # datasets # evaluate + # fastparquet # huggingface-hub # torch +genai-perf==0.0.8 + # via -r requirements-test.in genson==1.3.0 # via datamodel-code-generator h11==0.14.0 @@ -186,6 +193,8 @@ jsonschema==4.23.0 # ray jsonschema-specifications==2024.10.1 # via jsonschema +kaleido==0.2.1 + # via genai-perf kiwisolver==1.4.7 # via matplotlib lazy-loader==0.4 @@ -200,6 +209,8 @@ lm-eval[api]==0.4.4 # via -r requirements-test.in lxml==5.3.0 # via sacrebleu +markdown-it-py==3.0.0 + # via rich markupsafe==3.0.2 # via jinja2 matplotlib==3.9.2 @@ -209,6 +220,8 @@ mbstrdecoder==1.1.3 # dataproperty # pytablewriter # typepy +mdurl==0.1.2 + # via markdown-it-py mistral-common[opencv]==1.5.1 # via # -r requirements-test.in @@ -249,6 +262,8 @@ numpy==1.26.4 # datasets # decord # evaluate + # fastparquet + # genai-perf # librosa # matplotlib # mistral-common @@ -256,15 +271,18 @@ numpy==1.26.4 # numexpr # opencv-python-headless # pandas + # patsy # peft # rouge-score # sacrebleu # scikit-learn # scipy # soxr + # statsmodels # tensorizer # torchvision # transformers + # tritonclient nvidia-cublas-cu12==12.4.5.8 # via # nvidia-cudnn-cu12 @@ -306,30 +324,39 @@ packaging==24.1 # datamodel-code-generator # datasets # evaluate + # fastparquet # huggingface-hub # lazy-loader # matplotlib # peft + # plotly # pooch # pytest # pytest-rerunfailures # ray + # statsmodels # transformers # typepy pandas==2.2.3 # via # datasets # evaluate + # fastparquet + # genai-perf + # statsmodels 
pathspec==0.12.1 # via black pathvalidate==3.2.1 # via pytablewriter +patsy==1.0.1 + # via statsmodels peft==0.13.2 # via # -r requirements-test.in # lm-eval pillow==10.4.0 # via + # genai-perf # matplotlib # mistral-common # sentence-transformers @@ -338,6 +365,8 @@ platformdirs==4.3.6 # via # black # pooch +plotly==5.24.1 + # via genai-perf pluggy==1.5.0 # via pytest pooch==1.8.2 @@ -360,7 +389,9 @@ psutil==6.1.0 py==1.11.0 # via pytest-forked pyarrow==18.0.0 - # via datasets + # via + # datasets + # genai-perf pyasn1==0.6.1 # via rsa pybind11==2.13.6 @@ -373,6 +404,8 @@ pydantic[email]==2.9.2 # mistral-common pydantic-core==2.23.4 # via pydantic +pygments==2.18.0 + # via rich pyparsing==3.2.0 # via matplotlib pytablewriter==1.2.0 @@ -381,14 +414,18 @@ pytest==8.3.3 # via # -r requirements-test.in # buildkite-test-collector + # genai-perf # pytest-asyncio # pytest-forked + # pytest-mock # pytest-rerunfailures # pytest-shard pytest-asyncio==0.24.0 # via -r requirements-test.in pytest-forked==1.6.0 # via -r requirements-test.in +pytest-mock==3.14.0 + # via genai-perf pytest-rerunfailures==14.0 # via -r requirements-test.in pytest-shard==0.1.2 @@ -399,6 +436,8 @@ python-dateutil==2.9.0.post0 # matplotlib # pandas # typepy +python-rapidjson==1.20 + # via tritonclient pytz==2024.2 # via # pandas @@ -409,9 +448,11 @@ pyyaml==6.0.2 # awscli # datamodel-code-generator # datasets + # genai-perf # huggingface-hub # peft # ray + # responses # timm # transformers ray[adag]==2.40.0 @@ -438,8 +479,13 @@ requests==2.32.3 # mistral-common # pooch # ray + # responses # tiktoken # transformers +responses==0.25.3 + # via genai-perf +rich==13.9.4 + # via genai-perf rouge-score==0.1.2 # via lm-eval rpds-py==0.20.1 @@ -470,6 +516,7 @@ scipy==1.13.1 # librosa # scikit-learn # sentence-transformers + # statsmodels sentence-transformers==3.2.1 # via -r requirements-test.in sentencepiece==0.2.0 @@ -490,6 +537,8 @@ soxr==0.5.0.post1 # via librosa sqlitedict==2.1.0 # via lm-eval +statsmodels==0.14.4 + # via genai-perf sympy==1.13.1 # via torch tabledata==1.3.3 @@ -499,7 +548,9 @@ tabulate==0.9.0 tcolorpy==0.1.6 # via pytablewriter tenacity==9.0.0 - # via lm-eval + # via + # lm-eval + # plotly tensorizer==2.9.0 # via -r requirements-test.in threadpoolctl==3.5.0 @@ -540,6 +591,7 @@ tqdm-multiprocess==0.0.11 # via lm-eval transformers==4.47.0 # via + # genai-perf # lm-eval # peft # sentence-transformers @@ -548,6 +600,10 @@ transformers-stream-generator==0.0.5 # via -r requirements-test.in triton==3.1.0 # via torch +tritonclient==2.51.0 + # via + # -r requirements-test.in + # genai-perf typepy[datetime]==1.3.2 # via # dataproperty @@ -555,6 +611,7 @@ typepy[datetime]==1.3.2 # tabledata typing-extensions==4.12.2 # via + # bitsandbytes # huggingface-hub # librosa # mistral-common @@ -563,10 +620,12 @@ typing-extensions==4.12.2 # torch tzdata==2024.2 # via pandas -urllib3==1.26.20 +urllib3==2.2.3 # via # botocore # requests + # responses + # tritonclient word2number==1.1 # via lm-eval xxhash==3.5.0 diff --git a/setup.py b/setup.py index b6c1f5bc8ac3f..978625a069778 100644 --- a/setup.py +++ b/setup.py @@ -324,21 +324,26 @@ class repackage_wheel(build_ext): def _is_hpu() -> bool: - is_hpu_available = True + # if VLLM_TARGET_DEVICE env var was set explicitly, skip HPU autodetection + if os.getenv("VLLM_TARGET_DEVICE", None) == VLLM_TARGET_DEVICE: + return VLLM_TARGET_DEVICE == "hpu" + + # if VLLM_TARGET_DEVICE was not set explicitly, check if hl-smi succeeds, + # and if it doesn't, check if habanalabs driver is loaded + 
is_hpu_available = False try: - subprocess.run(["hl-smi"], capture_output=True, check=True) + out = subprocess.run(["hl-smi"], capture_output=True, check=True) + is_hpu_available = out.returncode == 0 except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): - if not os.path.exists('/dev/accel/accel0') and not os.path.exists( - '/dev/accel/accel_controlD0'): - # last resort... + if sys.platform.startswith("linux"): try: output = subprocess.check_output( 'lsmod | grep habanalabs | wc -l', shell=True) is_hpu_available = int(output) > 0 except (ValueError, FileNotFoundError, PermissionError, subprocess.CalledProcessError): - is_hpu_available = False - return is_hpu_available or VLLM_TARGET_DEVICE == "hpu" + pass + return is_hpu_available def _no_device() -> bool: @@ -467,13 +472,9 @@ def get_gaudi_sw_version(): def get_vllm_version() -> str: - # TODO: Revisit this temporary approach: https://github.com/vllm-project/vllm/issues/9182#issuecomment-2404860236 - try: - version = get_version( - write_to="vllm/_version.py", # TODO: move this to pyproject.toml - ) - except LookupError: - version = "0.0.0" + version = get_version( + write_to="vllm/_version.py", # TODO: move this to pyproject.toml + ) sep = "+" if "+" not in version else "." # dev versions might contain + diff --git a/tests/conftest.py b/tests/conftest.py index 917151ddcb8d4..279c1bf9a3776 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -28,12 +28,13 @@ from vllm.distributed import (cleanup_dist_env_and_memory, init_distributed_environment, initialize_model_parallel) from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, - to_enc_dec_tuple_list, zip_enc_dec_prompts) + TokensPrompt, to_enc_dec_tuple_list, + zip_enc_dec_prompts) from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless, - identity) + identity, is_list_of) logger = init_logger(__name__) @@ -243,6 +244,7 @@ def video_assets() -> _VideoAssets: _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict) +_R = TypeVar("_R") class HfRunner: @@ -886,6 +888,12 @@ class VllmRunner: beam_width: int, max_tokens: int, ) -> List[Tuple[List[List[int]], List[str]]]: + if is_list_of(prompts, str, check="all"): + prompts = [TextPrompt(prompt=prompt) for prompt in prompts] + else: + prompts = [ + TokensPrompt(prompt_token_ids=tokens) for tokens in prompts + ] outputs = self.model.beam_search( prompts, BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens)) @@ -923,6 +931,10 @@ class VllmRunner: req_outputs = self.model.score(text_1, text_2) return [req_output.outputs.score for req_output in req_outputs] + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + executor = self.model.llm_engine.model_executor + return executor.apply_model(func) + def __enter__(self): return self diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py new file mode 100644 index 0000000000000..7aa03d7f0402a --- /dev/null +++ b/tests/distributed/test_torchrun_example.py @@ -0,0 +1,56 @@ +# unit test for `examples/offline_inference/torchrun_example.py` + +import random + +import torch.distributed as dist + +from vllm import LLM, SamplingParams +from vllm.distributed.parallel_state import get_world_group + +# Create prompts +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future 
of AI is", +] + +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# set different `gpu_memory_utilization` and `swap_space` for different ranks, +# to test if all ranks agree on the same kv cache configuration. +llm = LLM(model="facebook/opt-125m", + tensor_parallel_size=2, + distributed_executor_backend="external_launcher", + gpu_memory_utilization=random.uniform(0.7, 0.9), + swap_space=random.randint(1, 4)) + +outputs = llm.generate(prompts, sampling_params) + +cpu_group = get_world_group().cpu_group + +torch_rank = dist.get_rank(group=cpu_group) + + +def test_consistent_across_ranks(obj): + if torch_rank == 0: + dist.broadcast_object_list([obj], src=0, group=cpu_group) + else: + container = [None] + dist.broadcast_object_list(container, src=0, group=cpu_group) + assert container[0] == obj + + +test_consistent_across_ranks( + llm.llm_engine.vllm_config.cache_config.num_cpu_blocks) +test_consistent_across_ranks( + llm.llm_engine.vllm_config.cache_config.num_gpu_blocks) + +# all ranks should have the same outputs +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + test_consistent_across_ranks(prompt) + test_consistent_across_ranks(generated_text) + print(f"Rank {torch_rank}, Prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") diff --git a/tests/engine/test_custom_executor.py b/tests/engine/test_custom_executor.py index bbabb936e92ba..0e33f3662da82 100644 --- a/tests/engine/test_custom_executor.py +++ b/tests/engine/test_custom_executor.py @@ -1,12 +1,13 @@ import asyncio import os +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import pytest from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.llm_engine import LLMEngine -from vllm.executor.gpu_executor import GPUExecutor, GPUExecutorAsync +from vllm.executor.uniproc_executor import UniProcExecutor from vllm.sampling_params import SamplingParams @@ -14,21 +15,20 @@ class Mock: ... -class CustomGPUExecutor(GPUExecutor): +class CustomUniExecutor(UniProcExecutor): - def execute_model(self, *args, **kwargs): + def collective_rpc(self, + method: Union[str, Callable], + timeout: Optional[float] = None, + args: Tuple = (), + kwargs: Optional[Dict] = None) -> List[Any]: # Drop marker to show that this was ran with open(".marker", "w"): ... - return super().execute_model(*args, **kwargs) + return super().collective_rpc(method, timeout, args, kwargs) -class CustomGPUExecutorAsync(GPUExecutorAsync): - - async def execute_model_async(self, *args, **kwargs): - with open(".marker", "w"): - ... 
- return await super().execute_model_async(*args, **kwargs) +CustomUniExecutorAsync = CustomUniExecutor @pytest.mark.parametrize("model", ["facebook/opt-125m"]) @@ -41,10 +41,6 @@ def test_custom_executor_type_checking(model): engine_args = AsyncEngineArgs(model=model, distributed_executor_backend=Mock) AsyncLLMEngine.from_engine_args(engine_args) - with pytest.raises(TypeError): - engine_args = AsyncEngineArgs( - model=model, distributed_executor_backend=CustomGPUExecutor) - AsyncLLMEngine.from_engine_args(engine_args) @pytest.mark.parametrize("model", ["facebook/opt-125m"]) @@ -55,7 +51,9 @@ def test_custom_executor(model, tmp_path): assert not os.path.exists(".marker") engine_args = EngineArgs( - model=model, distributed_executor_backend=CustomGPUExecutor) + model=model, + distributed_executor_backend=CustomUniExecutor, + ) engine = LLMEngine.from_engine_args(engine_args) sampling_params = SamplingParams(max_tokens=1) @@ -75,7 +73,7 @@ def test_custom_executor_async(model, tmp_path): assert not os.path.exists(".marker") engine_args = AsyncEngineArgs( - model=model, distributed_executor_backend=CustomGPUExecutorAsync) + model=model, distributed_executor_backend=CustomUniExecutorAsync) engine = AsyncLLMEngine.from_engine_args(engine_args) sampling_params = SamplingParams(max_tokens=1) diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py index e07dd6deef5bf..04505fcaae24b 100644 --- a/tests/engine/test_multiproc_workers.py +++ b/tests/engine/test_multiproc_workers.py @@ -6,16 +6,15 @@ from typing import Any, List, Tuple import pytest +from vllm.config import VllmConfig from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, ResultHandler, WorkerMonitor) +from vllm.worker.worker_base import WorkerWrapperBase -class DummyWorker: +class DummyWorkerWrapper(WorkerWrapperBase): """Dummy version of vllm.worker.worker.Worker""" - def __init__(self, rank: int): - self.rank = rank - def worker_method(self, worker_input: Any) -> Tuple[int, Any]: sleep(0.05) @@ -23,14 +22,15 @@ class DummyWorker: # simulate error case raise worker_input - return self.rank, input + return self.rpc_rank, input def _start_workers() -> Tuple[List[ProcessWorkerWrapper], WorkerMonitor]: result_handler = ResultHandler() + vllm_config = VllmConfig() workers = [ - ProcessWorkerWrapper(result_handler, partial(DummyWorker, rank=rank)) - for rank in range(8) + ProcessWorkerWrapper(result_handler, DummyWorkerWrapper, vllm_config, + rank) for rank in range(8) ] worker_monitor = WorkerMonitor(workers, result_handler) diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py new file mode 100644 index 0000000000000..22473ce275295 --- /dev/null +++ b/tests/entrypoints/llm/test_collective_rpc.py @@ -0,0 +1,36 @@ +import pytest + +from vllm import LLM + +from ...utils import fork_new_process_for_each_test + + +@pytest.mark.parametrize("tp_size", [1, 2]) +@pytest.mark.parametrize("backend", ["mp", "ray"]) +@fork_new_process_for_each_test +def test_collective_rpc(tp_size, backend): + if tp_size == 1 and backend == "ray": + pytest.skip("Skip duplicate test case") + if tp_size == 1: + backend = None + + # intentionally define the method and class in the test function, + # to test if they can be serialized and sent to the workers + def echo_rank(self): + return self.rank + + from vllm.worker.worker import Worker + + class MyWorker(Worker): + + def echo_rank(self): + return self.rank + + llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", + 
enforce_eager=True, + load_format="dummy", + tensor_parallel_size=tp_size, + distributed_executor_backend=backend, + worker_cls=MyWorker) + for method in ["echo_rank", echo_rank]: + assert llm.collective_rpc(method) == list(range(tp_size)) diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py index 41163809237e9..3906ad766e0b6 100644 --- a/tests/entrypoints/llm/test_encode.py +++ b/tests/entrypoints/llm/test_encode.py @@ -105,3 +105,10 @@ def test_multiple_pooling_params(llm: LLM): # pooling_params is None, default params should be applied outputs = llm.encode(PROMPTS, pooling_params=None) assert len(PROMPTS) == len(outputs) + + +@pytest.mark.skip_global_cleanup +def test_right_side_truncation(llm: LLM): + # Embeddings models should truncate the end of the prompt + tokenizer = llm.get_tokenizer() + assert tokenizer.truncation_side == "right" diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py new file mode 100644 index 0000000000000..6ff99f6faa143 --- /dev/null +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -0,0 +1,300 @@ +import asyncio +import json +import shutil +from contextlib import suppress + +import openai # use the official client for correctness check +import pytest +import pytest_asyncio +# downloading lora to test lora requests +from huggingface_hub import snapshot_download + +from ...utils import RemoteOpenAIServer + +# any model with a chat template should work here +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +# technically this needs Mistral-7B-v0.1 as base, but we're not testing +# generation quality here +LORA_NAME = "typeof/zephyr-7b-beta-lora" + +BADREQUEST_CASES = [ + ( + "test_rank", + { + "r": 1024 + }, + "is greater than max_lora_rank", + ), + ( + "test_bias", + { + "bias": "all" + }, + "Adapter bias cannot be used without bias_enabled", + ), + ("test_dora", { + "use_dora": True + }, "does not yet support DoRA"), + ( + "test_modules_to_save", + { + "modules_to_save": ["lm_head"] + }, + "only supports modules_to_save being None", + ), +] + + +@pytest.fixture(scope="module") +def zephyr_lora_files(): + return snapshot_download(repo_id=LORA_NAME) + + +@pytest.fixture(scope="module") +def server_with_lora_modules_json(zephyr_lora_files): + # Define the json format LoRA module configurations + lora_module_1 = { + "name": "zephyr-lora", + "path": zephyr_lora_files, + "base_model_name": MODEL_NAME + } + + lora_module_2 = { + "name": "zephyr-lora2", + "path": zephyr_lora_files, + "base_model_name": MODEL_NAME + } + + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + # lora config below + "--enable-lora", + "--lora-modules", + json.dumps(lora_module_1), + json.dumps(lora_module_2), + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + "--max-num-seqs", + "64", + ] + + # Enable the /v1/load_lora_adapter endpoint + envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"} + + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server_with_lora_modules_json): + async with server_with_lora_modules_json.get_async_client( + ) as async_client: + yield async_client + + +@pytest.mark.asyncio +async def test_static_lora_lineage(client: openai.AsyncOpenAI, + zephyr_lora_files): + models = await client.models.list() + models = models.data + served_model = models[0] + lora_models = 
models[1:]
+    assert served_model.id == MODEL_NAME
+    assert served_model.root == MODEL_NAME
+    assert served_model.parent is None
+    assert all(lora_model.root == zephyr_lora_files
+               for lora_model in lora_models)
+    assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
+    assert lora_models[0].id == "zephyr-lora"
+    assert lora_models[1].id == "zephyr-lora2"
+
+
+@pytest.mark.asyncio
+async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI,
+                                    zephyr_lora_files):
+
+    response = await client.post("load_lora_adapter",
+                                 cast_to=str,
+                                 body={
+                                     "lora_name": "zephyr-lora-3",
+                                     "lora_path": zephyr_lora_files
+                                 })
+    # Ensure adapter loads before querying /models
+    assert "success" in response
+
+    models = await client.models.list()
+    models = models.data
+    dynamic_lora_model = models[-1]
+    assert dynamic_lora_model.root == zephyr_lora_files
+    assert dynamic_lora_model.parent == MODEL_NAME
+    assert dynamic_lora_model.id == "zephyr-lora-3"
+
+
+@pytest.mark.asyncio
+async def test_dynamic_lora_not_found(client: openai.AsyncOpenAI):
+    with pytest.raises(openai.NotFoundError):
+        await client.post("load_lora_adapter",
+                          cast_to=str,
+                          body={
+                              "lora_name": "notfound",
+                              "lora_path": "/not/an/adapter"
+                          })
+
+
+@pytest.mark.asyncio
+async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI,
+                                          tmp_path):
+    invalid_files = tmp_path / "invalid_files"
+    invalid_files.mkdir()
+    (invalid_files / "adapter_config.json").write_text("this is not json")
+
+    with pytest.raises(openai.BadRequestError):
+        await client.post("load_lora_adapter",
+                          cast_to=str,
+                          body={
+                              "lora_name": "invalid-json",
+                              "lora_path": str(invalid_files)
+                          })
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("test_name,config_change,expected_error",
+                         BADREQUEST_CASES)
+async def test_dynamic_lora_badrequests(client: openai.AsyncOpenAI, tmp_path,
+                                        zephyr_lora_files, test_name: str,
+                                        config_change: dict,
+                                        expected_error: str):
+    # Create test directory
+    test_dir = tmp_path / test_name
+
+    # Copy adapter files
+    shutil.copytree(zephyr_lora_files, test_dir)
+
+    # Load and modify configuration
+    config_path = test_dir / "adapter_config.json"
+    with open(config_path) as f:
+        adapter_config = json.load(f)
+    # Apply configuration changes
+    adapter_config.update(config_change)
+
+    # Save modified configuration
+    with open(config_path, "w") as f:
+        json.dump(adapter_config, f)
+
+    # Test loading the adapter
+    with pytest.raises(openai.BadRequestError, match=expected_error):
+        await client.post("load_lora_adapter",
+                          cast_to=str,
+                          body={
+                              "lora_name": test_name,
+                              "lora_path": str(test_dir)
+                          })
+
+
+@pytest.mark.asyncio
+async def test_multiple_lora_adapters(client: openai.AsyncOpenAI, tmp_path,
+                                      zephyr_lora_files):
+    """Validate that many LoRA adapters can be dynamically registered and
+    used for inference concurrently"""
+
+    # This test file configures the server with --max-cpu-loras=2 and this test
+    # will concurrently load 10 adapters, so it should flex the LRU cache
+    async def load_and_run_adapter(adapter_name: str):
+        await client.post("load_lora_adapter",
+                          cast_to=str,
+                          body={
+                              "lora_name": adapter_name,
+                              "lora_path": str(zephyr_lora_files)
+                          })
+        for _ in range(3):
+            await client.completions.create(
+                model=adapter_name,
+                prompt=["Hello there", "Foo bar bazz buzz"],
+                max_tokens=5,
+            )
+
+    lora_tasks = []
+    for i in range(10):
+        lora_tasks.append(
+            asyncio.create_task(load_and_run_adapter(f"adapter_{i}")))
+
+    # asyncio.gather (unlike asyncio.wait, which returns unexamined Task
+    # objects) re-raises the first exception from the worker tasks, so
+    # failures inside load_and_run_adapter surface here
+    results = await asyncio.gather(*lora_tasks)
+
+    for r in results:
+        assert not isinstance(r,
Exception), f"Got exception {r}" + + +@pytest.mark.asyncio +async def test_loading_invalid_adapters_does_not_break_others( + client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files): + + invalid_files = tmp_path / "invalid_files" + invalid_files.mkdir() + (invalid_files / "adapter_config.json").write_text("this is not json") + + stop_good_requests_event = asyncio.Event() + + async def run_good_requests(client): + # Run chat completions requests until event set + + results = [] + + while not stop_good_requests_event.is_set(): + try: + batch = await client.completions.create( + model="zephyr-lora", + prompt=["Hello there", "Foo bar bazz buzz"], + max_tokens=5, + ) + results.append(batch) + except Exception as e: + results.append(e) + + return results + + # Create task to run good requests + good_task = asyncio.create_task(run_good_requests(client)) + + # Run a bunch of bad adapter loads + for _ in range(25): + with suppress(openai.NotFoundError): + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "notfound", + "lora_path": "/not/an/adapter" + }) + for _ in range(25): + with suppress(openai.BadRequestError): + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "invalid", + "lora_path": str(invalid_files) + }) + + # Ensure all the running requests with lora adapters succeeded + stop_good_requests_event.set() + results = await good_task + for r in results: + assert not isinstance(r, Exception), f"Got exception {r}" + + # Ensure we can load another adapter and run it + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": "valid", + "lora_path": zephyr_lora_files + }) + await client.completions.create( + model="valid", + prompt=["Hello there", "Foo bar bazz buzz"], + max_tokens=5, + ) diff --git a/tests/entrypoints/openai/test_lora_lineage.py b/tests/entrypoints/openai/test_lora_lineage.py deleted file mode 100644 index ce4f85c13fff9..0000000000000 --- a/tests/entrypoints/openai/test_lora_lineage.py +++ /dev/null @@ -1,109 +0,0 @@ -import json - -import openai # use the official client for correctness check -import pytest -import pytest_asyncio -# downloading lora to test lora requests -from huggingface_hub import snapshot_download - -from ...utils import RemoteOpenAIServer - -# any model with a chat template should work here -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" -# technically this needs Mistral-7B-v0.1 as base, but we're not testing -# generation quality here -LORA_NAME = "typeof/zephyr-7b-beta-lora" - - -@pytest.fixture(scope="module") -def zephyr_lora_files(): - return snapshot_download(repo_id=LORA_NAME) - - -@pytest.fixture(scope="module") -def server_with_lora_modules_json(zephyr_lora_files): - # Define the json format LoRA module configurations - lora_module_1 = { - "name": "zephyr-lora", - "path": zephyr_lora_files, - "base_model_name": MODEL_NAME - } - - lora_module_2 = { - "name": "zephyr-lora2", - "path": zephyr_lora_files, - "base_model_name": MODEL_NAME - } - - args = [ - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--enforce-eager", - # lora config below - "--enable-lora", - "--lora-modules", - json.dumps(lora_module_1), - json.dumps(lora_module_2), - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - "--max-num-seqs", - "64", - ] - - # Enable the /v1/load_lora_adapter endpoint - envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"} - - with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as 
remote_server: - yield remote_server - - -@pytest_asyncio.fixture -async def client_for_lora_lineage(server_with_lora_modules_json): - async with server_with_lora_modules_json.get_async_client( - ) as async_client: - yield async_client - - -@pytest.mark.asyncio -async def test_static_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI, - zephyr_lora_files): - models = await client_for_lora_lineage.models.list() - models = models.data - served_model = models[0] - lora_models = models[1:] - assert served_model.id == MODEL_NAME - assert served_model.root == MODEL_NAME - assert served_model.parent is None - assert all(lora_model.root == zephyr_lora_files - for lora_model in lora_models) - assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models) - assert lora_models[0].id == "zephyr-lora" - assert lora_models[1].id == "zephyr-lora2" - - -@pytest.mark.asyncio -async def test_dynamic_lora_lineage( - client_for_lora_lineage: openai.AsyncOpenAI, zephyr_lora_files): - - response = await client_for_lora_lineage.post("load_lora_adapter", - cast_to=str, - body={ - "lora_name": - "zephyr-lora-3", - "lora_path": - zephyr_lora_files - }) - # Ensure adapter loads before querying /models - assert "success" in response - - models = await client_for_lora_lineage.models.list() - models = models.data - dynamic_lora_model = models[-1] - assert dynamic_lora_model.root == zephyr_lora_files - assert dynamic_lora_model.parent == MODEL_NAME - assert dynamic_lora_model.id == "zephyr-lora-3" diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index a803ea4a8d6ad..06e0f93dbe269 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -12,6 +12,9 @@ MODEL_NAME = "BAAI/bge-reranker-v2-m3" def server(): args = [ "--enforce-eager", + # Will be used on tests to compare prompt input length + "--max-model-len", + "100" ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -20,8 +23,7 @@ def server(): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_text_1_str_text_2_list(server: RemoteOpenAIServer, - model_name: str): +def test_text_1_str_text_2_list(server: RemoteOpenAIServer, model_name: str): text_1 = "What is the capital of France?" text_2 = [ "The capital of Brazil is Brasilia.", "The capital of France is Paris." @@ -45,8 +47,7 @@ async def test_text_1_str_text_2_list(server: RemoteOpenAIServer, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_text_1_list_text_2_list(server: RemoteOpenAIServer, - model_name: str): +def test_text_1_list_text_2_list(server: RemoteOpenAIServer, model_name: str): text_1 = [ "What is the capital of the United States?", "What is the capital of France?" @@ -73,8 +74,7 @@ async def test_text_1_list_text_2_list(server: RemoteOpenAIServer, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_text_1_str_text_2_str(server: RemoteOpenAIServer, - model_name: str): +def test_text_1_str_text_2_str(server: RemoteOpenAIServer, model_name: str): text_1 = "What is the capital of France?" text_2 = "The capital of France is Paris." 
@@ -91,3 +91,36 @@ async def test_text_1_str_text_2_str(server: RemoteOpenAIServer,
     assert score.data is not None
     assert len(score.data) == 1
     assert score.data[0].score >= 0.9
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+def test_score_max_model_len(server: RemoteOpenAIServer, model_name: str):
+
+    text_1 = "What is the capital of France?" * 20
+    text_2 = [
+        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
+    ]
+
+    score_response = requests.post(server.url_for("score"),
+                                   json={
+                                       "model": model_name,
+                                       "text_1": text_1,
+                                       "text_2": text_2,
+                                   })
+    assert score_response.status_code == 400
+    # Assert just a small fragment of the response
+    assert "Please reduce the length of the input." in \
+        score_response.text
+
+    # Test truncation: a truncate_prompt_tokens value above max_model_len
+    # (100 for this server) must also be rejected
+    score_response = requests.post(server.url_for("score"),
+                                   json={
+                                       "model": model_name,
+                                       "text_1": text_1,
+                                       "text_2": text_2,
+                                       "truncate_prompt_tokens": 101
+                                   })
+    assert score_response.status_code == 400
+    assert "Please, select a smaller truncation size." in \
+        score_response.text
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index f431d1065e0eb..85f485364a411 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -52,7 +52,7 @@ async def _async_serving_chat_init():
     engine = MockEngine()
     model_config = await engine.get_model_config()
 
-    models = OpenAIServingModels(model_config, BASE_MODEL_PATHS)
+    models = OpenAIServingModels(engine, model_config, BASE_MODEL_PATHS)
     serving_completion = OpenAIServingChat(engine,
                                            model_config,
                                            models,
@@ -73,7 +73,8 @@ def test_serving_chat_should_set_correct_max_tokens():
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
 
-    models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS,
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
                                  model_config=MockModelConfig())
     serving_chat = OpenAIServingChat(mock_engine,
                                      MockModelConfig(),
@@ -116,7 +117,8 @@ def test_serving_chat_could_load_correct_generation_config():
     mock_engine.errored = False
 
     # Initialize the serving chat
-    models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS,
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
                                  model_config=mock_model_config)
     serving_chat = OpenAIServingChat(mock_engine,
                                      mock_model_config,
diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py
index 96897dc730da2..657ea20213ec9 100644
--- a/tests/entrypoints/openai/test_serving_models.py
+++ b/tests/entrypoints/openai/test_serving_models.py
@@ -4,6 +4,7 @@ from unittest.mock import MagicMock
 import pytest
 
 from vllm.config import ModelConfig
+from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.openai.protocol import (ErrorResponse,
                                               LoadLoraAdapterRequest,
                                               UnloadLoraAdapterRequest)
@@ -21,13 +22,16 @@ LORA_UNLOADING_SUCCESS_MESSAGE = (
 async def _async_serving_models_init() -> OpenAIServingModels:
     mock_model_config = MagicMock(spec=ModelConfig)
+    mock_engine_client = MagicMock(spec=EngineClient)
     # Set the max_model_len attribute to avoid missing attribute
     mock_model_config.max_model_len = 2048
 
-    serving_models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS,
+    serving_models = OpenAIServingModels(engine_client=mock_engine_client,
+                                         base_model_paths=BASE_MODEL_PATHS,
                                          model_config=mock_model_config,
lora_modules=None, prompt_adapters=None) + await serving_models.init_static_loras() return serving_models @@ -113,5 +117,5 @@ async def test_unload_lora_adapter_not_found(): request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter") response = await serving_models.unload_lora_adapter(request) assert isinstance(response, ErrorResponse) - assert response.type == "InvalidUserInput" - assert response.code == HTTPStatus.BAD_REQUEST + assert response.type == "NotFoundError" + assert response.code == HTTPStatus.NOT_FOUND diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py index 6fcc92022855b..090523a836e12 100644 --- a/tests/entrypoints/openai/test_shutdown.py +++ b/tests/entrypoints/openai/test_shutdown.py @@ -1,6 +1,3 @@ -import json -import os - import openai import pytest @@ -10,16 +7,7 @@ MODEL_NAME = "meta-llama/Llama-3.2-1B" @pytest.mark.asyncio -async def test_shutdown_on_engine_failure(tmp_path): - # Use a bad adapter to crash the engine - # (This test will fail when that bug is fixed) - adapter_path = tmp_path / "bad_adapter" - os.mkdir(adapter_path) - with open(adapter_path / "adapter_model_config.json", "w") as f: - json.dump({"not": "real"}, f) - with open(adapter_path / "adapter_model.safetensors", "wb") as f: - f.write(b"this is fake") - +async def test_shutdown_on_engine_failure(): # dtype, max-len etc set so that this can run in CI args = [ "--dtype", @@ -29,9 +17,6 @@ async def test_shutdown_on_engine_failure(tmp_path): "--enforce-eager", "--max-num-seqs", "128", - "--enable-lora", - "--lora-modules", - f"bad-adapter={tmp_path / 'bad_adapter'}", ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -39,9 +24,13 @@ async def test_shutdown_on_engine_failure(tmp_path): with pytest.raises( (openai.APIConnectionError, openai.InternalServerError)): - # This crashes the engine - await client.completions.create(model="bad-adapter", - prompt="Hello, my name is") + # Asking for lots of prompt logprobs will currently crash the + # engine. 
This may change in the future when that bug is fixed
+            prompt = "Hello " * 4000
+            await client.completions.create(
+                model=MODEL_NAME,
+                prompt=prompt,
+                extra_body={"prompt_logprobs": 10})
 
         # Now the server should shut down
         return_code = remote_server.proc.wait(timeout=8)
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 8f242df4a60e3..513b466c10d60 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -754,6 +754,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
     ("template_chatglm.jinja", "string"),
     ("template_chatglm2.jinja", "string"),
     ("template_chatml.jinja", "string"),
+    ("template_deepseek_vl2.jinja", "string"),
     ("template_falcon_180b.jinja", "string"),
     ("template_falcon.jinja", "string"),
     ("template_inkbot.jinja", "string"),
diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py
index a84501f9c303f..dac26efe866b8 100644
--- a/tests/kernels/test_activation.py
+++ b/tests/kernels/test_activation.py
@@ -6,8 +6,9 @@ import torch
 from tests.kernels.utils import opcheck
 from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul,
-                                                   GeluAndMul, NewGELU,
-                                                   QuickGELU, SiluAndMul)
+                                                   GeluAndMul, MulAndSilu,
+                                                   NewGELU, QuickGELU,
+                                                   SiluAndMul)
 from vllm.platforms import current_platform
 
 from .allclose_default import get_default_atol, get_default_rtol
@@ -21,8 +22,9 @@ CUDA_DEVICES = [
 ]
 
 
-@pytest.mark.parametrize("activation",
-                         ["silu", "gelu", "gelu_tanh", "fatrelu"])
+@pytest.mark.parametrize(
+    "activation",
+    ["silu_and_mul", "mul_and_silu", "gelu", "gelu_tanh", "fatrelu"])
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("dtype", DTYPES)
@@ -40,9 +42,12 @@ def test_act_and_mul(
     current_platform.seed_everything(seed)
     torch.set_default_device(device)
     x = torch.randn(num_tokens, 2 * d, dtype=dtype)
-    if activation == "silu":
+    if activation == "silu_and_mul":
         layer = SiluAndMul()
         fn = torch.ops._C.silu_and_mul
+    elif activation == "mul_and_silu":
+        layer = MulAndSilu()
+        fn = torch.ops._C.mul_and_silu
     elif activation == "gelu":
         layer = GeluAndMul(approximate="none")
         fn = torch.ops._C.gelu_and_mul
@@ -55,8 +60,9 @@ def test_act_and_mul(
         fn = torch.ops._C.fatrelu_and_mul
     out = layer(x)
     ref_out = layer.forward_native(x)
-    # The SiLU, GELU and FatReLU implementations are equivalent to the native
-    # PyTorch implementations, so we can do exact comparison.
+    # The SiluAndMul, MulAndSilu, GELU and FatReLU implementations are
+    # equivalent to the native PyTorch implementations, so we can do exact
+    # comparison.
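+    # (SiluAndMul computes silu(x[..., :d]) * x[..., d:]; MulAndSilu applies
+    # the SiLU to the second half instead, x[..., :d] * silu(x[..., d:]),
+    # with d = x.shape[-1] // 2 as below.)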
torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)
 
     d = x.shape[-1] // 2
diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py
index 3e3c0668198ad..124d5d297a574 100644
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -31,9 +31,9 @@ NUM_GEN_SEQS = [7]  # Arbitrary values for testing
 NUM_PREFILL_SEQS = [3]  # Arbitrary values for testing
 NUM_HEADS = [(40, 40), (64, 8)]  # Arbitrary values for testing
 
-# FlashAttention forward only supports head dimension at most 128
-# https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62
-HEAD_SIZES = [64, 80, 120, 256]
+# This should be kept in sync with get_supported_head_sizes() in
+# vllm.attention.ops.paged_attn.PagedAttention
+HEAD_SIZES = [32, 64, 80, 96, 112, 120, 128, 192, 256]
 
 BLOCK_SIZES = [16, 32]
 USE_ALIBI = [False, True]
diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py
index a08c874407e3f..492acb91e8ed9 100644
--- a/tests/kernels/test_attention_selector.py
+++ b/tests/kernels/test_attention_selector.py
@@ -94,7 +94,12 @@ def test_flash_attn(monkeypatch):
 
 
 def test_invalid_env(monkeypatch):
-    """Throw an exception if the backend name is invalid."""
+    """Ignore the invalid env variable if it is set."""
     override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
-    with pytest.raises(ValueError):
-        get_attn_backend(16, torch.float16, None, 16, False)
+    with patch("vllm.attention.selector.current_platform", CudaPlatform()):
+        backend = get_attn_backend(32, torch.float16, None, 16, False)
+        assert backend.get_name() == "FLASH_ATTN"
+
+        # when block size == 16, backend will fall back to XFORMERS
+        backend = get_attn_backend(16, torch.float16, None, 16, False)
+        assert backend.get_name() == "XFORMERS"
diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py
index 614674375786e..e008a56de6208 100644
--- a/tests/kernels/test_encoder_decoder_attn.py
+++ b/tests/kernels/test_encoder_decoder_attn.py
@@ -142,12 +142,18 @@ def _make_test_resources(test_pt: TestPoint, ) -> TestResources:
         torch.tensor([], dtype=torch.float32, device=CUDA_DEVICE))
 
     # Construct KV cache
-    kv_cache = make_kv_cache(test_pt.num_blocks,
-                             test_pt.num_heads,
-                             test_pt.head_size,
-                             test_pt.block_size,
-                             device=CUDA_DEVICE,
-                             backend=test_pt.backend_name)
+    if test_pt.attn_type in (AttentionType.DECODER,
+                             AttentionType.ENCODER_DECODER):
+        kv_cache = make_kv_cache(test_pt.num_blocks,
+                                 test_pt.num_heads,
+                                 test_pt.head_size,
+                                 test_pt.block_size,
+                                 device=CUDA_DEVICE,
+                                 backend=test_pt.backend_name)
+    else:
+        kv_cache = torch.tensor([])
+
+    attn.kv_cache = [kv_cache]
     return TestResources(scale, attn, kv_cache)
diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index 8b23b62826053..7fa5de1984452 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -14,6 +14,8 @@ from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe import fused_moe
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     fused_topk, moe_align_block_size)
+from vllm.model_executor.layers.fused_moe.moe_torch_iterative import (
+    fused_moe as iterative_moe)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
     marlin_quantize)
 from vllm.model_executor.models.mixtral import MixtralMoE
@@ -46,6 +48,11 @@ def test_fused_moe(
     triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False)
     torch_output =
torch_moe(a, w1, w2, score, topk) torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0) + iterative_output = iterative_moe(a, w1, w2, score, topk, renormalize=False) + torch.testing.assert_close(iterative_output, + torch_output, + atol=2e-2, + rtol=0) @pytest.mark.parametrize("dtype", diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index 4beba4dc05dde..1cc1ced9968d7 100644 --- a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -22,13 +22,13 @@ def test_run(my_rank, pipe): x2 = pipe.recv_tensor() print(f"rank {my_rank} received x2 = ", x2) y2 = pipe.recv_tensor() - print(f"rank {my_rank} received y2 = ", x2) + print(f"rank {my_rank} received y2 = ", y2) else: x2 = pipe.recv_tensor() print(f"rank {my_rank} received x2 = ", x2) y2 = pipe.recv_tensor() - print(f"rank {my_rank} received y2 = ", x2) + print(f"rank {my_rank} received y2 = ", y2) pipe.send_tensor(x) print(f"rank {my_rank} sent tensor x") pipe.send_tensor(y) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 57ebaa424fc59..e7378d00765f0 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -21,6 +21,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader import get_model +from vllm.platforms import current_platform class ContextIDInfo(TypedDict): @@ -65,13 +66,16 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool): @pytest.fixture def dist_init(): temp_file = tempfile.mkstemp()[1] - init_distributed_environment( - world_size=1, - rank=0, - distributed_init_method=f"file://{temp_file}", - local_rank=0, - backend="nccl", - ) + + backend = "nccl" + if current_platform.is_cpu(): + backend = "gloo" + + init_distributed_environment(world_size=1, + rank=0, + distributed_init_method=f"file://{temp_file}", + local_rank=0, + backend=backend) initialize_model_parallel(1, 1) yield cleanup_dist_env_and_memory(shutdown_ray=True) @@ -81,13 +85,15 @@ def dist_init(): def dist_init_torch_only(): if torch.distributed.is_initialized(): return + backend = "nccl" + if current_platform.is_cpu(): + backend = "gloo" + temp_file = tempfile.mkstemp()[1] - torch.distributed.init_process_group( - backend="nccl", - world_size=1, - rank=0, - init_method=f"file://{temp_file}", - ) + torch.distributed.init_process_group(world_size=1, + rank=0, + init_method=f"file://{temp_file}", + backend=backend) @pytest.fixture diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index fb8c0b2a7ba26..08a589d7ee29c 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -48,10 +48,14 @@ TOLERANCES = { torch.float32: (5e-3, 5e-3), torch.bfloat16: (3e-2, 2e-2), } -# TODO: Modify this based on platform -DEVICES = [ + +pytestmark = pytest.mark.skipif( + not (current_platform.is_cuda_alike() or current_platform.is_cpu()), + reason="Backend not supported") + +DEVICES = ([ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +] if current_platform.is_cuda_alike() else ["cpu"]) #For GPU, we will launch different triton kernels between the prefill and decode # stages, so we need to verify this. 
prefill stage(True) or decode stage(False) @@ -198,6 +202,10 @@ def check_punica_wrapper(punica_wrapper) -> bool: from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU return type(punica_wrapper) is PunicaWrapperGPU + elif current_platform.is_cpu(): + from vllm.lora.punica_wrapper.punica_cpu import PunicaWrapperCPU + + return type(punica_wrapper) is PunicaWrapperCPU else: return False @@ -211,7 +219,8 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: # For multi-GPU testing of Triton kernel, we must explicitly set the CUDA # device, see: https://github.com/triton-lang/triton/issues/2925 # Same below. - torch.cuda.set_device(device) + if current_platform.is_cuda_alike(): + torch.cuda.set_device(device) torch.set_default_device(device) max_loras = 8 @@ -313,7 +322,9 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: def test_embeddings_with_new_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: - torch.cuda.set_device(device) + if current_platform.is_cuda_alike(): + torch.cuda.set_device(device) + torch.set_default_device(device) max_loras = 8 punica_wrapper = get_punica_wrapper(8192, 256, device) @@ -450,7 +461,9 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device, def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, stage) -> None: - torch.cuda.set_device(device) + if current_platform.is_cuda_alike(): + torch.cuda.set_device(device) + torch.set_default_device(device) max_loras = 8 punica_wrapper = get_punica_wrapper(8192, 256, device) @@ -582,7 +595,9 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, def test_linear_replicated(dist_init, num_loras, device, stage, bias_enabled) -> None: - torch.cuda.set_device(device) + if current_platform.is_cuda_alike(): + torch.cuda.set_device(device) + torch.set_default_device(device) punica_wrapper = get_punica_wrapper(8192, 256, device) assert check_punica_wrapper(punica_wrapper) @@ -695,7 +710,9 @@ def test_linear_replicated(dist_init, num_loras, device, stage, def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, device, stage, bias_enabled) -> None: - torch.cuda.set_device(device) + if current_platform.is_cuda_alike(): + torch.cuda.set_device(device) + torch.set_default_device(device) punica_wrapper = get_punica_wrapper(8192, 256, device) assert check_punica_wrapper(punica_wrapper) @@ -818,7 +835,9 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, device, stage, bias_enabled) -> None: - torch.cuda.set_device(device) + if current_platform.is_cuda_alike(): + torch.cuda.set_device(device) + torch.set_default_device(device) punica_wrapper = get_punica_wrapper(8192, 256, device) assert check_punica_wrapper(punica_wrapper) @@ -971,6 +990,8 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, @pytest.mark.parametrize("rotary_dim", [None, 32]) @pytest.mark.parametrize("head_size", [32, 108]) @pytest.mark.parametrize("seq_len", [11, 1024]) +@pytest.mark.skipif(not current_platform.is_cuda_alike(), + reason="Only CUDA backends are supported") def test_rotary_embedding_long_context(dist_init, num_loras, device, scaling_factors, max_position, is_neox_style, rotary_dim, head_size, diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index 537d95b025a9d..b907af47d08d7 100644 --- a/tests/lora/test_lora_checkpoints.py +++ 
b/tests/lora/test_lora_checkpoints.py @@ -3,6 +3,7 @@ from typing import List import pytest from vllm.lora.models import LoRAModel +from vllm.lora.peft_helper import PEFTHelper from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM from vllm.model_executor.models.utils import WeightsMapper @@ -30,11 +31,14 @@ def test_load_checkpoints( else: expected_lora_modules.append(module) if lora_name == "baichuan7B": + peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files, + max_position_embeddings=4096) # For the baichuan7B model, load it's LoRA, # and the test should pass. LoRAModel.from_local_checkpoint( baichuan_lora_files, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, @@ -43,9 +47,12 @@ def test_load_checkpoints( # Test that the target_modules contain prefix # such as "model.layers.0.self_atten.W_pack", and # the test should pass. + peft_helper = PEFTHelper.from_local_dir(baichuan_zero_lora_files, + max_position_embeddings=4096) LoRAModel.from_local_checkpoint( baichuan_zero_lora_files, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, @@ -53,9 +60,12 @@ def test_load_checkpoints( elif lora_name == "baichuan7B-zero-regex": # Test that the `target_modules` in the form of regular expressions, # such as `model\\..*(W_pack|o_proj)`, and the test should pass. + peft_helper = PEFTHelper.from_local_dir(baichuan_regex_lora_files, + max_position_embeddings=4096) LoRAModel.from_local_checkpoint( baichuan_regex_lora_files, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, @@ -64,10 +74,13 @@ def test_load_checkpoints( # For the baichuan7B model, load chatglm3-6b's LoRA, # and the test should raise the following error. expected_error = "Please verify that the loaded LoRA module is correct" # noqa: E501 + peft_helper = PEFTHelper.from_local_dir(chatglm3_lora_files, + max_position_embeddings=4096) with pytest.raises(ValueError, match=expected_error): LoRAModel.from_local_checkpoint( chatglm3_lora_files, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, @@ -94,9 +107,12 @@ def test_lora_weights_mapping(baichuan_lora_files): ".layers.": ".baichuan_layers.", }, ) + peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files, + max_position_embeddings=4096) lora_model = LoRAModel.from_local_checkpoint( baichuan_lora_files, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index e2daf9d135113..1c0ee01c038d0 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -3,6 +3,7 @@ from typing import List import pytest from vllm.lora.models import LoRAModel +from vllm.lora.peft_helper import PEFTHelper from vllm.lora.utils import get_adapter_absolute_path from vllm.model_executor.models.llama import LlamaForCausalLM @@ -27,9 +28,11 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request): lora_path = get_adapter_absolute_path(lora_name) # lora loading should work for either absolute path and hugggingface id. 
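+    # NOTE: from_local_checkpoint now expects a pre-parsed PEFTHelper; the
+    # second positional argument of from_local_dir is max_position_embeddings
+    # (4096 here, assuming the Llama base model used by these fixtures)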
+ peft_helper = PEFTHelper.from_local_dir(lora_path, 4096) lora_model = LoRAModel.from_local_checkpoint( lora_path, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index a099f36b0a465..9a5b9aabf5078 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -1,5 +1,3 @@ -import json -import math import os from typing import Dict, List @@ -20,6 +18,7 @@ from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager, WorkerLoRAManager) from vllm.model_executor.layers.linear import RowParallelLinear +from vllm.platforms import current_platform EMBEDDING_MODULES = { "embed_tokens": "input_embeddings", @@ -28,73 +27,20 @@ EMBEDDING_MODULES = { EMBEDDING_PADDING_MODULES = ["lm_head"] -CUDA_DEVICES = [ +DEVICES = ([ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +] if current_platform.is_cuda_alike() else ["cpu"]) -def test_peft_helper(sql_lora_files): - lora_config_path = os.path.join(sql_lora_files, "adapter_config.json") - with open(lora_config_path) as f: - config = json.load(f) - peft_helper = PEFTHelper.from_dict(config) - assert peft_helper.r == 8 - assert peft_helper.lora_alpha == 16 - assert peft_helper.target_modules == [ - "q_proj", - "v_proj", - "k_proj", - "o_proj", - "gate_proj", - "up_proj", - "down_proj", - "embed_tokens", - "lm_head", - ] - scaling = peft_helper.lora_alpha / peft_helper.r - assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3 - - # test RSLoRA - config = dict(r=8, - lora_alpha=16, - target_modules=["gate_proj"], - use_rslora=True) - peft_helper = PEFTHelper.from_dict(config) - - scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r) - assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3 - - expected_error = "vLLM only supports modules_to_save being None." - with pytest.raises(ValueError, match=expected_error): - config = dict( - r=8, - lora_alpha=16, - target_modules=["gate_proj"], - modules_to_save=["lm_head"], - ) - PEFTHelper.from_dict(config) - - expected_error = "vLLM does not yet support DoRA." 
- with pytest.raises(ValueError, match=expected_error): - config = dict(r=8, - lora_alpha=16, - target_modules=["gate_proj"], - use_dora=True) - PEFTHelper.from_dict(config) - - -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_from_lora_tensors(sql_lora_files, device): tensors = load_file( os.path.join(sql_lora_files, "adapter_model.safetensors")) new_embeddings = load_file( os.path.join(sql_lora_files, "new_embeddings.safetensors")) - lora_config_path = os.path.join(sql_lora_files, "adapter_config.json") - with open(lora_config_path) as f: - config = json.load(f) - - peft_helper = PEFTHelper.from_dict(config) + peft_helper = PEFTHelper.from_local_dir(sql_lora_files, + max_position_embeddings=4096) lora_model = LoRAModel.from_lora_tensors( 1, tensors, @@ -171,7 +117,7 @@ def test_replace_submodules(dist_init, dummy_model): manager = LoRAModelManager( model, 1, 1, 1, LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8), - torch.device("cuda")) + torch.device(DEVICES[0])) model = manager.model assert isinstance(model.get_submodule("dense1"), @@ -183,7 +129,7 @@ def test_replace_submodules(dist_init, dummy_model): RowParallelLinearWithLoRA) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_lora_model_manager(dist_init, dummy_model, device): model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] @@ -244,7 +190,7 @@ def test_lora_model_manager(dist_init, dummy_model, device): assert manager.punica_wrapper.device == device -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_lora_lru_cache_model_manager(dist_init, dummy_model, device): model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] @@ -336,7 +282,7 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device): assert manager.device == device -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_lru_lora_model_manager(dist_init, dummy_model, device): # This tests just the LRU cache functionality, everything else is # tested in test_lora_model_manager @@ -466,7 +412,7 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device): assert manager.device == device -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, sql_lora_files, device): lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) @@ -545,7 +491,7 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, device) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, sql_lora_files, device): # Should remove every LoRA not specified in the request. 
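Together with the conditional torch.cuda.set_device calls earlier in this diff, the manager tests converge on a single device-selection idiom. A self-contained sketch (the test body is a hypothetical stand-in, not a test from this PR):

    import pytest
    import torch

    from vllm.platforms import current_platform

    # Two CUDA devices when more than one is visible, otherwise one;
    # plain CPU on non-CUDA-like platforms.
    DEVICES = ([
        f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
    ] if current_platform.is_cuda_alike() else ["cpu"])

    @pytest.mark.parametrize("device", DEVICES)
    def test_device_setup(device):
        if current_platform.is_cuda_alike():
            # Explicit binding works around the Triton multi-GPU issue
            # cited in the LoRA layer tests above; CPU runs skip it.
            torch.cuda.set_device(device)
        torch.set_default_device(device)
        assert torch.empty(1).device.type == torch.device(device).type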
@@ -621,7 +567,7 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, device) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_packed_loras(dist_init, dummy_model_gate_up, device): model = dummy_model_gate_up model.supported_lora_modules = ["gate_up_proj"] diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index 797a495201d33..940a865228806 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -5,6 +5,7 @@ import torch import vllm from vllm.lora.request import LoRARequest +from vllm.platforms import current_platform MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1" @@ -31,7 +32,8 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, @pytest.mark.parametrize("tp_size", [4]) def test_mixtral_lora(mixtral_lora_files, tp_size): """Original test, the LoRA model has the common target modules, not all""" - if torch.cuda.device_count() < tp_size: + if torch.cuda.device_count( + ) < tp_size and tp_size > 1 and current_platform.is_cuda_alike(): pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") prompts = [ diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py new file mode 100644 index 0000000000000..a524d5ce5f34a --- /dev/null +++ b/tests/lora/test_peft_helper.py @@ -0,0 +1,109 @@ +import json +import math +import shutil + +import pytest + +from vllm.config import LoRAConfig +from vllm.lora.peft_helper import PEFTHelper + +ERROR_CASES = [ + ( + "test_rank", + { + "r": 1024 + }, + "is greater than max_lora_rank", + ), + ( + "test_bias", + { + "bias": "all" + }, + "Adapter bias cannot be used without bias_enabled", + ), + ("test_dora", { + "use_dora": True + }, "does not yet support DoRA"), + ( + "test_modules_to_save", + { + "modules_to_save": ["lm_head"] + }, + "only supports modules_to_save being None", + ), +] + + +def test_peft_helper_pass(long_context_lora_files_16k_1, tmp_path): + peft_helper = PEFTHelper.from_local_dir(long_context_lora_files_16k_1, + max_position_embeddings=4096) + lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2) + peft_helper.validate_legal(lora_config) + assert peft_helper.r == 8 + assert peft_helper.lora_alpha == 16 + assert peft_helper.target_modules == [ + "q_proj", + "v_proj", + "k_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + "embed_tokens", + "lm_head", + ] + assert peft_helper.context_length == 16384 + assert peft_helper.vllm_max_position_embeddings == 4096 + assert peft_helper.vllm_long_context_scaling_factor == float( + math.ceil(peft_helper.context_length / + peft_helper.vllm_max_position_embeddings)) + # test RSLoRA + rslora_config = dict(use_rslora=True) + test_dir = tmp_path / "test_rslora" + shutil.copytree(long_context_lora_files_16k_1, test_dir) + + # Load and modify configuration + config_path = test_dir / "adapter_config.json" + with open(config_path) as f: + adapter_config = json.load(f) + # Apply configuration changes + adapter_config.update(rslora_config) + + # Save modified configuration + with open(config_path, "w") as f: + json.dump(adapter_config, f) + + peft_helper = PEFTHelper.from_local_dir(test_dir, + max_position_embeddings=4096) + peft_helper.validate_legal(lora_config) + scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r) + assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3 + + +@pytest.mark.parametrize("test_name,config_change,expected_error", ERROR_CASES) +def test_peft_helper_error( + sql_lora_files, + tmp_path, + 
test_name: str, + config_change: dict, + expected_error: str, +): + test_dir = tmp_path / test_name + shutil.copytree(sql_lora_files, test_dir) + + # Load and modify configuration + config_path = test_dir / "adapter_config.json" + with open(config_path) as f: + adapter_config = json.load(f) + # Apply configuration changes + adapter_config.update(config_change) + + # Save modified configuration + with open(config_path, "w") as f: + json.dump(adapter_config, f) + lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2) + # Test loading the adapter + with pytest.raises(ValueError, match=expected_error): + PEFTHelper.from_local_dir( + test_dir, max_position_embeddings=4096).validate_legal(lora_config) diff --git a/tests/lora/test_punica_sizes.py b/tests/lora/test_punica_ops_sizes.py similarity index 77% rename from tests/lora/test_punica_sizes.py rename to tests/lora/test_punica_ops_sizes.py index 0351fedd1cfa5..433ca7577d084 100644 --- a/tests/lora/test_punica_sizes.py +++ b/tests/lora/test_punica_ops_sizes.py @@ -9,17 +9,16 @@ from threading import Lock import pytest import torch -from vllm.lora.ops.bgmv_expand import bgmv_expand -from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice -from vllm.lora.ops.bgmv_shrink import bgmv_shrink -from vllm.lora.ops.sgmv_expand import sgmv_expand -from vllm.lora.ops.sgmv_shrink import sgmv_shrink -from vllm.lora.ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT +import vllm.lora.ops.triton_ops # noqa: F401 +from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice, + bgmv_shrink, sgmv_expand, + sgmv_expand_slice, sgmv_shrink) +from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT from vllm.platforms import current_platform from .utils import (assert_close, generate_data, generate_data_for_expand_nslices, - generate_data_for_nslices, ref_torch_groupgemm) + generate_data_for_nslices) HIDDEN_SIZES = [ 128, @@ -113,7 +112,7 @@ DTYPES = [torch.float16, torch.bfloat16] MAX_RANKS = [32] SCALES = [0.5] SEED = [0] -CUDA_DEVICES = [f"cuda:{0}"] +DEVICES = [f"cuda:{0}"] _dict_lock = Lock() @@ -127,7 +126,7 @@ _dict_lock = Lock() @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("op_type", ["shrink", "expand"]) @pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_punica_sgmv( batches: int, num_loras: int, @@ -174,7 +173,7 @@ def test_punica_sgmv( # Preventing cache error pointer. 
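The hunks just below show the new verification scheme: the Triton kernel under test is invoked through the torch.ops.vllm registry, while the eager implementations in vllm.lora.ops.torch_ops (rather than the now-deleted ref_torch_groupgemm helper) produce the reference output. A condensed sketch of the bgmv shrink check, with call signatures taken from this diff (shapes, dtypes, and the zero-filled tensors are illustrative assumptions, and the Triton op requires a CUDA device):

    import torch

    import vllm.lora.ops.triton_ops  # noqa: F401 (registers torch.ops.vllm.*)
    from vllm.lora.ops.torch_ops import bgmv_shrink

    num_tokens, hidden_size, rank, num_loras, scaling = 16, 128, 8, 4, 0.5
    inputs_tensor = torch.zeros(num_tokens, hidden_size,
                                dtype=torch.float16, device="cuda")
    lora_weights = torch.zeros(num_loras, rank, hidden_size,
                               dtype=torch.float16, device="cuda")
    indices = torch.zeros(num_tokens, dtype=torch.long, device="cuda")
    our_out_tensor = torch.zeros(num_tokens, rank,
                                 dtype=torch.float32, device="cuda")
    ref_out_tensor = torch.zeros(num_tokens, rank,
                                 dtype=torch.float16, device="cuda")

    # Kernel under test vs. eager reference, then elementwise comparison.
    torch.ops.vllm.bgmv_shrink(inputs_tensor, lora_weights, our_out_tensor,
                               indices, scaling)
    bgmv_shrink(inputs_tensor, lora_weights, ref_out_tensor, indices, scaling)
    ref_out_tensor = ref_out_tensor.to(torch.float32)  # shrink emits fp32
    torch.testing.assert_close(our_out_tensor, ref_out_tensor)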
with _dict_lock: _LORA_A_PTR_DICT.clear() - sgmv_shrink( + torch.ops.vllm.sgmv_shrink( inputs_tensor, lora_weights_lst, our_out_tensor, @@ -187,20 +186,23 @@ scaling, ) for index in range(nslices): - ref_torch_groupgemm( - ref_out_tensor[index], + sgmv_shrink( inputs_tensor, lora_weights_lst[index], - lora_indices_tensor, + ref_out_tensor[index], + b_seq_start_loc, seq_len_tensor, + lora_indices_tensor, batches, + max_seq_length, + token_nums, scaling, - op_type, ) + else: with _dict_lock: _LORA_B_PTR_DICT.clear() - sgmv_expand( + torch.ops.vllm.sgmv_expand( inputs_tensor, lora_weights_lst, our_out_tensor, @@ -213,21 +215,39 @@ offset_start=0, add_inputs=True, ) - - slice_offset = 0 - for index in range(nslices): - lora_weights = lora_weights_lst[index] - ref_torch_groupgemm( - ref_out_tensor[:, slice_offset:slice_offset + hidden_size], - inputs_tensor[index], - lora_weights, - lora_indices_tensor, + if nslices == 1: + # Verify torch's sgmv_expand op + sgmv_expand( + inputs_tensor[0], + lora_weights_lst[0], + ref_out_tensor, + b_seq_start_loc, seq_len_tensor, + lora_indices_tensor, batches, - 1.0, - op_type, + max_seq_length, + token_nums, + add_inputs=True, ) - slice_offset += hidden_size + else: + slice_offset = 0 + for index in range(nslices): + lora_weights = lora_weights_lst[index] + sgmv_expand_slice( + inputs_tensor[index], + lora_weights, + ref_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + token_nums, + slice_offset, + hidden_size, + add_inputs=True, + ) + slice_offset += hidden_size assert_close(our_out_tensor, ref_out_tensor) @@ -240,7 +260,7 @@ @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("op_type", ["shrink", "expand"]) @pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_punica_bgmv( batches: int, num_loras: int, @@ -276,31 +296,38 @@ device, ) if op_type == "shrink": - bgmv_shrink( + torch.ops.vllm.bgmv_shrink( inputs_tensor, lora_weights, our_out_tensor, indices, scaling, ) + + bgmv_shrink( + inputs_tensor, + lora_weights, + ref_out_tensor, + indices, + scaling, + ) + else: - bgmv_expand( + torch.ops.vllm.bgmv_expand( inputs_tensor, lora_weights, our_out_tensor, indices, add_inputs=True, ) - ref_torch_groupgemm( - ref_out_tensor, - inputs_tensor, - lora_weights, - lora_indices_tensor, - seq_len_tensor, - batches, - scaling if op_type == "shrink" else 1.0, - op_type, - ) + bgmv_expand( + inputs_tensor, + lora_weights, + ref_out_tensor, + indices, + add_inputs=True, + ) + if op_type == "shrink": ref_out_tensor = ref_out_tensor.to(torch.float32) assert_close(our_out_tensor, ref_out_tensor) @@ -313,7 +340,7 @@ @pytest.mark.parametrize("nslices", [2, 3]) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_punica_bgmv_expand_nslices( batches: int, num_loras: int, @@ -350,7 +377,7 @@ slice_offset = 0 for index in range(nslices): lora_weights = lora_weights_lst[index] - bgmv_expand_slice( + torch.ops.vllm.bgmv_expand_slice( inputs_tensor, lora_weights, our_outputs, @@ -359,15 +386,14 @@ slice_size=hidden_size, add_inputs=True, ) - ref_torch_groupgemm( - ref_outputs[:, slice_offset:slice_offset + 
hidden_size], + bgmv_expand_slice( inputs_tensor, lora_weights, - lora_indices_tensor, - seq_len_tensor, - batches, - 1.0, - op_type="expand", + ref_outputs, + indices, + slice_offset, + slice_size=hidden_size, + add_inputs=True, ) slice_offset += hidden_size diff --git a/tests/lora/test_punica_variation.py b/tests/lora/test_punica_ops_variation.py similarity index 74% rename from tests/lora/test_punica_variation.py rename to tests/lora/test_punica_ops_variation.py index 9ee10e7c23ee6..2bb84c1cf11e9 100644 --- a/tests/lora/test_punica_variation.py +++ b/tests/lora/test_punica_ops_variation.py @@ -9,19 +9,18 @@ import pytest import torch # Enable custom op register -import vllm.lora.ops.bgmv_expand -import vllm.lora.ops.bgmv_expand_slice -import vllm.lora.ops.bgmv_shrink -import vllm.lora.ops.sgmv_expand -import vllm.lora.ops.sgmv_shrink # noqa: F401 -from vllm.lora.ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT +import vllm.lora.ops.triton_ops # noqa: F401 +from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice, + bgmv_shrink, sgmv_expand, + sgmv_expand_slice, sgmv_shrink) +from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT from vllm.platforms import current_platform from .utils import (assert_close, generate_data, generate_data_for_expand_nslices, - generate_data_for_nslices, ref_torch_groupgemm) + generate_data_for_nslices) -HIDDEN_SIZES = [4097] +HIDDEN_SIZES = [2049] BATCHES = [1, 4, 16, 32] NUM_LORA = [1, 8, 32, 128] @@ -29,15 +28,7 @@ DTYPES = [torch.float16, torch.bfloat16] MAX_RANKS = [1, 4, 8, 16, 32, 64, 128, 256] SCALES = [0.5] SEED = [0] -CUDA_DEVICES = [f"cuda:{0}"] - -# Unlike test_punica_sizes.py, we directly utilize custom op for -# testing, which verifies the correct registration of these ops. -bgmv_expand = torch.ops.vllm.bgmv_expand -bgmv_expand_slice = torch.ops.vllm.bgmv_expand_slice -bgmv_shrink = torch.ops.vllm.bgmv_shrink -sgmv_expand = torch.ops.vllm.sgmv_expand -sgmv_shrink = torch.ops.vllm.sgmv_shrink +DEVICES = [f"cuda:{0}"] _dict_lock = Lock() @@ -51,7 +42,7 @@ _dict_lock = Lock() @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("op_type", ["shrink", "expand"]) @pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_punica_sgmv( batches: int, num_loras: int, @@ -98,7 +89,7 @@ def test_punica_sgmv( # Preventing cache error pointer. 
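The aliases removed above (bgmv_expand = torch.ops.vllm.bgmv_expand and friends) highlight the other effect of this rewrite: both punica test files now import vllm.lora.ops.triton_ops purely for its side effect of registering the kernels with torch.library, and every invocation goes through the registry, so each test doubles as a registration check. Schematically (a sketch; only the op and module names are taken from this diff):

    import torch

    import vllm.lora.ops.triton_ops  # noqa: F401 -- the import registers the ops

    # After the import, the kernel resolves through the torch op registry;
    # a missing registration would surface as an AttributeError here.
    sgmv_shrink_op = torch.ops.vllm.sgmv_shrink
    print(sgmv_shrink_op)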
with _dict_lock: _LORA_A_PTR_DICT.clear() - sgmv_shrink( + torch.ops.vllm.sgmv_shrink( inputs_tensor, lora_weights_lst, our_out_tensor, @@ -111,20 +102,23 @@ scaling, ) for index in range(nslices): - ref_torch_groupgemm( - ref_out_tensor[index], + sgmv_shrink( inputs_tensor, lora_weights_lst[index], - lora_indices_tensor, + ref_out_tensor[index], + b_seq_start_loc, seq_len_tensor, + lora_indices_tensor, batches, + max_seq_length, + token_nums, scaling, - op_type, ) + else: with _dict_lock: _LORA_B_PTR_DICT.clear() - sgmv_expand( + torch.ops.vllm.sgmv_expand( inputs_tensor, lora_weights_lst, our_out_tensor, @@ -137,21 +131,39 @@ offset_start=0, add_inputs=True, ) - slice_offset = 0 - for index in range(nslices): - lora_weights = lora_weights_lst[index] - ref_torch_groupgemm( - ref_out_tensor[:, slice_offset:slice_offset + hidden_size], - inputs_tensor[index], - lora_weights, - lora_indices_tensor, + if nslices == 1: + # Verify torch's sgmv_expand op + sgmv_expand( + inputs_tensor[0], + lora_weights_lst[0], + ref_out_tensor, + b_seq_start_loc, seq_len_tensor, + lora_indices_tensor, batches, - 1.0, - op_type, + max_seq_length, + token_nums, + add_inputs=True, ) - slice_offset += hidden_size + else: + slice_offset = 0 + for index in range(nslices): + lora_weights = lora_weights_lst[index] + sgmv_expand_slice( + inputs_tensor[index], + lora_weights, + ref_out_tensor, + b_seq_start_loc, + seq_len_tensor, + lora_indices_tensor, + batches, + max_seq_length, + token_nums, + slice_offset, + hidden_size, + add_inputs=True, + ) + slice_offset += hidden_size assert_close(our_out_tensor, ref_out_tensor) @@ -164,7 +176,7 @@ @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("op_type", ["shrink", "expand"]) @pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_punica_bgmv( batches: int, num_loras: int, @@ -176,7 +188,6 @@ seed: int, device: str, ): - torch.set_default_device(device) current_platform.seed_everything(seed) @@ -201,32 +212,38 @@ device, ) if op_type == "shrink": - bgmv_shrink( + torch.ops.vllm.bgmv_shrink( inputs_tensor, lora_weights, our_out_tensor, indices, scaling, ) - else: - bgmv_expand( + bgmv_shrink( + inputs_tensor, + lora_weights, + ref_out_tensor, + indices, + scaling, + ) + + else: + torch.ops.vllm.bgmv_expand( inputs_tensor, lora_weights, our_out_tensor, indices, add_inputs=True, ) - ref_torch_groupgemm( - ref_out_tensor, - inputs_tensor, - lora_weights, - lora_indices_tensor, - seq_len_tensor, - batches, - scaling if op_type == "shrink" else 1.0, - op_type, - ) + bgmv_expand( + inputs_tensor, + lora_weights, + ref_out_tensor, + indices, + add_inputs=True, + ) + if op_type == "shrink": ref_out_tensor = ref_out_tensor.to(torch.float32) assert_close(our_out_tensor, ref_out_tensor) @@ -239,7 +256,7 @@ @pytest.mark.parametrize("nslices", [2, 3]) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_punica_bgmv_expand_nslices( batches: int, num_loras: int, @@ -276,7 +293,7 @@ slice_offset = 0 for index in range(nslices): lora_weights = lora_weights_lst[index] - bgmv_expand_slice( + torch.ops.vllm.bgmv_expand_slice( inputs_tensor, lora_weights, our_outputs, @@ -285,15 +302,14 @@ def 
test_punica_bgmv_expand_nslices( slice_size=hidden_size, add_inputs=True, ) - ref_torch_groupgemm( - ref_outputs[:, slice_offset:slice_offset + hidden_size], + bgmv_expand_slice( inputs_tensor, lora_weights, - lora_indices_tensor, - seq_len_tensor, - batches, - 1.0, - op_type="expand", + ref_outputs, + indices, + slice_offset, + slice_size=hidden_size, + add_inputs=True, ) slice_offset += hidden_size diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 026269667b473..26bf770cc0d4a 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -72,7 +72,8 @@ def do_sample(llm: vllm.LLM, @pytest.mark.parametrize("tp_size", [1]) def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model, tp_size): - if num_gpus_available < tp_size: + if num_gpus_available < tp_size and \ + tp_size > 1 and current_platform.is_cuda_alike(): pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") llm = vllm.LLM( diff --git a/tests/lora/utils.py b/tests/lora/utils.py index b66d18074a7bf..ce47546f2154b 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -104,33 +104,6 @@ def assert_close(a, b): torch.testing.assert_close(a, b, rtol=rtol, atol=atol) -def ref_torch_groupgemm( - out_tensor, - inputs, - lora_weights, - lora_indices_tensor, - seq_len_tensor, - batches, - scaling, - op_type, -) -> torch.Tensor: - out_list = [] - current_offset = 0 - for lora_index, b_length in zip(range(batches), seq_len_tensor): - input_weight = inputs[current_offset:b_length + current_offset, :] - current_offset += b_length - lora_weight = lora_weights[lora_indices_tensor[lora_index]] - result = torch.nn.functional.linear(input_weight, lora_weight) - result *= scaling - out_list.append(result) - cat_result = torch.cat(out_list, dim=0) - if op_type == "expand": - out_tensor += cat_result - else: - out_tensor.copy_(cat_result) - return - - def generate_data( batches, hidden_size, diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index ed321ba9f00c1..9c1f784c1c93b 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -2,7 +2,7 @@ import os import pytest -from vllm.model_executor.layers.pooler import PoolingType +from vllm.model_executor.layers.pooler import CLSPool, PoolingType from vllm.model_executor.models.bert import BertEmbeddingModel from vllm.model_executor.models.roberta import RobertaEmbeddingModel from vllm.platforms import current_platform @@ -25,13 +25,12 @@ def test_model_loading_with_params(vllm_runner): with vllm_runner(model_name=MODEL_NAME, revision=REVISION, dtype="float16", - max_model_len=MAX_MODEL_LEN) as model: - output = model.encode("Write a short story about a robot that" - " dreams for the first time.\n") + max_model_len=MAX_MODEL_LEN) as vllm_model: + output = vllm_model.encode("Write a short story about a robot that" + " dreams for the first time.\n") - model_config = model.model.llm_engine.model_config - - model_tokenizer = model.model.llm_engine.tokenizer + model_config = vllm_model.model.llm_engine.model_config + model_tokenizer = vllm_model.model.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -46,11 +45,13 @@ def test_model_loading_with_params(vllm_runner): assert model_tokenizer.tokenizer_config["do_lower_case"] assert model_tokenizer.tokenizer.model_max_length == 512 - model = 
model.model.llm_engine.model_executor\ - .driver_worker.model_runner.model - assert isinstance(model, BertEmbeddingModel) - assert model._pooler.pooling_type == PoolingType.CLS - assert model._pooler.normalize + def check_model(model): + assert isinstance(model, BertEmbeddingModel) + assert model._pooler.pooling_type == PoolingType.CLS + assert model._pooler.normalize + + vllm_model.apply_model(check_model) + # assert output assert output @@ -64,13 +65,12 @@ def test_roberta_model_loading_with_params(vllm_runner): with vllm_runner(model_name=MODEL_NAME_ROBERTA, revision=REVISION_ROBERTA, dtype="float16", - max_model_len=MAX_MODEL_LEN) as model: - output = model.encode("Write a short story about a robot that" - " dreams for the first time.\n") + max_model_len=MAX_MODEL_LEN) as vllm_model: + output = vllm_model.encode("Write a short story about a robot that" + " dreams for the first time.\n") - model_config = model.model.llm_engine.model_config - - model_tokenizer = model.model.llm_engine.tokenizer + model_config = vllm_model.model.llm_engine.model_config + model_tokenizer = vllm_model.model.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -84,11 +84,38 @@ def test_roberta_model_loading_with_params(vllm_runner): assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-large" assert not model_tokenizer.tokenizer_config["do_lower_case"] - model = model.model.llm_engine.model_executor\ - .driver_worker.model_runner.model - assert isinstance(model, RobertaEmbeddingModel) - assert model._pooler.pooling_type == PoolingType.MEAN - assert model._pooler.normalize + def check_model(model): + assert isinstance(model, RobertaEmbeddingModel) + assert model._pooler.pooling_type == PoolingType.MEAN + assert model._pooler.normalize + + vllm_model.apply_model(check_model) # assert output assert output + + +@pytest.mark.skipif(current_platform.is_rocm(), + reason="Xformers backend is not supported on ROCm.") +def test_facebook_roberta_model_loading_with_params(vllm_runner): + """ + Test loading roberta-base model with no lm_head. 
+ """
+ """ + model_name = "FacebookAI/roberta-base" + with vllm_runner(model_name=model_name, + dtype="float16", + max_model_len=MAX_MODEL_LEN) as vllm_model: + output = vllm_model.encode("Write a short story about a robot that" + " dreams for the first time.\n") + + model_tokenizer = vllm_model.model.llm_engine.tokenizer + assert model_tokenizer.tokenizer_id == model_name + + def check_model(model): + assert isinstance(model, RobertaEmbeddingModel) + assert not hasattr(model, "lm_head") + assert isinstance(model._pooler, CLSPool) + + vllm_model.apply_model(check_model) + + assert output diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index 0bb98df1b58e6..1e329dc4cb22e 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -237,8 +237,8 @@ def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str, @pytest.mark.asyncio -async def test_online_inference(client, audio_assets): - """Exercises online inference with/without chunked prefill enabled.""" +async def test_online_serving(client, audio_assets): + """Exercises online serving with/without chunked prefill enabled.""" messages = [{ "role": diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py index 2b8f5e2faa45e..81b93ebdf0fc0 100644 --- a/tests/models/decoder_only/language/test_gguf.py +++ b/tests/models/decoder_only/language/test_gguf.py @@ -4,6 +4,7 @@ Note: To pass the test, quantization higher than Q4 should be used """ import os +from typing import List, NamedTuple, Type import pytest from huggingface_hub import hf_hub_download @@ -11,6 +12,7 @@ from transformers import AutoTokenizer from tests.quantization.utils import is_quant_method_supported +from ....conftest import VllmRunner from ...utils import check_logprobs_close os.environ["TOKENIZERS_PARALLELISM"] = "true" @@ -18,31 +20,74 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true" MAX_MODEL_LEN = 1024 +class GGUFTestConfig(NamedTuple): + original_model: str + gguf_repo: str + gguf_filename: str + + @property + def gguf_model(self): + return hf_hub_download(self.gguf_repo, filename=self.gguf_filename) + + +LLAMA_CONFIG = GGUFTestConfig( + original_model="meta-llama/Llama-3.2-1B-Instruct", + gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF", + gguf_filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf", +) + +QWEN2_CONFIG = GGUFTestConfig( + original_model="Qwen/Qwen2.5-1.5B-Instruct", + gguf_repo="Qwen/Qwen2.5-1.5B-Instruct-GGUF", + gguf_filename="qwen2.5-1.5b-instruct-q6_k.gguf", +) + +PHI3_CONFIG = GGUFTestConfig( + original_model="microsoft/Phi-3.5-mini-instruct", + gguf_repo="bartowski/Phi-3.5-mini-instruct-GGUF", + gguf_filename="Phi-3.5-mini-instruct-IQ4_XS.gguf", +) + +GPT2_CONFIG = GGUFTestConfig( + original_model="openai-community/gpt2-large", + gguf_repo="QuantFactory/gpt2-large-GGUF", + gguf_filename="gpt2-large.Q4_K_M.gguf", +) + +STABLELM_CONFIG = GGUFTestConfig( + original_model="stabilityai/stablelm-3b-4e1t", + gguf_repo="afrideva/stablelm-3b-4e1t-GGUF", + gguf_filename="stablelm-3b-4e1t.q4_k_m.gguf", +) + +STARCODER_CONFIG = GGUFTestConfig( + original_model="bigcode/starcoder2-3b", + gguf_repo="QuantFactory/starcoder2-3b-GGUF", + gguf_filename="starcoder2-3b.Q6_K.gguf", +) + +MODELS = [ + LLAMA_CONFIG, + QWEN2_CONFIG, + PHI3_CONFIG, + GPT2_CONFIG, + STABLELM_CONFIG, + # STARCODER_CONFIG, # broken +] + + @pytest.mark.skipif(not 
is_quant_method_supported("gguf"), reason="gguf is not supported on this GPU type.") -@pytest.mark.parametrize(("original_model", "gguf_id", "gguf_path"), [ - ("meta-llama/Llama-3.2-1B-Instruct", - "bartowski/Llama-3.2-1B-Instruct-GGUF", - "Llama-3.2-1B-Instruct-Q4_K_M.gguf"), - ("meta-llama/Llama-3.2-1B-Instruct", - "bartowski/Llama-3.2-1B-Instruct-GGUF", - "Llama-3.2-1B-Instruct-IQ4_XS.gguf"), - ("Qwen/Qwen2-1.5B-Instruct", "Qwen/Qwen2-1.5B-Instruct-GGUF", - "qwen2-1_5b-instruct-q4_k_m.gguf"), - ("Qwen/Qwen2-1.5B-Instruct", "legraphista/Qwen2-1.5B-Instruct-IMat-GGUF", - "Qwen2-1.5B-Instruct.IQ4_XS.gguf"), -]) +@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("tp_size", [1, 2]) def test_models( - num_gpus_available, - vllm_runner, - example_prompts, - original_model, - gguf_id, - gguf_path, + num_gpus_available: int, + vllm_runner: Type[VllmRunner], + example_prompts: List[str], + model: GGUFTestConfig, dtype: str, max_tokens: int, num_logprobs: int, @@ -51,28 +96,26 @@ def test_models( if num_gpus_available < tp_size: pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") - gguf_model = hf_hub_download(gguf_id, filename=gguf_path) - - tokenizer = AutoTokenizer.from_pretrained(original_model) - messages = [[{ - 'role': 'user', - 'content': prompt - }] for prompt in example_prompts] - example_prompts = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + tokenizer = AutoTokenizer.from_pretrained(model.original_model) + if tokenizer.chat_template is not None: + messages = [[{ + 'role': 'user', + 'content': prompt + }] for prompt in example_prompts] + example_prompts = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True) # Run unquantized model. - with vllm_runner(model_name=original_model, + with vllm_runner(model_name=model.original_model, dtype=dtype, max_model_len=MAX_MODEL_LEN, tensor_parallel_size=tp_size) as original_model: - original_outputs = original_model.generate_greedy_logprobs( example_prompts[:-1], max_tokens, num_logprobs) # Run gguf model. - with vllm_runner(model_name=gguf_model, + with vllm_runner(model_name=model.gguf_model, + tokenizer_name=model.original_model, dtype=dtype, max_model_len=MAX_MODEL_LEN, tensor_parallel_size=tp_size) as gguf_model: diff --git a/tests/models/decoder_only/language/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py index 057b04349e8b7..2e06b10fbb827 100644 --- a/tests/models/decoder_only/language/test_jamba.py +++ b/tests/models/decoder_only/language/test_jamba.py @@ -33,10 +33,13 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. 
- model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py index 06739e8f02253..1ad4f5aae8f5b 100644 --- a/tests/models/decoder_only/language/test_mamba.py +++ b/tests/models/decoder_only/language/test_mamba.py @@ -51,10 +51,13 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 2a7ed8826d2f3..c7efa4edbbc0a 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -48,6 +48,10 @@ from ...utils import check_logprobs_close ), pytest.param("stabilityai/stablelm-3b-4e1t"), # stablelm pytest.param("bigcode/starcoder2-3b"), # starcoder2 + pytest.param( + "ehristoforu/Falcon3-MoE-2x7B-Insruct", # mixtral + marks=[pytest.mark.cpu_model], + ) ]) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) @@ -69,10 +73,13 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. 
- model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) check_logprobs_close( outputs_0_lst=hf_outputs, diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 146685738a1d0..ca572cc39e538 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -9,6 +9,7 @@ from typing import Type import pytest from transformers import AutoModelForVision2Seq +from transformers import __version__ as TRANSFORMERS_VERSION from transformers.utils import is_flash_attn_2_available from vllm.platforms import current_platform @@ -188,6 +189,30 @@ VLM_TEST_SETTINGS = { max_tokens=8, dtype="bfloat16", ), + "deepseek_vl_v2": VLMTestInfo( + models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501 + max_model_len=4096, + max_num_seqs=2, + single_image_prompts=IMAGE_ASSETS.prompts({ + "stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501 + "cherry_blossom": "<image>\nPlease infer the season with reason in details.", # noqa: E501 + }), + multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501 + vllm_runner_kwargs={"hf_overrides": {"architectures": ["DeepseekVLV2ForCausalLM"]}}, # noqa: E501 + patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner, + postprocess_inputs=model_utils.cast_dtype_post_processor("images"), + hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output, + stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501 + image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)], + marks=[ + pytest.mark.skipif( + TRANSFORMERS_VERSION >= "4.48.0", + reason="HF model is not compatible with transformers>=4.48.0", + ) + ], + ), "fuyu": VLMTestInfo( models=["adept/fuyu-8b"], test_type=VLMTestType.IMAGE, diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py index 16e256e040a74..5a485f3d81747 100644 --- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -5,7 +5,6 @@ import pytest import torch from PIL import Image -from vllm.entrypoints.llm import LLM from vllm.multimodal.image import rescale_image_size from vllm.multimodal.video import rescale_video_size, sample_frames_from_video @@ -69,7 +68,7 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict): def batch_make_image_embeddings( image_batches: List[Union[Image.Image, List[Image.Image]]], processor, - llm: LLM) -> List[Qwen2VLPromptImageEmbeddingInput]: + llm: VllmRunner) -> List[Qwen2VLPromptImageEmbeddingInput]: """batched image embeddings for Qwen2-VL This will infer all images' embeddings in a single batch, @@ -105,17 +104,19 @@ pixel_values = preprocess_result["pixel_values"] image_grid_thw = preprocess_result["image_grid_thw"] - # pixel values to embeddinds & grid_thws - with torch.no_grad(): - visual = llm.llm_engine.model_executor.driver_worker. 
\ - model_runner.model.visual + # pixel values to embeddings & grid_thws + def get_image_embeds(model): + with torch.no_grad(): + visual = model.visual - pixel_values_on_device = pixel_values.to(visual.device, - dtype=visual.dtype) - image_grid_thw_on_device = image_grid_thw.to(visual.device, - dtype=torch.int64) - image_embeds = visual(pixel_values_on_device, - grid_thw=image_grid_thw_on_device) + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + image_grid_thw_on_device = image_grid_thw.to(visual.device, + dtype=torch.int64) + return visual(pixel_values_on_device, + grid_thw=image_grid_thw_on_device) + + image_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches result: List[Qwen2VLPromptImageEmbeddingInput] = [] @@ -124,11 +125,10 @@ def batch_make_image_embeddings( for image_batch in image_batches_: cur_batch_image_count = len(image_batch) merge_size = image_processor.merge_size - cur_batch_embed_len = sum([ - grid_thw.prod() // merge_size // merge_size + cur_batch_embed_len = sum( + grid_thw.prod(-1) // merge_size // merge_size for grid_thw in image_grid_thw[image_counter:image_counter + - cur_batch_image_count] - ]) + cur_batch_image_count]) result.append({ "image_embeds": @@ -151,7 +151,7 @@ def batch_make_image_embeddings( def batch_make_video_embeddings( video_batches: PromptVideoInput, processor, - llm: LLM) -> List[Qwen2VLPromptVideoEmbeddingInput]: + llm: VllmRunner) -> List[Qwen2VLPromptVideoEmbeddingInput]: """batched video embeddings for Qwen2-VL A NDArray represents a single video's all frames. @@ -187,17 +187,19 @@ def batch_make_video_embeddings( pixel_values = preprocess_result["pixel_values_videos"] video_grid_thw = preprocess_result["video_grid_thw"] - # pixel values to embeddinds & grid_thws - with torch.no_grad(): - visual = llm.llm_engine.model_executor.driver_worker.\ - model_runner.model.visual + # pixel values to embeddings & grid_thws + def get_image_embeds(model): + with torch.no_grad(): + visual = model.visual - pixel_values_on_device = pixel_values.to(visual.device, - dtype=visual.dtype) - video_grid_thw_on_device = video_grid_thw.to(visual.device, - dtype=torch.int64) - video_embeds = visual(pixel_values_on_device, - grid_thw=video_grid_thw_on_device) + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + video_grid_thw_on_device = video_grid_thw.to(visual.device, + dtype=torch.int64) + return visual(pixel_values_on_device, + grid_thw=video_grid_thw_on_device) + + video_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches result: List[Qwen2VLPromptVideoEmbeddingInput] = [] @@ -206,11 +208,10 @@ def batch_make_video_embeddings( for video_batch in video_batches_: cur_batch_video_count = len(video_batch) merge_size = image_processor.merge_size - cur_batch_embed_len = sum([ - grid_thw.prod() // merge_size // merge_size + cur_batch_embed_len = sum( + grid_thw.prod(-1) // merge_size // merge_size for grid_thw in video_grid_thw[video_counter:video_counter + - cur_batch_video_count] - ]) + cur_batch_video_count]) result.append({ "video_embeds": @@ -280,9 +281,9 @@ def run_embedding_input_test( max_tokens, num_logprobs=num_logprobs, images=batch_make_image_embeddings( - images, processor, vllm_model.model) if images else None, + images, processor, vllm_model) if images else None, videos=batch_make_video_embeddings( - videos, processor, vllm_model.model) if videos else None) + videos, processor, vllm_model) if videos else None) for prompts, 
images, videos in inputs ] diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 6c7a753af787e..1ca85c7bb2056 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -183,6 +183,14 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput, ####### Post-processors for HF outputs +def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput, + model: str) -> RunnerOutput: + output_ids, output_str, out_logprobs = hf_output + if output_str.endswith("<|end▁of▁sentence|>"): + output_str = output_str.split("<|end▁of▁sentence|>")[0] + return output_ids, output_str, out_logprobs + + def minicpmv_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput: output_ids, output_str, out_logprobs = hf_output @@ -261,6 +269,34 @@ def qwen_prompt_path_encoder( ####### Model-specific HuggingFace runner patchers +def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + """Patches and returns an instance of the HfRunner to use for DeepSeek-VL2.""" + hf_processor = hf_model.processor + + def processor(*args, text="", images=None, **kwargs): + if isinstance(images, Image): + images = [images] + # inputs is a custom class instead of dict or BatchFeature + inputs = hf_processor( + *args, + prompt=text, + images=images, + **kwargs, + ) + inputs = { + k: inputs[k] + for k in inputs.keys() # noqa + if k not in ("seq_lens", "sft_format") + } + inputs = BatchEncoding(data=inputs, tensor_type="pt") + return inputs + + hf_model.processor = processor + hf_model.model.get_output_embeddings = lambda: \ + hf_model.model.language.model.embed_tokens + return hf_model + + def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner: """Patches and returns an instance of the HfRunner to use for GLM4.""" hf_processor = hf_model.processor diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py index 6673a9fc22f69..0cbe4afe96c0a 100644 --- a/tests/models/embedding/language/test_cls_models.py +++ b/tests/models/embedding/language/test_cls_models.py @@ -24,10 +24,13 @@ def test_classification_models( ) -> None: with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.classify(example_prompts) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. 
- model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) with hf_runner(model, dtype=dtype, diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index 7749806548cd9..e17198e385475 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -17,14 +17,16 @@ from ..utils import check_embeddings_close marks=[pytest.mark.core_model, pytest.mark.cpu_model]), pytest.param("sentence-transformers/all-MiniLM-L12-v2"), pytest.param("intfloat/multilingual-e5-large"), - # [Encoder-decoder] - pytest.param("intfloat/e5-mistral-7b-instruct", - marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + # [Decoder-only] pytest.param("BAAI/bge-multilingual-gemma2", marks=[pytest.mark.core_model]), - pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), + pytest.param("intfloat/e5-mistral-7b-instruct", + marks=[pytest.mark.core_model, pytest.mark.cpu_model]), pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"), + pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), + # [Encoder-decoder] + pytest.param("sentence-transformers/stsb-roberta-base-v2"), ], ) @pytest.mark.parametrize("dtype", ["half"]) @@ -60,10 +62,13 @@ def test_models( max_model_len=None, **vllm_extra_kwargs) as vllm_model: vllm_outputs = vllm_model.encode(example_prompts) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) check_embeddings_close( embeddings_0_lst=hf_outputs, diff --git a/tests/models/decoder_only/vision_language/processing/__init__.py b/tests/models/multimodal/__init__.py similarity index 100% rename from tests/models/decoder_only/vision_language/processing/__init__.py rename to tests/models/multimodal/__init__.py diff --git a/tests/models/multimodal/processing/__init__.py b/tests/models/multimodal/processing/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py new file mode 100644 index 0000000000000..1e3e7ea50b122 --- /dev/null +++ b/tests/models/multimodal/processing/test_common.py @@ -0,0 +1,204 @@ +from functools import partial + +import numpy as np +import pytest +from PIL import Image + +from vllm.config import ModelConfig +from vllm.inputs import InputProcessingContext +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.processing import ProcessingCache +from vllm.multimodal.utils import cached_get_tokenizer + +from ....multimodal.utils import random_audio, random_image, random_video + + +def _test_processing_correctness( + model_id: str, + modalities: dict[str, bool], + hit_rate: float, + num_batches: int, + simplify_rate: float, +): + if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3": + hf_overrides = {"architectures": ["MantisForConditionalGeneration"]} + elif model_id == "deepseek-ai/deepseek-vl2-tiny": + hf_overrides = {"architectures": ["DeepseekVLV2ForCausalLM"]} + else: + hf_overrides = {} + + limit_mm_per_prompt = { + modality: 3 if supports_multi else 1 + for modality, supports_multi in modalities.items() + } + + model_config = ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=True, + 
seed=0, + dtype="float16", + revision=None, + hf_overrides=hf_overrides, + limit_mm_per_prompt=limit_mm_per_prompt, + ) + + model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) + factories = MULTIMODAL_REGISTRY._processor_factories[model_cls] + ctx = InputProcessingContext( + model_config, + tokenizer=cached_get_tokenizer(model_config.tokenizer), + ) + # Ensure that it can fit all of the data + cache = ProcessingCache(capacity=1 << 30) + + baseline_processor = factories.build_processor(ctx, cache=None) + cached_processor = factories.build_processor(ctx, cache=cache) + dummy_inputs = baseline_processor.dummy_inputs + tokenizer = baseline_processor.info.get_tokenizer() + + rng = np.random.RandomState(0) + + input_to_hit = { + "image": Image.new("RGB", size=(128, 128)), + "video": np.zeros((4, 128, 128, 3), dtype=np.uint8), + "audio": (np.zeros((512, )), 16000), + } + input_factory = { + "image": + partial(random_image, rng, min_wh=128, max_wh=256), + "video": + partial(random_video, + rng, + min_frames=2, + max_frames=8, + min_wh=128, + max_wh=256), + "audio": + partial(random_audio, rng, min_len=512, max_len=1024, sr=16000), + } + + for batch_idx in range(num_batches): + mm_data = { + k: + [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]()) + for _ in range(rng.randint(limit_mm_per_prompt[k]))] + for k in modalities + } + + mm_counts = {k: len(vs) for k, vs in mm_data.items()} + prompt = dummy_inputs.get_dummy_processor_inputs( + model_config.max_model_len, + mm_counts, + ).prompt_text + + # Drop unnecessary keys and test single -> multi conversion + if rng.rand() < simplify_rate: + for k in list(mm_data.keys()): + if not mm_data[k]: + del mm_data[k] + elif len(mm_data[k]) == 1: + mm_data[k] = mm_data[k][0] + + baseline_result = baseline_processor.apply( + prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + cached_result = cached_processor.apply( + prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + assert baseline_result == cached_result, ( + f"Failed ({batch_idx=}, {prompt=}, {mm_data=})") + + baseline_tokenized_result = baseline_processor.apply( + tokenizer.encode(prompt), + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + assert baseline_result == baseline_tokenized_result, ( + f"Failed ({batch_idx=}, {prompt=}, {mm_data=})") + + cached_tokenized_result = cached_processor.apply( + tokenizer.encode(prompt), + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + assert cached_result == cached_tokenized_result, ( + f"Failed ({batch_idx=}, {prompt=}, {mm_data=})") + + +# yapf: disable +# True if the model supports multiple data items of the modality per request +@pytest.mark.parametrize(("model_id", "modalities"), [ + ("rhymes-ai/Aria", {"image": True}), + ("Salesforce/blip2-opt-2.7b", {"image": False}), + ("facebook/chameleon-7b", {"image": False}), + ("deepseek-ai/deepseek-vl2-tiny", {"image": True}), + ("adept/fuyu-8b", {"image": False}), + ("llava-hf/llava-1.5-7b-hf", {"image": True}), + ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}), + ("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}), + ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}), # noqa: E501 + ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}), + ("mistral-community/pixtral-12b", {"image": True}), + ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}), + ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}), + ("fixie-ai/ultravox-v0_3", {"audio": True}), +]) +@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) 
+@pytest.mark.parametrize("num_batches", [32]) +@pytest.mark.parametrize("simplify_rate", [1.0]) +# yapf: enable +def test_processing_correctness( + model_id: str, + modalities: dict[str, bool], + hit_rate: float, + num_batches: int, + simplify_rate: float, +): + _test_processing_correctness( + model_id, + modalities, + hit_rate=hit_rate, + num_batches=num_batches, + simplify_rate=simplify_rate, + ) + + +# yapf: disable +@pytest.mark.parametrize(("model_id", "modalities"), [ + ("microsoft/Phi-3-vision-128k-instruct", {"image": True}), +]) +@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) +@pytest.mark.parametrize("num_batches", [32]) +@pytest.mark.parametrize("simplify_rate", [1.0]) +# yapf: enable +def test_processing_correctness_phi3v( + model_id: str, + modalities: dict[str, bool], + hit_rate: float, + num_batches: int, + simplify_rate: float, +): + # HACK - this is an attempted workaround for the following bug + # https://github.com/huggingface/transformers/issues/34307 + from transformers import AutoImageProcessor # noqa: F401 + from transformers import AutoProcessor # noqa: F401 + + AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True) + + _test_processing_correctness( + model_id, + modalities, + hit_rate=hit_rate, + num_batches=num_batches, + simplify_rate=simplify_rate, + ) diff --git a/tests/models/decoder_only/vision_language/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py similarity index 98% rename from tests/models/decoder_only/vision_language/processing/test_idefics3.py rename to tests/models/multimodal/processing/test_idefics3.py index c71a2d359043d..69b91ad4a5df8 100644 --- a/tests/models/decoder_only/vision_language/processing/test_idefics3.py +++ b/tests/models/multimodal/processing/test_idefics3.py @@ -8,8 +8,8 @@ from transformers import AutoImageProcessor, AutoTokenizer from vllm.inputs import InputContext, token_inputs from vllm.multimodal import MultiModalRegistry -from .....conftest import _ImageAssets -from ....utils import build_model_context +from ....conftest import _ImageAssets +from ...utils import build_model_context models = ["HuggingFaceM4/Idefics3-8B-Llama3"] diff --git a/tests/models/decoder_only/vision_language/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py similarity index 98% rename from tests/models/decoder_only/vision_language/processing/test_internvl.py rename to tests/models/multimodal/processing/test_internvl.py index af0c2aa211998..d6c60595ca5ea 100644 --- a/tests/models/decoder_only/vision_language/processing/test_internvl.py +++ b/tests/models/multimodal/processing/test_internvl.py @@ -7,8 +7,8 @@ from transformers import AutoTokenizer from vllm.inputs import InputContext, token_inputs from vllm.multimodal import MultiModalRegistry -from .....conftest import _ImageAssets -from ....utils import build_model_context +from ....conftest import _ImageAssets +from ...utils import build_model_context models = ["OpenGVLab/InternVL2-2B"] diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py similarity index 65% rename from tests/models/decoder_only/vision_language/processing/test_llava_next.py rename to tests/models/multimodal/processing/test_llava_next.py index 689d17be81889..6de649f87204d 100644 --- a/tests/models/decoder_only/vision_language/processing/test_llava_next.py +++ b/tests/models/multimodal/processing/test_llava_next.py @@ -10,7 +10,68 @@ from vllm.multimodal.parse 
import ImageSize from vllm.multimodal.processing import BaseMultiModalProcessor from vllm.multimodal.utils import cached_get_tokenizer -from ....utils import build_model_context +from ...utils import build_model_context + + +def _validate_image_max_tokens_one( + processor: BaseMultiModalProcessor, + max_tokens: int, + failed_size_excs: list[tuple[ImageSize, Exception]], + image_size: ImageSize, +) -> None: + info = processor.info + feature_size = info.get_num_image_tokens(image_width=image_size.width, + image_height=image_size.height) + + try: + assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}" + except Exception as exc: + failed_size_excs.append((image_size, exc)) + + +@pytest.mark.skip("This test takes around 5 minutes to run. " + "Comment this out to run it manually.") +@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) +def test_processor_max_tokens(model_id): + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": 1}, + ) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) + info = processor.info + + seen_aspect_ratios = set[float]() + image_sizes = list[ImageSize]() + + # The aspect ratio of the grid layout is between 1 and 2 + # NOTE: Assumes that feature size calculation is the same if we + # swap the width and height of the image + for w, h in itertools.product(range(32, 4096), repeat=2): + aspect_ratio = w / h + if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios: + image_sizes.append(ImageSize(w, h)) + seen_aspect_ratios.add(aspect_ratio) + + failed_size_excs = list[tuple[ImageSize, Exception]]() + + validate_one = partial( + _validate_image_max_tokens_one, + processor, + info.get_max_image_tokens(), # type: ignore + failed_size_excs, + ) + pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes") + + if failed_size_excs: + msg = "Found failing image sizes:" \ + + "\n========\n".join(f"[{size}]\n{exc}" + for size, exc in failed_size_excs) + raise AssertionError(msg) def _validate_image_prompt_replacements_one( diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py similarity index 65% rename from tests/models/decoder_only/vision_language/processing/test_llava_onevision.py rename to tests/models/multimodal/processing/test_llava_onevision.py index a033354f0e9b8..806437d35ec87 100644 --- a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py +++ b/tests/models/multimodal/processing/test_llava_onevision.py @@ -10,7 +10,69 @@ from vllm.multimodal.parse import ImageSize from vllm.multimodal.processing import BaseMultiModalProcessor from vllm.multimodal.utils import cached_get_tokenizer -from ....utils import build_model_context +from ...utils import build_model_context + + +def _validate_image_max_tokens_one( + processor: BaseMultiModalProcessor, + max_tokens: int, + failed_size_excs: list[tuple[ImageSize, Exception]], + image_size: ImageSize, +) -> None: + info = processor.info + feature_size = info.get_num_image_tokens(image_width=image_size.width, + image_height=image_size.height) + + try: + assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}" + except Exception as exc: + failed_size_excs.append((image_size, exc)) + + +@pytest.mark.skip("This test takes around 5 minutes to run. 
" + "Comment this out to run it manually.") +@pytest.mark.parametrize("model_id", + ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]) +def test_processor_max_tokens(model_id): + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": 1}, + ) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) + info = processor.info + + seen_aspect_ratios = set[float]() + image_sizes = list[ImageSize]() + + # The aspect ratio of the grid layout is between 1 and 6 + # NOTE: Assumes that feature size calculation is the same if we + # swap the width and height of the image + for w, h in itertools.product(range(32, 4096), repeat=2): + aspect_ratio = w / h + if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios: + image_sizes.append(ImageSize(w, h)) + seen_aspect_ratios.add(aspect_ratio) + + failed_size_excs = list[tuple[ImageSize, Exception]]() + + validate_one = partial( + _validate_image_max_tokens_one, + processor, + info.get_max_image_tokens(), # type: ignore + failed_size_excs, + ) + pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes") + + if failed_size_excs: + msg = "Found failing image sizes:" \ + + "\n========\n".join(f"[{size}]\n{exc}" + for size, exc in failed_size_excs) + raise AssertionError(msg) def _validate_image_prompt_replacements_one( diff --git a/tests/models/decoder_only/vision_language/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py similarity index 95% rename from tests/models/decoder_only/vision_language/processing/test_phi3v.py rename to tests/models/multimodal/processing/test_phi3v.py index c5b77260c6544..7f82a8f18f0ca 100644 --- a/tests/models/decoder_only/vision_language/processing/test_phi3v.py +++ b/tests/models/multimodal/processing/test_phi3v.py @@ -4,8 +4,8 @@ import pytest from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.utils import cached_get_tokenizer -from .....conftest import _ImageAssets -from ....utils import build_model_context +from ....conftest import _ImageAssets +from ...utils import build_model_context @pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"]) diff --git a/tests/models/decoder_only/vision_language/processing/test_qwen.py b/tests/models/multimodal/processing/test_qwen.py similarity index 98% rename from tests/models/decoder_only/vision_language/processing/test_qwen.py rename to tests/models/multimodal/processing/test_qwen.py index 163220c91a27d..af0ace711ba3e 100644 --- a/tests/models/decoder_only/vision_language/processing/test_qwen.py +++ b/tests/models/multimodal/processing/test_qwen.py @@ -9,8 +9,8 @@ from vllm.inputs import InputContext, token_inputs from vllm.multimodal import MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer -from .....conftest import IMAGE_ASSETS -from ....utils import build_model_context +from ....conftest import IMAGE_ASSETS +from ...utils import build_model_context ### Multimodal preprocessing tests SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image diff --git a/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py similarity index 96% rename from tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py rename to tests/models/multimodal/processing/test_qwen2_vl.py index 0d54802f2b733..de14fbbffe5b7 100644 --- 
a/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py +++ b/tests/models/multimodal/processing/test_qwen2_vl.py @@ -3,8 +3,8 @@ import pytest from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.utils import cached_get_tokenizer -from .....conftest import _ImageAssets -from ....utils import build_model_context +from ....conftest import _ImageAssets +from ...utils import build_model_context @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) diff --git a/tests/models/registry.py b/tests/models/registry.py index dcb8bfa0f9510..9603ea8817cac 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -22,6 +22,11 @@ class _HfExamplesInfo: for speculative decoding. """ + min_transformers_version: Optional[str] = None + """ + The minimum version of HF Transformers that is required to run this model. + """ + is_available_online: bool = True """ Set this to ``False`` if the name of this architecture no longer exists on @@ -64,6 +69,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "DeepseekV3ForCausalLM": _HfExamplesInfo("deepseek-ai/DeepSeek-V3", # noqa: E501 trust_remote_code=True), "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501 + "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501 "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), "GemmaForCausalLM": _HfExamplesInfo("google/gemma-2b"), "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), @@ -80,6 +86,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "InternLM2VEForCausalLM": _HfExamplesInfo("OpenGVLab/Mono-InternVL-2B", trust_remote_code=True), + "InternLM3ForCausalLM": _HfExamplesInfo("internlm/internlm3-8b-instruct", + trust_remote_code=True), "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"), "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini"), "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B"), @@ -147,6 +155,7 @@ _EMBEDDING_EXAMPLE_MODELS = { "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"), "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"), + "Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B"), "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"), # noqa: E501 "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"), # noqa: E501 "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1"), # noqa: E501 @@ -174,6 +183,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { trust_remote_code=True), "ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b", is_available_online=False), + "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny"), # noqa: E501 "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"), "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 3b728f2744fca..daece7c93c0ef 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -1,7 +1,9 @@ from unittest.mock import patch import pytest +from packaging.version import Version from transformers import PretrainedConfig +from transformers import __version__ as TRANSFORMERS_VERSION from vllm import LLM @@ -13,9 +15,20 @@ def test_can_initialize(model_arch): 
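+    # The skip below is driven by `min_transformers_version` on the registry
+    # entry; an entry opts in like this (hypothetical architecture and
+    # version, for illustration only):
+    #   "SomeNewForCausalLM": _HfExamplesInfo("org/some-new-model",
+    #                                         min_transformers_version="4.49"),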
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) if not model_info.is_available_online: pytest.skip("Model is not available online") + if model_info.min_transformers_version is not None: + current_version = TRANSFORMERS_VERSION + required_version = model_info.min_transformers_version + if Version(current_version) < Version(required_version): + pytest.skip( + f"You have `transformers=={current_version}` installed, but " + f"`transformers>={required_version}` is required to run this " + "model") # Avoid OOM def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: + if hf_config.model_type == "deepseek_vl_v2": + hf_config.update({"architectures": ["DeepseekVLV2ForCausalLM"]}) + if hasattr(hf_config, "text_config"): text_config: PretrainedConfig = hf_config.text_config else: diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py index cc1fd19252019..34030d9d6ac60 100644 --- a/tests/multi_step/test_correctness_llm.py +++ b/tests/multi_step/test_correctness_llm.py @@ -5,6 +5,8 @@ from typing import Optional import pytest +from tests.kernels.utils import override_backend_env_variable + from ..models.utils import check_logprobs_close, check_outputs_equal MODELS = [ @@ -19,10 +21,11 @@ NUM_PROMPTS = [10] @pytest.mark.parametrize("tp_size", [1]) @pytest.mark.parametrize("enable_chunked_prefill", [False, True]) @pytest.mark.parametrize("max_tokens", [5]) -@pytest.mark.parametrize("enforce_eager", [True]) +@pytest.mark.parametrize("enforce_eager", [True, False]) @pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) @pytest.mark.parametrize("num_prompts", NUM_PROMPTS) @pytest.mark.parametrize("num_logprobs", [None, 5]) +@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN", "FLASHINFER"]) def test_multi_step_llm( hf_runner, vllm_runner, @@ -36,6 +39,8 @@ def test_multi_step_llm( num_scheduler_steps: int, num_prompts: int, num_logprobs: Optional[int], + attention_backend: str, + monkeypatch, ) -> None: """Test vLLM engine with multi-step scheduling via sync LLM Engine. @@ -63,6 +68,7 @@ def test_multi_step_llm( num_logprobs: corresponds to the `logprobs` argument to the OpenAI completions endpoint; `None` -> 1 logprob returned. """ + override_backend_env_variable(monkeypatch, attention_backend) prompts = example_prompts if len(prompts) < num_prompts: @@ -114,6 +120,7 @@ def test_multi_step_llm( @pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) @pytest.mark.parametrize("num_prompts", NUM_PROMPTS) @pytest.mark.parametrize("num_logprobs,num_prompt_logprobs", [(5, 5)]) +@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"]) def test_multi_step_llm_w_prompt_logprobs( vllm_runner, example_prompts, @@ -126,6 +133,8 @@ def test_multi_step_llm_w_prompt_logprobs( num_prompts: int, num_logprobs: Optional[int], num_prompt_logprobs: Optional[int], + attention_backend: str, + monkeypatch, ) -> None: """Test prompt logprobs with multi-step scheduling via sync LLM Engine. @@ -155,6 +164,7 @@ def test_multi_step_llm_w_prompt_logprobs( note that this argument is not supported by the OpenAI completions endpoint. 
""" + override_backend_env_variable(monkeypatch, attention_backend) prompts = example_prompts if len(prompts) < num_prompts: @@ -205,6 +215,7 @@ def test_multi_step_llm_w_prompt_logprobs( @pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) @pytest.mark.parametrize("num_prompts", NUM_PROMPTS) @pytest.mark.parametrize("num_logprobs", [None, 5]) +@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"]) def test_multi_step_llm_chunked_prefill_prefix_cache( vllm_runner, example_prompts, @@ -216,6 +227,8 @@ def test_multi_step_llm_chunked_prefill_prefix_cache( num_scheduler_steps: int, num_prompts: int, num_logprobs: Optional[int], + attention_backend: str, + monkeypatch, ) -> None: """Test vLLM engine with multi-step+"single-step chunked prefill"+APC. @@ -278,6 +291,8 @@ def test_multi_step_llm_chunked_prefill_prefix_cache( # # The Incorrect scheduling behavior - if it occurs - will cause an exception # in the model runner resulting from `do_sample=False`. + override_backend_env_variable(monkeypatch, attention_backend) + assert len(example_prompts) >= 2 challenge_prompts = copy.deepcopy(example_prompts) challenge_prompts[0] = ('vLLM is a high-throughput and memory-efficient ' diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index d98bd9736b65f..9e58ed4cfde93 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -1,30 +1,25 @@ from contextlib import nullcontext -from functools import partial from typing import cast from unittest.mock import MagicMock import numpy as np import pytest -from PIL import Image from vllm.config import ModelConfig -from vllm.inputs import InputProcessingContext from vllm.multimodal import MULTIMODAL_REGISTRY -# yapf conflicts with isort for this block -# yapf: disable -from vllm.multimodal.processing import (PlaceholderInfo, ProcessingCache, - PromptReplacement, +from vllm.multimodal.processing import (PlaceholderInfo, PromptReplacement, find_mm_placeholders, find_text_matches, find_token_matches, iter_token_matches, replace_text_matches, replace_token_matches) -# yapf: enable from vllm.multimodal.profiling import MultiModalProfiler from vllm.multimodal.utils import cached_get_tokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import full_groupby +from .utils import random_image + # yapf: disable @pytest.mark.parametrize( @@ -426,6 +421,8 @@ def test_find_replace_tokens( "pattern_1": [32000, 32000], "pattern_2": [], "pattern_3": [1550, 918, 1550], + # Test different modalities having the same tokens (32000) + "pattern_4": [32000], }, ], ) @@ -443,6 +440,14 @@ def test_find_replace_tokens( replacement=[32000, 32000], ), ], + "pattern_4": [ + PlaceholderInfo( + modality="pattern_4", + item_idx=0, + start_idx=3, + replacement=[32000], + ), + ], } ), @@ -471,6 +476,7 @@ def test_find_replace_tokens( replacement=[1550, 918, 1550], ), ], + # No match for pattern_4 as it has lower priority than pattern_1 } ), ( @@ -490,6 +496,14 @@ def test_find_replace_tokens( replacement=[32000, 32000], ), ], + "pattern_4": [ + PlaceholderInfo( + modality="pattern_4", + item_idx=0, + start_idx=5, + replacement=[32000], + ), + ], "pattern_3": [ PlaceholderInfo( modality="pattern_3", @@ -531,37 +545,6 @@ def test_find_mm_placeholders( assert result == expected -def _rand_img(rng: np.random.RandomState, min_wh: int, max_wh: int): - w, h = rng.randint(min_wh, max_wh, size=(2, )) - arr = rng.randint(0, 255, size=(w, h, 3), dtype=np.uint8) - return 
Image.fromarray(arr) - - -def _rand_video( - rng: np.random.RandomState, - min_frames: int, - max_frames: int, - min_wh: int, - max_wh: int, -): - # Temporary workaround for https://github.com/huggingface/transformers/issues/35412 - num_frames = rng.randint(min_frames, max_frames) - num_frames = (num_frames // 2) * 2 - - w, h = rng.randint(min_wh, max_wh, size=(2, )) - return rng.randint(0, 255, size=(num_frames, w, h, 3), dtype=np.uint8) - - -def _rand_audio( - rng: np.random.RandomState, - min_len: int, - max_len: int, - sr: int, -): - audio_len = rng.randint(min_len, max_len) - return rng.rand(audio_len), sr - - @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize( ("limit", "num_supported", "is_valid"), @@ -628,7 +611,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): ) rng = np.random.RandomState(0) - image = _rand_img(rng, min_wh=128, max_wh=256) + image = random_image(rng, min_wh=128, max_wh=256) if num_images == 0: mm_data = {} elif num_images == 1: @@ -647,172 +630,3 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): mm_data=mm_data, hf_processor_mm_kwargs={}, ) - - -def _test_processing_cache_correctness( - model_id: str, - modalities: dict[str, bool], - hit_rate: float, - num_batches: int, - simplify_rate: float, -): - if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3": - hf_overrides = {"architectures": ["MantisForConditionalGeneration"]} - else: - hf_overrides = {} - - limit_mm_per_prompt = { - modality: 3 if supports_multi else 1 - for modality, supports_multi in modalities.items() - } - - model_config = ModelConfig( - model_id, - task="auto", - tokenizer=model_id, - tokenizer_mode="auto", - trust_remote_code=True, - seed=0, - dtype="float16", - revision=None, - hf_overrides=hf_overrides, - limit_mm_per_prompt=limit_mm_per_prompt, - ) - - model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) - factories = MULTIMODAL_REGISTRY._processor_factories[model_cls] - ctx = InputProcessingContext( - model_config, - tokenizer=cached_get_tokenizer(model_config.tokenizer), - ) - # Ensure that it can fit all of the data - cache = ProcessingCache(capacity=1 << 30) - - baseline_processor = factories.build_processor(ctx, cache=None) - cached_processor = factories.build_processor(ctx, cache=cache) - dummy_inputs = baseline_processor.dummy_inputs - - rng = np.random.RandomState(0) - - input_to_hit = { - "image": Image.new("RGB", size=(128, 128)), - "video": np.zeros((4, 128, 128, 3), dtype=np.uint8), - "audio": (np.zeros((512, )), 16000), - } - input_factory = { - "image": - partial(_rand_img, rng, min_wh=128, max_wh=256), - "video": - partial(_rand_video, - rng, - min_frames=2, - max_frames=8, - min_wh=128, - max_wh=256), - "audio": - partial(_rand_audio, rng, min_len=512, max_len=1024, sr=16000), - } - - for batch_idx in range(num_batches): - mm_data = { - k: - [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]()) - for _ in range(rng.randint(limit_mm_per_prompt[k]))] - for k in modalities - } - - mm_counts = {k: len(vs) for k, vs in mm_data.items()} - prompt = dummy_inputs.get_dummy_processor_inputs( - model_config.max_model_len, - mm_counts, - ).prompt_text - - # Drop unnecessary keys and test single -> multi conversion - if rng.rand() < simplify_rate: - for k in list(mm_data.keys()): - if not mm_data[k]: - del mm_data[k] - elif len(mm_data[k]) == 1: - mm_data[k] = mm_data[k][0] - - baseline_result = baseline_processor.apply( - prompt, - mm_data=mm_data, - 
hf_processor_mm_kwargs={}, - ) - cached_result = cached_processor.apply( - prompt, - mm_data=mm_data, - hf_processor_mm_kwargs={}, - ) - - assert baseline_result == cached_result, ( - f"Failed ({batch_idx=}, {mm_data=})") - - -# yapf: disable -# True if the model supports multiple data items of the modality per request -@pytest.mark.parametrize(("model_id", "modalities"), [ - ("rhymes-ai/Aria", {"image": True}), - ("Salesforce/blip2-opt-2.7b", {"image": False}), - ("facebook/chameleon-7b", {"image": False}), - ("adept/fuyu-8b", {"image": False}), - ("llava-hf/llava-1.5-7b-hf", {"image": True}), - ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}), - ("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}), - ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}), # noqa: E501 - ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}), - ("mistral-community/pixtral-12b", {"image": True}), - ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}), - ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}), - ("fixie-ai/ultravox-v0_3", {"audio": True}), -]) -@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) -@pytest.mark.parametrize("num_batches", [32]) -@pytest.mark.parametrize("simplify_rate", [1.0]) -# yapf: enable -def test_processing_cache_correctness( - model_id: str, - modalities: dict[str, bool], - hit_rate: float, - num_batches: int, - simplify_rate: float, -): - _test_processing_cache_correctness( - model_id, - modalities, - hit_rate=hit_rate, - num_batches=num_batches, - simplify_rate=simplify_rate, - ) - - -# yapf: disable -@pytest.mark.parametrize(("model_id", "modalities"), [ - ("microsoft/Phi-3-vision-128k-instruct", {"image": True}), -]) -@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) -@pytest.mark.parametrize("num_batches", [32]) -@pytest.mark.parametrize("simplify_rate", [1.0]) -# yapf: enable -def test_processing_cache_correctness_phi3v( - model_id: str, - modalities: dict[str, bool], - hit_rate: float, - num_batches: int, - simplify_rate: float, -): - # HACK - this is an attempted workaround for the following bug - # https://github.com/huggingface/transformers/issues/34307 - from transformers import AutoImageProcessor # noqa: F401 - from transformers import AutoProcessor # noqa: F401 - - AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True) - - _test_processing_cache_correctness( - model_id, - modalities, - hit_rate=hit_rate, - num_batches=num_batches, - simplify_rate=simplify_rate, - ) diff --git a/tests/multimodal/utils.py b/tests/multimodal/utils.py new file mode 100644 index 0000000000000..29aeca605109b --- /dev/null +++ b/tests/multimodal/utils.py @@ -0,0 +1,33 @@ +import numpy as np +from PIL import Image + + +def random_image(rng: np.random.RandomState, min_wh: int, max_wh: int): + w, h = rng.randint(min_wh, max_wh, size=(2, )) + arr = rng.randint(0, 255, size=(w, h, 3), dtype=np.uint8) + return Image.fromarray(arr) + + +def random_video( + rng: np.random.RandomState, + min_frames: int, + max_frames: int, + min_wh: int, + max_wh: int, +): + # Temporary workaround for https://github.com/huggingface/transformers/issues/35412 + num_frames = rng.randint(min_frames, max_frames) + num_frames = (num_frames // 2) * 2 + + w, h = rng.randint(min_wh, max_wh, size=(2, )) + return rng.randint(0, 255, size=(num_frames, w, h, 3), dtype=np.uint8) + + +def random_audio( + rng: np.random.RandomState, + min_len: int, + max_len: int, + sr: int, +): + audio_len = rng.randint(min_len, max_len) + return rng.rand(audio_len), sr diff --git 
a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py new file mode 100644 index 0000000000000..5634be3c8d882 --- /dev/null +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py @@ -0,0 +1,8 @@ +from vllm.attention.backends.flash_attn import FlashAttentionBackend + + +class DummyAttentionBackend(FlashAttentionBackend): + + @staticmethod + def get_name() -> str: + return "Dummy_Backend" diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py index fde93142f1103..84721d5971ccf 100644 --- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py @@ -3,3 +3,7 @@ from vllm.platforms.cuda import CudaPlatform class DummyPlatform(CudaPlatform): device_name = "DummyDevice" + + def get_attn_backend_cls(self, backend_name, head_size, dtype, + kv_cache_dtype, block_size, use_v1): + return "vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend" # noqa E501 diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index 57518bd3e8299..661aa5f649ab9 100644 --- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -1,3 +1,10 @@ +import torch + +from tests.kernels.utils import override_backend_env_variable +from vllm.attention.selector import get_attn_backend +from vllm.utils import STR_INVALID_VAL + + def test_platform_plugins(): # simulate workload by running an example import runpy @@ -5,7 +12,7 @@ def test_platform_plugins(): import os example_file = os.path.join( os.path.dirname(os.path.dirname(os.path.dirname(current_file))), - "examples", "offline_inference/offline_inference.py") + "examples", "offline_inference/basic.py") runpy.run_path(example_file) # check if the plugin is loaded correctly @@ -14,3 +21,10 @@ def test_platform_plugins(): f"Expected DummyDevice, got {current_platform.device_name}, " "possibly because current_platform is imported before the plugin" f" is loaded. 
The first import:\n{_init_trace}") + + +def test_oot_attention_backend(monkeypatch): + # ignore the backend env variable if it is set + override_backend_env_variable(monkeypatch, STR_INVALID_VAL) + backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) + assert backend.get_name() == "Dummy_Backend" diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 92436889ecffe..0cd86cef0a475 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -30,50 +30,55 @@ from vllm.platforms import current_platform def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): model_path, strategy, quant_type, shape_0, is_symmetric = model_args with vllm_runner(model_path, enforce_eager=True) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj - o_proj = layer.self_attn.o_proj - gate_up_proj = layer.mlp.gate_up_proj - down_proj = layer.mlp.down_proj + def check_model(model): + layer = model.model.layers[0] - # assert zp for symmetric and asymmetric cases - def zp_valid(zp: Optional[torch.Tensor]): - if is_symmetric: - return zp is None + qkv_proj = layer.self_attn.qkv_proj + o_proj = layer.self_attn.o_proj + gate_up_proj = layer.mlp.gate_up_proj + down_proj = layer.mlp.down_proj - return zp is not None and zp.dtype is torch.int32 + # assert zp for symmetric and asymmetric cases + def zp_valid(zp: Optional[torch.Tensor]): + if is_symmetric: + return zp is None - assert zp_valid(qkv_proj.input_zero_point) - assert zp_valid(o_proj.input_zero_point) - assert zp_valid(gate_up_proj.input_zero_point) - assert zp_valid(down_proj.input_zero_point) + return zp is not None and zp.dtype is torch.int32 - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(gate_up_proj.quant_method, - CompressedTensorsLinearMethod) - assert isinstance(down_proj.quant_method, - CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) + assert zp_valid(qkv_proj.input_zero_point) + assert zp_valid(o_proj.input_zero_point) + assert zp_valid(gate_up_proj.input_zero_point) + assert zp_valid(down_proj.input_zero_point) - assert qkv_proj.scheme.strategy == strategy - assert qkv_proj.scheme.is_static_input_scheme - expected_type = torch.int8 + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(o_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(gate_up_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(down_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) - assert qkv_proj.weight.dtype is expected_type - assert o_proj.weight.dtype is expected_type - assert gate_up_proj.weight.dtype is expected_type + assert qkv_proj.scheme.strategy == strategy + assert qkv_proj.scheme.is_static_input_scheme + expected_type = torch.int8 - if qkv_proj.scheme.strategy == "tensor": - # Make sure it is a channelwise buffer - # After running process_weights_after_loading - assert len(qkv_proj.weight_scale.shape) == 2 - assert qkv_proj.weight_scale.shape[0] == shape_0 - assert qkv_proj.weight_scale.shape[1] == 1 - assert qkv_proj.weight_scale.dtype is torch.float32 - assert qkv_proj.input_scale.dtype is torch.float32 + 
assert qkv_proj.weight.dtype is expected_type + assert o_proj.weight.dtype is expected_type + assert gate_up_proj.weight.dtype is expected_type + + if qkv_proj.scheme.strategy == "tensor": + # Make sure it is a channelwise buffer + # After running process_weights_after_loading + assert len(qkv_proj.weight_scale.shape) == 2 + assert qkv_proj.weight_scale.shape[0] == shape_0 + assert qkv_proj.weight_scale.shape[1] == 1 + assert qkv_proj.weight_scale.dtype is torch.float32 + assert qkv_proj.input_scale.dtype is torch.float32 + + llm.apply_model(check_model) output = llm.generate_greedy(["Hello my name is"], max_tokens=20) assert output @@ -129,16 +134,20 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner): def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args): model_path, strategy = model_args with vllm_runner(model_path, dtype=torch.float16) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) - assert not qkv_proj.scheme.is_static_input_scheme - assert qkv_proj.scheme.strategy == strategy - assert qkv_proj.weight.dtype is torch.int8 + qkv_proj = layer.self_attn.qkv_proj + + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) + assert not qkv_proj.scheme.is_static_input_scheme + assert qkv_proj.scheme.strategy == strategy + assert qkv_proj.weight.dtype is torch.int8 + + llm.apply_model(check_model) output = llm.generate_greedy(["Hello my name is"], max_tokens=20) assert output @@ -152,19 +161,24 @@ def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args): def test_compressed_tensors_wNa16(vllm_runner, wNa16_args): model, strategy, group, pack_factor = wNa16_args with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16) + def check_model(model): + layer = model.model.layers[0] - assert qkv_proj.scheme.strategy == strategy - assert qkv_proj.scheme.group_size == (-1 if group is None else group) + qkv_proj = layer.self_attn.qkv_proj + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16) - assert qkv_proj.weight_packed.dtype is torch.int32 - assert qkv_proj.weight_scale.dtype is torch.float16 - assert qkv_proj.scheme.pack_factor == pack_factor + assert qkv_proj.scheme.strategy == strategy + assert qkv_proj.scheme.group_size == (-1 + if group is None else group) + + assert qkv_proj.weight_packed.dtype is torch.int32 + assert qkv_proj.weight_scale.dtype is torch.float16 + assert qkv_proj.scheme.pack_factor == pack_factor + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output @@ -173,14 +187,18 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args): def test_compressed_tensors_w4a16_marlin24(vllm_runner): model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t" with vllm_runner(model_path) as llm: - model = 
llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24) - assert qkv_proj.weight_packed.dtype is torch.int32 + qkv_proj = layer.self_attn.qkv_proj + + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24) + assert qkv_proj.weight_packed.dtype is torch.int32 + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output @@ -189,23 +207,27 @@ def test_compressed_tensors_w4a16_marlin24(vllm_runner): def test_compressed_tensors_fp8(vllm_runner): model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test" with vllm_runner(model_path) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance( - qkv_proj.scheme, - (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8)) + qkv_proj = layer.self_attn.qkv_proj - assert qkv_proj.input_scale.dtype is torch.float32 + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance( + qkv_proj.scheme, + (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8)) - if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8): - assert len(qkv_proj.input_scale.shape) == 0 - assert qkv_proj.weight.dtype is torch.float8_e4m3fn - assert qkv_proj.weight_scale.dtype is torch.float32 - assert len(qkv_proj.weight_scale.shape) == 0 + assert qkv_proj.input_scale.dtype is torch.float32 + + if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8): + assert len(qkv_proj.input_scale.shape) == 0 + assert qkv_proj.weight.dtype is torch.float8_e4m3fn + assert qkv_proj.weight_scale.dtype is torch.float32 + assert len(qkv_proj.weight_scale.shape) == 0 + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output @@ -248,12 +270,15 @@ def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy): def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4): model, weight_strategy, input_strategy = args_2of4 with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj - assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn - _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn + _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) @@ -273,12 +298,15 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4): def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4): model, weight_strategy, input_strategy = args_2of4 with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = 
model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj - assert qkv_proj.scheme.weights_dtype == torch.int8 - _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert qkv_proj.scheme.weights_dtype == torch.int8 + _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) @@ -293,20 +321,24 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4): def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4): model = args_2of4 with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensors24) + def check_model(model): + layer = model.model.layers[0] - assert qkv_proj.scheme.weight_quant is None - assert qkv_proj.scheme.input_quant is None - assert not qkv_proj.scheme.quantized - assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map - sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501 - assert sparsity_map.get("Linear").format == "dense" - assert sparsity_map.get("Linear").sparsity_structure == "2:4" + qkv_proj = layer.self_attn.qkv_proj + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensors24) + + assert qkv_proj.scheme.weight_quant is None + assert qkv_proj.scheme.input_quant is None + assert not qkv_proj.scheme.quantized + assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map + sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501 + assert sparsity_map.get("Linear").format == "dense" + assert sparsity_map.get("Linear").sparsity_structure == "2:4" + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index a0c1d7e24c503..4bff734746297 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -49,13 +49,17 @@ KV_CACHE_MODELS = [ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str): with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - attn = model.model.layers[0].self_attn.attn - assert isinstance(attn.quant_method, Fp8KVCacheMethod) - # NOTE: it is valid for scales to be 1.0 (default value), but we know - # these checkpoints have scales < 1.0 - assert 0.0 < attn._k_scale < 1.0 - assert 0.0 < attn._v_scale < 1.0 + def check_model(model): + attn = model.model.layers[0].self_attn.attn + + assert isinstance(attn.quant_method, Fp8KVCacheMethod) + + # NOTE: it is valid for scales to be 1.0 (default value), but + # we know these checkpoints have scales < 1.0 + assert 0.0 < attn._k_scale < 1.0 + assert 0.0 < attn._v_scale < 1.0 + + llm.apply_model(check_model) # note: this does not test accuracy, just that we can run through # see lm-eval tests for accuracy @@ -77,22 +81,24 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool, quantization="fp8", kv_cache_dtype=kv_cache_dtype) as llm: - model = 
llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - fc1 = model.model.decoder.layers[0].fc1 - assert isinstance(fc1.quant_method, Fp8LinearMethod) - if kv_cache_dtype == "fp8": - attn = model.model.decoder.layers[0].self_attn.attn - assert isinstance(attn.quant_method, Fp8KVCacheMethod) - assert attn._k_scale == 1.0 - assert attn._v_scale == 1.0 + def check_model(model): + fc1 = model.model.decoder.layers[0].fc1 + assert isinstance(fc1.quant_method, Fp8LinearMethod) + if kv_cache_dtype == "fp8": + attn = model.model.decoder.layers[0].self_attn.attn + assert isinstance(attn.quant_method, Fp8KVCacheMethod) + assert attn._k_scale == 1.0 + assert attn._v_scale == 1.0 - if current_platform.has_device_capability(89) and not force_marlin: - # For GPUs with hardware support, we keep weights in fp8 - assert fc1.weight.dtype == torch.float8_e4m3fn - else: - # For GPUs without hardware support, we pack the fp8 weights - # for weight-only quantization using Marlin kernels - assert fc1.weight.dtype == torch.int32 + if current_platform.has_device_capability(89) and not force_marlin: + # For GPUs with hardware support, we keep weights in fp8 + assert fc1.weight.dtype == torch.float8_e4m3fn + else: + # For GPUs without hardware support, we pack the fp8 weights + # for weight-only quantization using Marlin kernels + assert fc1.weight.dtype == torch.int32 + + llm.apply_model(check_model) @pytest.mark.skipif(not is_quant_method_supported("fp8"), diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index ad526a4065101..fa2d9645ea47f 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -28,20 +28,23 @@ def test_lm_head( model_lm_head_quant: Tuple[str, bool], ) -> None: model, lm_head_quantized = model_lm_head_quant - vllm_model = vllm_runner(model, dtype=torch.float16, max_model_len=2048) - lm_head_layer = (vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model.lm_head) + with vllm_runner(model, dtype=torch.float16, + max_model_len=2048) as vllm_model: - if lm_head_quantized: - assert isinstance( - lm_head_layer.linear_method, - (GPTQLinearMethod, GPTQMarlinLinearMethod, MarlinLinearMethod)) - else: - assert isinstance(lm_head_layer.linear_method, - UnquantizedEmbeddingMethod) + def check_model(model): + lm_head_layer = model.lm_head - print( - vllm_model.generate_greedy(prompts=["Hello my name is"], - max_tokens=10)[0][1]) - del vllm_model + if lm_head_quantized: + assert isinstance(lm_head_layer.linear_method, + (GPTQLinearMethod, GPTQMarlinLinearMethod, + MarlinLinearMethod)) + else: + assert isinstance(lm_head_layer.linear_method, + UnquantizedEmbeddingMethod) + + vllm_model.apply_model(check_model) + + print( + vllm_model.generate_greedy(prompts=["Hello my name is"], + max_tokens=10)[0][1]) diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py new file mode 100644 index 0000000000000..11382ad708faa --- /dev/null +++ b/tests/quantization/test_quark.py @@ -0,0 +1,33 @@ +"""Test model set-up and weight loading for quark-quantized models. + +Run `pytest tests/quantization/test_quark.py`. 
+""" + +import torch + +from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501 + QuarkLinearMethod, QuarkW8A8Fp8) + + +def test_quark_fp8(vllm_runner): + model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test" + with vllm_runner(model_path) as llm: + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + + assert isinstance(qkv_proj.quant_method, QuarkLinearMethod) + assert isinstance(qkv_proj.scheme, QuarkW8A8Fp8) + + if isinstance(qkv_proj.scheme, QuarkW8A8Fp8): + assert len(qkv_proj.input_scale.shape) == 0 + assert qkv_proj.weight.dtype is torch.float8_e4m3fn + #assert qkv_proj.weight.dtype is torch.float8_e4m3fnuz + assert len(qkv_proj.weight_scale.shape) == 0 + + llm.apply_model(check_model) + + output = llm.generate_greedy("Hello my name is", max_tokens=20) + assert output diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py new file mode 100644 index 0000000000000..8e7f44a399ddf --- /dev/null +++ b/tests/quantization/test_register_quantization_config.py @@ -0,0 +1,117 @@ +"""Tests register custom quantization config. + +See https://github.com/vllm-project/vllm/issues/11926 for more details. + +Run `pytest tests/quantization/test_register_quantization_config.py`. +""" +from typing import Any, Dict, List, Optional + +import pytest +import torch +import torch.nn.functional as F + +from vllm.model_executor.layers.linear import LinearBase # noqa: E501 +from vllm.model_executor.layers.linear import UnquantizedLinearMethod +from vllm.model_executor.layers.quantization import ( + get_quantization_config, register_quantization_config) +from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 + QuantizationConfig) + + +class FakeQuantLinearMethod(UnquantizedLinearMethod): + """Fake quantization linear method for per-token dynamic quantization.""" + + def __init__(self, num_bits: int = 8) -> None: + """Initialize the quantization method.""" + super().__init__() + self.num_bits = num_bits + + def apply(self, + layer: "torch.nn.Module", + x: "torch.Tensor", + bias: Optional["torch.Tensor"] = None) -> "torch.Tensor": + """Perform fake quantization before the linear layer.""" + + # Calculate the scales dynamically + max_val = torch.amax(x, dim=(0, -1), keepdims=True) + min_val = torch.amin(x, dim=(0, -1), keepdims=True) + scales = (max_val - min_val) / (2**self.num_bits - 1) + + # Fake quantize the input + quant_x = torch.clamp(torch.round(x / scales), -2**(self.num_bits - 1), + 2**(self.num_bits - 1) - 1) + dequant_x = quant_x * scales + + return F.linear(dequant_x, layer.weight, bias) + + +@register_quantization_config("custom_quant") +class CustomQuantConfig(QuantizationConfig): + """Custom quantization config for per-token dynamic fake quantization.""" + + def __init__(self, num_bits: int = 8) -> None: + """Initialize the quantization config.""" + self.num_bits = num_bits + + def get_name(self) -> str: + """Name of the quantization method.""" + return "custom_quant" + + def get_supported_act_dtypes(self) -> List["torch.dtype"]: + """List of supported activation dtypes.""" + return [torch.float16, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + """Minimum GPU capability to support the quantization method.""" + return -1 + + @staticmethod + def get_config_filenames() -> List[str]: + """List of filenames to search for in the model directory.""" + return [] + + @classmethod + def from_config(cls, config: 
Dict[str, Any]) -> "CustomQuantConfig": + """Create a config class from the model's quantization config.""" + return CustomQuantConfig(num_bits=config.get("num_bits", 8)) + + def get_quant_method(self, layer: "torch.nn.Module", + prefix: str) -> Optional["FakeQuantLinearMethod"]: + """Get the quantize method to use for the quantized layer.""" + if isinstance(layer, LinearBase): + return FakeQuantLinearMethod(num_bits=self.num_bits) + return None + + +def test_register_quantization_config(): + """Test register custom quantization config.""" + + # The quantization method `custom_quant` should be registered. + assert get_quantization_config("custom_quant") == CustomQuantConfig + + # The quantization method `custom_quant` is already exists, + # should raise an error. + with pytest.raises(ValueError): + register_quantization_config("custom_quant")(CustomQuantConfig) + + +@pytest.mark.parametrize(argnames="model", + argvalues=[ + "meta-llama/Meta-Llama-3-8B-Instruct", + ]) +def test_custom_quant(vllm_runner, model): + """Test infer with the custom quantization method.""" + with vllm_runner(model_name=model, + quantization="custom_quant", + enforce_eager=True) as llm: + + model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + layer = model.model.layers[0] + qkv_proj = layer.self_attn.qkv_proj + + # Check the quantization method is FakeQuantLinearMethod + assert isinstance(qkv_proj.quant_method, FakeQuantLinearMethod) + + output = llm.generate_greedy("Hello my name is", max_tokens=20) + assert output diff --git a/tests/spec_decode/e2e/test_integration_dist_tp4.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py index 555aef99218c3..2cb10de1c6f55 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp4.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py @@ -108,7 +108,8 @@ def test_skip_speculation(common_llm_kwargs, per_test_common_llm_kwargs, TODO: fix it to pass without raising Error. 
(#5814) """ - with pytest.raises(openai.APIConnectionError): + with pytest.raises( + (openai.APIConnectionError, openai.InternalServerError)): run_equality_correctness_test_tp(MAIN_MODEL, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index bf409d2d97aa1..6e7eec1c6ab34 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -3,6 +3,7 @@ import json import os import pathlib import subprocess +from functools import partial from unittest.mock import MagicMock, patch import openai @@ -24,7 +25,6 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig, # yapf: enable from vllm.utils import PlaceholderModule, import_from_path -from ..conftest import VllmRunner from ..utils import VLLM_PATH, RemoteOpenAIServer from .conftest import retry_until_skip @@ -58,16 +58,6 @@ def is_curl_installed(): return False -def get_torch_model(vllm_runner: VllmRunner): - return vllm_runner \ - .model \ - .llm_engine \ - .model_executor \ - .driver_worker \ - .model_runner \ - .model - - def write_keyfile(keyfile_path: str): encryption_params = EncryptionParams.random() pathlib.Path(keyfile_path).parent.mkdir(parents=True, exist_ok=True) @@ -121,8 +111,10 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs( config_for_serializing = TensorizerConfig(tensorizer_uri=model_path, encryption_keyfile=key_path) - serialize_vllm_model(get_torch_model(vllm_model), - config_for_serializing) + + vllm_model.apply_model( + partial(serialize_vllm_model, + tensorizer_config=config_for_serializing)) config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path, encryption_keyfile=key_path) @@ -175,8 +167,10 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): with vllm_runner(model_ref, ) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") - serialize_vllm_model(get_torch_model(vllm_model), - TensorizerConfig(tensorizer_uri=model_path)) + vllm_model.apply_model( + partial( + serialize_vllm_model, + tensorizer_config=TensorizerConfig(tensorizer_uri=model_path))) with vllm_runner( model_ref, @@ -215,8 +209,10 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): with vllm_runner(model_ref, ) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") - serialize_vllm_model(get_torch_model(vllm_model), - TensorizerConfig(tensorizer_uri=model_path)) + vllm_model.apply_model( + partial( + serialize_vllm_model, + tensorizer_config=TensorizerConfig(tensorizer_uri=model_path))) model_loader_extra_config = { "tensorizer_uri": str(model_path), @@ -337,7 +333,9 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): with vllm_runner(model_ref) as vllm_model: outputs = vllm_model.generate(prompts, sampling_params) - serialize_vllm_model(get_torch_model(vllm_model), config) + + vllm_model.apply_model( + partial(serialize_vllm_model, tensorizer_config=config)) assert is_vllm_tensorized(config) diff --git a/tests/test_utils.py b/tests/test_utils.py index 14d2fbd63b90d..d5dc4464e634d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,14 +2,17 @@ import asyncio import os import socket from typing import AsyncIterator, Tuple +from unittest.mock import patch import pytest import torch from vllm_test_utils import monitor -from vllm.utils import (FlexibleArgumentParser, PlaceholderModule, - StoreBoolean, deprecate_kwargs, get_open_port, - memory_profiling, merge_async_iterators, 
supports_kw) +from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.utils import (FlexibleArgumentParser, MemorySnapshot, + PlaceholderModule, StoreBoolean, bind_kv_cache, + deprecate_kwargs, get_open_port, memory_profiling, + merge_async_iterators, supports_kw) from .utils import error_on_warning, fork_new_process_for_each_test @@ -281,14 +284,13 @@ def test_memory_profiling(): # 512 MiB allocation outside of this instance handle1 = lib.cudaMalloc(512 * 1024 * 1024) - baseline_memory_in_bytes = \ - torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0] + baseline_snapshot = MemorySnapshot() # load weights weights = torch.randn(128, 1024, 1024, device='cuda', dtype=torch.float32) - weights_memory_in_bytes = 128 * 1024 * 1024 * 4 # 512 MiB + weights_memory = 128 * 1024 * 1024 * 4 # 512 MiB def measure_current_non_torch(): free, total = torch.cuda.mem_get_info() @@ -297,8 +299,8 @@ def test_memory_profiling(): current_non_torch = current_used - current_torch return current_non_torch - with memory_profiling(baseline_memory_in_bytes=baseline_memory_in_bytes, - weights_memory_in_bytes=weights_memory_in_bytes) as result, \ + with memory_profiling(baseline_snapshot=baseline_snapshot, + weights_memory=weights_memory) as result, \ monitor(measure_current_non_torch) as monitored_values: # make a memory spike, 1 GiB spike = torch.randn(256, 1024, 1024, device='cuda', dtype=torch.float32) @@ -313,18 +315,99 @@ def test_memory_profiling(): assert measured_diff == 256 * 1024 * 1024 # Check that the memory usage is within 5% of the expected values - # 5% tolerance is caused by PyTorch caching allocator, - # we cannot control PyTorch's behavior of its internal buffers, + # 5% tolerance is caused by cuda runtime. + # we cannot control cuda runtime in the granularity of bytes, # which causes a small error (<10 MiB in practice) - non_torch_ratio = result.non_torch_increase_in_bytes / (256 * 1024 * 1024) # noqa - torch_peak_ratio = result.torch_peak_increase_in_bytes / (1024 * 1024 * 1024) # noqa + non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024) # noqa assert abs(non_torch_ratio - 1) <= 0.05 - assert abs(torch_peak_ratio - 1) <= 0.05 + assert result.torch_peak_increase == 1024 * 1024 * 1024 del weights lib.cudaFree(handle1) lib.cudaFree(handle2) +def test_bind_kv_cache(): + from vllm.attention import Attention + + ctx = { + 'layers.0.self_attn': Attention(32, 128, 0.1), + 'layers.1.self_attn': Attention(32, 128, 0.1), + 'layers.2.self_attn': Attention(32, 128, 0.1), + 'layers.3.self_attn': Attention(32, 128, 0.1), + } + kv_cache = [ + torch.zeros((1, )), + torch.zeros((1, )), + torch.zeros((1, )), + torch.zeros((1, )), + ] + bind_kv_cache(ctx, [kv_cache]) + assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0] + assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[1] + assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[2] + assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[3] + +def test_bind_kv_cache_non_attention(): + from vllm.attention import Attention + + # example from Jamba PP=2 + ctx = { + 'model.layers.20.attn': Attention(32, 128, 0.1), + 'model.layers.28.attn': Attention(32, 128, 0.1), + } + kv_cache = [ + torch.zeros((1, )), + torch.zeros((1, )), + ] + bind_kv_cache(ctx, [kv_cache]) + assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[0] + assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1] + + +def test_bind_kv_cache_encoder_decoder(): + from vllm.attention import Attention, AttentionType + + # example 
from bart + ctx = { + 'encoder.layers.0.self_attn.attn': + Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER), + 'decoder.layers.0.encoder_attn.attn': + Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER), + 'decoder.layers.0.self_attn.attn': + Attention(32, 128, 0.1, attn_type=AttentionType.DECODER), + } + + kv_cache = [ + torch.zeros((1, )), + ] + encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache + + bind_kv_cache(ctx, [kv_cache]) + assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache + assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0] + assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0] + + +def test_bind_kv_cache_pp(): + with patch("vllm.utils.cuda_device_count_stateless", lambda: 2): + # this test runs with 1 GPU, but we simulate 2 GPUs + cfg = VllmConfig( + parallel_config=ParallelConfig(pipeline_parallel_size=2)) + with set_current_vllm_config(cfg): + from vllm.attention import Attention + + ctx = { + 'layers.0.self_attn': Attention(32, 128, 0.1), + } + kv_cache = [ + [torch.zeros((1, ))], + [torch.zeros((1, ))] + ] + bind_kv_cache(ctx, kv_cache) + assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0][0] + assert ctx['layers.0.self_attn'].kv_cache[1] is kv_cache[1][0] + + def test_placeholder_module_error_handling(): placeholder = PlaceholderModule("placeholder_1234") diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index b97f55b8c6535..fafd9d0ce4455 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -49,9 +49,10 @@ def test_prefill(): unique_token_ids = [3] * 7 all_token_ids = common_token_ids + unique_token_ids req0 = make_request("0", all_token_ids) - computed_blocks = manager.get_computed_blocks(req0) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) assert len(req0.kv_block_hashes) == 3 assert not computed_blocks + assert num_computed_tokens == 0 blocks = manager.allocate_slots(req0, 55, computed_blocks) assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4] @@ -73,9 +74,10 @@ def test_prefill(): # Incomplete 1 block (5 tokens) unique_token_ids = [3] * 5 req1 = make_request("1", common_token_ids + unique_token_ids) - computed_blocks = manager.get_computed_blocks(req1) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert len(req1.kv_block_hashes) == 3 assert [b.block_id for b in computed_blocks] == [0, 1, 2] + assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 blocks = manager.allocate_slots(req1, num_new_tokens, computed_blocks) assert [b.block_id for b in blocks] == [5, 6] @@ -91,7 +93,7 @@ def test_prefill(): # All blocks should be available. 
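+    # (freed blocks keep their cached content until they are reused, so the
+    # free-queue order below also determines the eviction order)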
assert manager.free_block_queue.num_free_blocks == 10 # The order should be - # [unallocated (7, 8)] + # [unallocated (7, 8, 9)] # [unique_req0 (4, 3)] # [unique_req1 (6, 5)] # [common (2, 1, 0)] @@ -103,9 +105,10 @@ def test_prefill(): # Incomplete 1 block (6 tokens) unique_token_ids = [3] * 6 req2 = make_request("2", common_token_ids + unique_token_ids) - computed_blocks = manager.get_computed_blocks(req2) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert len(req2.kv_block_hashes) == 3 assert [b.block_id for b in computed_blocks] == [0, 1, 2] + assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 blocks = manager.allocate_slots(req2, num_new_tokens, computed_blocks) assert [b.block_id for b in blocks] == [7, 8] @@ -123,8 +126,9 @@ def test_prefill(): # Cache miss and eviction. req3 = make_request("3", [99] * (16 * 9)) - computed_blocks = manager.get_computed_blocks(req3) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3) assert not computed_blocks + assert num_computed_tokens == 0 blocks = manager.allocate_slots(req3, 16 * 9, computed_blocks) # This block ID order also checks the eviction order. assert [b.block_id for b in blocks] == [9, 4, 3, 6, 5, 8, 7, 2, 1, 0] @@ -150,8 +154,9 @@ def test_decode(): # Incomplete 1 block (7 tokens) unique_token_ids = [3] * 7 req0 = make_request("0", common_token_ids + unique_token_ids) - computed_blocks = manager.get_computed_blocks(req0) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) assert not computed_blocks + assert num_computed_tokens == 0 blocks = manager.allocate_slots(req0, 55, computed_blocks) assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4] @@ -197,16 +202,18 @@ def test_evict(): last_token_id = 5 * 16 + 7 req0 = make_request("0", list(range(last_token_id))) - computed_blocks = manager.get_computed_blocks(req0) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) assert not computed_blocks + assert num_computed_tokens == 0 blocks = manager.allocate_slots(req0, 5 * 16 + 7, computed_blocks) assert len(blocks) == 7 # 5 full + 1 partial + 1 preallocated # 3 blocks. req1 = make_request("1", list(range(last_token_id, last_token_id + 3 * 16))) - computed_blocks = manager.get_computed_blocks(req1) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert not computed_blocks + assert num_computed_tokens == 0 blocks = manager.allocate_slots(req1, 3 * 16, computed_blocks) assert len(blocks) == 3 # 3 full blocks last_token_id += 3 * 16 @@ -222,8 +229,9 @@ def test_evict(): # Touch the first 2 blocks. req2 = make_request("2", list(range(2 * 16 + 3))) - computed_blocks = manager.get_computed_blocks(req2) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert [b.block_id for b in computed_blocks] == [0, 1] + assert num_computed_tokens == 2 * 16 blocks = manager.allocate_slots(req2, 3, computed_blocks) assert [b.block_id for b in blocks] == [6, 5] assert manager.free_block_queue.num_free_blocks == 6 @@ -247,8 +255,9 @@ def test_hash_block_correct_reuse(): # Allocate 1 block and cache it. 
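+    # (the block is completely full, so its hash is computed and the block
+    # can be reused through the prefix cache)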
num_tokens = block_size * 1 req = make_request("0", list(range(num_tokens))) - computed_blocks = manager.get_computed_blocks(req) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req) assert not computed_blocks + assert num_computed_tokens == 0 blocks = manager.allocate_slots(req, num_tokens, computed_blocks) assert len(blocks) == 1 @@ -258,8 +267,9 @@ def test_hash_block_correct_reuse(): # Allocate a new block that's not full, make sure hash info on the # block is cleared. req = make_request("1", list(range(num_tokens - 1))) - computed_blocks = manager.get_computed_blocks(req) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req) assert not computed_blocks + assert num_computed_tokens == 0 blocks = manager.allocate_slots(req, num_tokens - 1, computed_blocks) assert len(blocks) == 1 @@ -284,16 +294,18 @@ def test_computed_blocks_not_evicted(): # Allocate a block and cache it. num_tokens = block_size * 1 req0 = make_request("0", list(range(num_tokens))) - computed_blocks = manager.get_computed_blocks(req0) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) assert not computed_blocks + assert num_computed_tokens == 0 blocks = manager.allocate_slots(req0, num_tokens, computed_blocks) assert len(blocks) == 1 assert blocks[0].block_id == 0 # Allocate another block. req1 = make_request("1", list(range(num_tokens, num_tokens * 2))) - computed_blocks = manager.get_computed_blocks(req1) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert not computed_blocks + assert num_computed_tokens == 0 blocks = manager.allocate_slots(req1, num_tokens, computed_blocks) assert len(blocks) == 1 assert blocks[0].block_id == 1 @@ -305,9 +317,10 @@ def test_computed_blocks_not_evicted(): # Now if we have a cache hit on the first block, we should evict the second # cached block rather than the first one. req2 = make_request("2", list(range(num_tokens * 2))) - computed_blocks = manager.get_computed_blocks(req2) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert len(computed_blocks) == 1 assert computed_blocks[0].block_id == 0 + assert num_computed_tokens == block_size blocks = manager.allocate_slots(req2, num_tokens * 2 - num_tokens, computed_blocks) @@ -331,8 +344,9 @@ def test_basic_prefix_caching_disabled(): req1 = make_request("1", list(range(10))) # 2 blocks and some more - computed_blocks = manager.get_computed_blocks(req1) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert not computed_blocks + assert num_computed_tokens == 0 blocks = manager.allocate_slots(req1, 10, computed_blocks) assert len(blocks) == 3 @@ -341,15 +355,17 @@ def test_basic_prefix_caching_disabled(): # No caching. req2 = make_request("2", list(range(16))) # shared prefix - computed_blocks = manager.get_computed_blocks(req2) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert not computed_blocks + assert num_computed_tokens == 0 blocks = manager.allocate_slots(req2, 16, computed_blocks) assert len(blocks) == 4 # New requests should not have any blocks. 
req3 = make_request("3", list(range(4))) - computed_blocks = manager.get_computed_blocks(req3) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3) assert not computed_blocks + assert num_computed_tokens == 0 blocks = manager.allocate_slots(req3, 4, computed_blocks) assert not blocks @@ -371,8 +387,9 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int): num_preallocated_blocks = cdiv(num_preallocate_tokens, block_size) req = make_request("0", list(range(block_size * 30))) - computed_blocks = manager.get_computed_blocks(req) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req) assert not computed_blocks + assert num_computed_tokens == 0 # Just ask for 1 block. blocks = manager.allocate_slots(req, block_size, computed_blocks) req.num_computed_tokens = block_size @@ -469,10 +486,11 @@ def test_mm_prefix_caching(): all_token_ids, mm_positions=mm_positions, mm_hashes=mm_hashes) - computed_blocks = manager.get_computed_blocks(req0) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) # Completed block should have hashes with extra keys. assert not computed_blocks + assert num_computed_tokens == 0 assert len(req0.kv_block_hashes) == 3 assert req0.kv_block_hashes[0].extra_keys == ("aaa", ) assert req0.kv_block_hashes[1].extra_keys == ("aaa", "bbb") @@ -503,8 +521,9 @@ def test_mm_prefix_caching(): all_token_ids, mm_positions=mm_positions, mm_hashes=mm_hashes) - computed_blocks = manager.get_computed_blocks(req1) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert len(computed_blocks) == 3 + assert num_computed_tokens == 3 * 16 def test_prefill_not_enough_free_blocks_with_computed_blocks(): @@ -527,15 +546,17 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks(): # | Common-0 | Common-1 | Common-2 | ... | common_token_ids = [i for i in range(3) for _ in range(16)] req0 = make_request("0", common_token_ids) - computed_blocks = manager.get_computed_blocks(req0) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) assert not computed_blocks + assert num_computed_tokens == 0 manager.allocate_slots(req0, 48, computed_blocks) block_part0 = manager.req_to_blocks[req0.request_id] # | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... | req1 = make_request("1", common_token_ids * 2) - computed_blocks = manager.get_computed_blocks(req1) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert computed_blocks == block_part0 + assert num_computed_tokens == 3 * 16 manager.allocate_slots(req1, 48, computed_blocks) block_part1 = manager.req_to_blocks[req1.request_id] # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) | @@ -547,8 +568,9 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks(): # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) | # | Req1-5(F)| Req2-0 | Req2-1 | ... | req2 = make_request("2", [7] * block_size * 2) - computed_blocks = manager.get_computed_blocks(req2) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert not computed_blocks + assert num_computed_tokens == 0 manager.allocate_slots(req2, block_size * 2, computed_blocks) # Req3 is Req2 + 3 new blocks, so the first 6 blocks are computed, @@ -556,8 +578,9 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks(): # In this case, the ref_cnt of the computed blocks should not be changed. 
assert manager.free_block_queue.num_free_blocks == 5 req3 = make_request("3", common_token_ids * 3) - computed_blocks = manager.get_computed_blocks(req3) + computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3) assert computed_blocks == block_part1 + assert num_computed_tokens == 6 * 16 # Req3 cannot be allocated. assert manager.allocate_slots(req3, 48, computed_blocks) is None # Block 0-2 are used by Req 1. diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index fffb5b8100ec7..2c805e18eebae 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -1,5 +1,5 @@ import asyncio -from typing import Tuple +from typing import List, Tuple import pytest @@ -13,6 +13,7 @@ if not current_platform.is_cuda(): allow_module_level=True) ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B", + enforce_eager=True, disable_log_requests=True) @@ -53,17 +54,63 @@ async def test_load(monkeypatch): generate(engine, request_id, NUM_EXPECTED_TOKENS))) # Confirm that we got all the EXPECTED tokens from the requests. - failed_request_id = None - tokens = None for task in tasks: num_generated_tokens, request_id = await task - if (num_generated_tokens != NUM_EXPECTED_TOKENS - and failed_request_id is None): - failed_request_id = request_id - tokens = num_generated_tokens + assert num_generated_tokens == NUM_EXPECTED_TOKENS, ( + f"{request_id} generated {num_generated_tokens} but " + f"expected {NUM_EXPECTED_TOKENS}") - assert failed_request_id is None, ( - f"{failed_request_id} generated {tokens} but " - f"expected {NUM_EXPECTED_TOKENS}") + assert not engine.output_processor.has_unfinished_requests() + engine.shutdown() + + +@pytest.mark.asyncio +async def test_abort(monkeypatch): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + engine = AsyncLLM.from_engine_args(ENGINE_ARGS) + + NUM_REQUESTS = 100 + NUM_EXPECTED_TOKENS = 100 + REQUEST_IDS_TO_ABORT = range(1, 100, 10) + + request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)] + + # Create concurrent requests. + tasks: List[asyncio.Task] = [] + for request_id in request_ids: + tasks.append( + asyncio.create_task( + generate(engine, request_id, NUM_EXPECTED_TOKENS))) + + # API server cancels requests when they disconnect. + for idx in REQUEST_IDS_TO_ABORT: + tasks[idx].cancel() + await asyncio.sleep(0.1) + + # Confirm the other requests are okay. + for idx, task in enumerate(tasks): + # Confirm that it was actually canceled. + if idx in REQUEST_IDS_TO_ABORT: + with pytest.raises(asyncio.CancelledError): + await task + else: + # Otherwise, make sure the request was not impacted. + num_generated_tokens, request_id = await task + assert num_generated_tokens == NUM_EXPECTED_TOKENS, ( + f"{request_id} generated {num_generated_tokens} but " + f"expected {NUM_EXPECTED_TOKENS}") + + assert not engine.output_processor.has_unfinished_requests() + + # Confirm we can do another generation. 
+ request_id = f"request-{REQUEST_IDS_TO_ABORT[0]}" + task = asyncio.create_task( + generate(engine, request_id, NUM_EXPECTED_TOKENS)) + num_generated_tokens, request_id = await task + assert num_generated_tokens == NUM_EXPECTED_TOKENS + assert not engine.output_processor.has_unfinished_requests() engine.shutdown() diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 8dd9b23fbdd5f..cccfd305ac604 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -4,6 +4,7 @@ import uuid import pytest from transformers import AutoTokenizer +from tests.utils import fork_new_process_for_each_test from vllm import SamplingParams from vllm.engine.arg_utils import EngineArgs from vllm.platforms import current_platform @@ -36,6 +37,7 @@ def make_request() -> EngineCoreRequest: ) +@fork_new_process_for_each_test def test_engine_core(monkeypatch): with monkeypatch.context() as m: @@ -78,7 +80,7 @@ def test_engine_core(monkeypatch): assert len(engine_core.scheduler.running) == 4 # Loop through until they are all done. - while len(engine_core.step()) > 0: + while len(engine_core.step().outputs) > 0: pass assert len(engine_core.scheduler.waiting) == 0 @@ -138,6 +140,7 @@ def test_engine_core(monkeypatch): assert len(engine_core.scheduler.running) == 0 +@fork_new_process_for_each_test def test_engine_core_advanced_sampling(monkeypatch): """ A basic end-to-end test to verify that the engine functions correctly @@ -167,7 +170,7 @@ def test_engine_core_advanced_sampling(monkeypatch): assert len(engine_core.scheduler.waiting) == 1 assert len(engine_core.scheduler.running) == 0 # Loop through until they are all done. - while len(engine_core.step()) > 0: + while len(engine_core.step().outputs) > 0: pass assert len(engine_core.scheduler.waiting) == 0 diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 5a21806e57a11..e2c728b22d481 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -6,6 +6,7 @@ from typing import Dict, List import pytest from transformers import AutoTokenizer +from tests.utils import fork_new_process_for_each_test from vllm import SamplingParams from vllm.engine.arg_utils import EngineArgs from vllm.platforms import current_platform @@ -42,7 +43,7 @@ def make_request(params: SamplingParams) -> EngineCoreRequest: def loop_until_done(client: EngineCoreClient, outputs: Dict): while True: - engine_core_outputs = client.get_output() + engine_core_outputs = client.get_output().outputs if len(engine_core_outputs) == 0: break @@ -60,7 +61,7 @@ def loop_until_done(client: EngineCoreClient, outputs: Dict): async def loop_until_done_async(client: EngineCoreClient, outputs: Dict): while True: - engine_core_outputs = await client.get_output_async() + engine_core_outputs = await client.get_output_async().outputs if len(engine_core_outputs) == 0: break @@ -75,6 +76,7 @@ async def loop_until_done_async(client: EngineCoreClient, outputs: Dict): break +@fork_new_process_for_each_test @pytest.mark.parametrize("multiprocessing_mode", [True, False]) def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): @@ -143,6 +145,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): client.abort_requests([request.request_id]) +@fork_new_process_for_each_test @pytest.mark.asyncio async def test_engine_core_client_asyncio(monkeypatch): diff --git a/tests/v1/engine/test_detokenizer.py 
b/tests/v1/engine/test_output_processor.py similarity index 65% rename from tests/v1/engine/test_detokenizer.py rename to tests/v1/engine/test_output_processor.py index aeae697ca32b0..4735c6f947537 100644 --- a/tests/v1/engine/test_detokenizer.py +++ b/tests/v1/engine/test_output_processor.py @@ -3,11 +3,18 @@ from typing import List import pytest from transformers import AutoTokenizer +from vllm.engine.arg_utils import EngineArgs from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest -from vllm.v1.engine.detokenizer import Detokenizer +from vllm.v1.engine.output_processor import OutputProcessor TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" +VLLM_CONFIG = EngineArgs(model=TOKENIZER_NAME).create_engine_config() +TOKENIZER_GROUP = init_tokenizer_from_configs(VLLM_CONFIG.model_config, + VLLM_CONFIG.scheduler_config, + VLLM_CONFIG.parallel_config, + VLLM_CONFIG.lora_config) tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) FULL_STRINGS = [ @@ -66,7 +73,7 @@ class MockEngineCore: "request_output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) def test_incremental_detokenization(request_output_kind: RequestOutputKind): - detokenizer = Detokenizer(TOKENIZER_NAME) + output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False) engine_core = MockEngineCore(GENERATION_TOKENS) # Make N requests. @@ -93,7 +100,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind): # Add requests to the detokenizer. for request in requests: - detokenizer.add_request(request) + output_processor.add_request(request) gen_strings = {} gen_tokens = {} @@ -104,7 +111,9 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind): break # Step the Detokenizer. - request_outputs, requests_to_abort = detokenizer.step(outputs) + processed_outputs = output_processor.process_outputs(outputs, ) + request_outputs = processed_outputs.request_outputs + requests_to_abort = processed_outputs.reqs_to_abort assert len(requests_to_abort) == 0 # Update tracking. @@ -128,13 +137,13 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind): assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}" assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}" - assert detokenizer.get_num_unfinished_requests() == 0 - assert not detokenizer.has_unfinished_requests() + assert output_processor.get_num_unfinished_requests() == 0 + assert not output_processor.has_unfinished_requests() @pytest.mark.parametrize("include_stop_str_in_output", [True, False]) def test_stop_string(include_stop_str_in_output: bool): - detokenizer = Detokenizer(TOKENIZER_NAME) + output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False) engine_core = MockEngineCore(GENERATION_TOKENS) # Make N requests. @@ -162,7 +171,7 @@ def test_stop_string(include_stop_str_in_output: bool): # Add requests to the detokenizer. for request in requests: - detokenizer.add_request(request) + output_processor.add_request(request) gen_strings = {} aborted = [] @@ -173,7 +182,9 @@ def test_stop_string(include_stop_str_in_output: bool): break # Step the Detokenizer. 
- request_outputs, requests_to_abort = detokenizer.step(outputs) + processed_outputs = output_processor.process_outputs(outputs) + request_outputs = processed_outputs.request_outputs + requests_to_abort = processed_outputs.reqs_to_abort for request_output in request_outputs: # If aborted, we should not get a request output. assert request_output.request_id not in aborted @@ -214,5 +225,71 @@ def test_stop_string(include_stop_str_in_output: bool): assert gen_str == ref_str_exc_stop, ( f"{gen_str=}, {ref_str_exc_stop=}") - assert detokenizer.get_num_unfinished_requests() == 0 - assert not detokenizer.has_unfinished_requests() + assert output_processor.get_num_unfinished_requests() == 0 + assert not output_processor.has_unfinished_requests() + + +def test_iteration_stats(): + output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=True) + engine_core = MockEngineCore(GENERATION_TOKENS) + + # Make N requests. + requests = [ + EngineCoreRequest( + request_id=f"request-{idx}", + prompt=prompt, + prompt_token_ids=prompt_tokens, + arrival_time=0, + mm_inputs=None, + mm_hashes=None, + mm_placeholders=None, + eos_token_id=None, + lora_request=None, + sampling_params=SamplingParams(), + ) for idx, ( + prompt, + prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + ] + + # Add all requests except one to the OutputProcessor. + num_active = len(GENERATION_TOKENS) - 1 + for request in requests[:num_active]: + output_processor.add_request(request) + inactive_request = requests[num_active] + + # First iteration has 2 prefills. + outputs = engine_core.get_outputs()[:num_active] + processed_outputs = output_processor.process_outputs(outputs) + iteration_stats = processed_outputs.iteration_stats + total_prompt_tokens = sum( + [len(prompt_tokens) for prompt_tokens in PROMPT_TOKENS[:num_active]]) + + assert iteration_stats.num_prompt_tokens == total_prompt_tokens + assert iteration_stats.num_generation_tokens == num_active + + # Just decodes in this step. + outputs = engine_core.get_outputs()[:num_active] + processed_outputs = output_processor.process_outputs(outputs) + iteration_stats = processed_outputs.iteration_stats + + assert iteration_stats.num_prompt_tokens == 0 + assert iteration_stats.num_generation_tokens == num_active + + # Add a new request - prefill and 2 decodes in this step. + output_processor.add_request(inactive_request) + num_active += 1 + outputs = engine_core.get_outputs()[:num_active] + processed_outputs = output_processor.process_outputs(outputs) + iteration_stats = processed_outputs.iteration_stats + total_prompt_tokens = len(PROMPT_TOKENS[num_active - 1]) + + assert iteration_stats.num_prompt_tokens == total_prompt_tokens + assert iteration_stats.num_generation_tokens == num_active + + # Just decodes in this step. 
+ outputs = engine_core.get_outputs()[:num_active] + processed_outputs = output_processor.process_outputs(outputs) + iteration_stats = processed_outputs.iteration_stats + + assert iteration_stats.num_prompt_tokens == 0 + assert iteration_stats.num_generation_tokens == num_active diff --git a/tests/v1/test_utils.py b/tests/v1/test_utils.py new file mode 100644 index 0000000000000..ac773b611f406 --- /dev/null +++ b/tests/v1/test_utils.py @@ -0,0 +1,62 @@ +from typing import List + +import torch + +from vllm.v1.utils import bind_kv_cache + + +def test_bind_kv_cache(): + from vllm.attention import Attention + + ctx = { + 'layers.0.self_attn': Attention(32, 128, 0.1), + 'layers.1.self_attn': Attention(32, 128, 0.1), + 'layers.2.self_attn': Attention(32, 128, 0.1), + 'layers.3.self_attn': Attention(32, 128, 0.1), + } + kv_cache = { + 'layers.0.self_attn': torch.zeros((1, )), + 'layers.1.self_attn': torch.zeros((1, )), + 'layers.2.self_attn': torch.zeros((1, )), + 'layers.3.self_attn': torch.zeros((1, )), + } + runner_kv_caches: List[torch.Tensor] = [] + bind_kv_cache(kv_cache, ctx, runner_kv_caches) + assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[ + 'layers.0.self_attn'] + assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[ + 'layers.1.self_attn'] + assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[ + 'layers.2.self_attn'] + assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[ + 'layers.3.self_attn'] + + assert runner_kv_caches[0] is kv_cache['layers.0.self_attn'] + assert runner_kv_caches[1] is kv_cache['layers.1.self_attn'] + assert runner_kv_caches[2] is kv_cache['layers.2.self_attn'] + assert runner_kv_caches[3] is kv_cache['layers.3.self_attn'] + + +def test_bind_kv_cache_non_attention(): + from vllm.attention import Attention + + # example from Jamba PP=2 + ctx = { + 'model.layers.20.attn': Attention(32, 128, 0.1), + 'model.layers.28.attn': Attention(32, 128, 0.1), + } + kv_cache = { + 'model.layers.20.attn': torch.zeros((1, )), + 'model.layers.28.attn': torch.zeros((1, )), + } + + runner_kv_caches: List[torch.Tensor] = [] + bind_kv_cache(kv_cache, ctx, runner_kv_caches) + + assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[ + 'model.layers.20.attn'] + assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[ + 'model.layers.28.attn'] + + assert runner_kv_caches[0] is kv_cache['model.layers.20.attn'] + assert runner_kv_caches[1] is kv_cache['model.layers.28.attn'] diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt index a06956ce18a93..272206d4502e9 100644 --- a/tests/weight_loading/models.txt +++ b/tests/weight_loading/models.txt @@ -30,4 +30,5 @@ marlin, nm-testing/zephyr-beta-7b-marlin-g128, main marlin, robertgshaw2/zephyr-7b-beta-channelwise-marlin, main qqq, HandH1998/QQQ-Llama-3-8b-g128, main qqq, HandH1998/QQQ-Llama-3-8b, main -hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main \ No newline at end of file +hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main +None, mgleize/fairseq2-dummy-Llama-3.2-1B, main \ No newline at end of file diff --git a/tests/weight_loading/test_weight_loading.py b/tests/weight_loading/test_weight_loading.py index 199731bdc21fe..7a3786456d0d6 100644 --- a/tests/weight_loading/test_weight_loading.py +++ b/tests/weight_loading/test_weight_loading.py @@ -20,12 +20,13 @@ def test_weight_loading(vllm_runner): """ Test parameter weight loading with tp>1. 
""" - with vllm_runner(model_name=MODEL_NAME, - revision=REVISION, - dtype=torch.half if QUANTIZATION == "gptq" else "auto", - quantization=QUANTIZATION, - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=2) as model: + with vllm_runner( + model_name=MODEL_NAME, + revision=REVISION, + dtype=torch.half if QUANTIZATION == "gptq" else "auto", + quantization=None if QUANTIZATION == "None" else QUANTIZATION, + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=2) as model: output = model.generate_greedy("Hello world!", max_tokens=20) print(output) diff --git a/tools/actionlint.sh b/tools/actionlint.sh deleted file mode 100755 index f6a8b5e83a2de..0000000000000 --- a/tools/actionlint.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -if command -v actionlint &> /dev/null; then - actionlint "$@" - exit 0 -elif [ -x ./actionlint ]; then - ./actionlint "$@" - exit 0 -fi - -# download a binary to the current directory - v1.7.3 -bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash) -./actionlint "$@" diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py index 49366abc7fb56..54cd60c2bc95b 100644 --- a/tools/profiler/print_layerwise_table.py +++ b/tools/profiler/print_layerwise_table.py @@ -31,7 +31,7 @@ if __name__ == "__main__": type=str, required=True, help="json trace file output by " - "examples/offline_inference/offline_profile.py") + "examples/offline_inference/profiling.py") parser.add_argument("--phase", type=str, required=True, diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py index fa88ed4204d8f..cb56ebd69a8c1 100644 --- a/tools/profiler/visualize_layerwise_profile.py +++ b/tools/profiler/visualize_layerwise_profile.py @@ -538,7 +538,7 @@ if __name__ == "__main__": type=str, required=True, help="json trace file output by \ - examples/offline_inference/offline_profile.py") + examples/offline_inference/profiling.py") parser.add_argument("--output-directory", type=str, required=False, diff --git a/tools/shellcheck.sh b/tools/shellcheck.sh index d99fa77b96351..7efb3cabc64fe 100755 --- a/tools/shellcheck.sh +++ b/tools/shellcheck.sh @@ -19,4 +19,4 @@ if ! [ -x "$(command -v shellcheck)" ]; then fi # TODO - fix warnings in .buildkite/run-amd-test.sh -find . -name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck "{}"' +find . -name "*.sh" ".git" -prune -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck -s bash "{}"' diff --git a/tools/sphinx-lint.sh b/tools/sphinx-lint.sh deleted file mode 100755 index 04f8075c5527f..0000000000000 --- a/tools/sphinx-lint.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -sphinx-lint --disable trailing-whitespace,missing-final-newline docs diff --git a/vllm/__init__.py b/vllm/__init__.py index 45252b93e3d54..a533dba561c00 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -17,6 +17,44 @@ from vllm.sampling_params import SamplingParams from .version import __version__, __version_tuple__ + +def configure_as_vllm_process(): + """ + set some common config/environment variables that should be set + for all processes created by vllm and all processes + that interact with vllm workers. 
+ """ + import os + + import torch + + # see https://github.com/NVIDIA/nccl/issues/1234 + os.environ['NCCL_CUMEM_ENABLE'] = '0' + + # see https://github.com/vllm-project/vllm/issues/10480 + os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1' + # see https://github.com/vllm-project/vllm/issues/10619 + torch._inductor.config.compile_threads = 1 + + from vllm.platforms import current_platform + + if current_platform.is_xpu(): + # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158 + torch._dynamo.config.disable = True + elif current_platform.is_hpu(): + # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1) + # does not support torch.compile + # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for + # torch.compile support + is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1' + if is_lazy: + torch._dynamo.config.disable = True + # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only) + # requires enabling lazy collectives + # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501 + os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true' + + __all__ = [ "__version__", "__version_tuple__", @@ -42,4 +80,5 @@ __all__ = [ "AsyncEngineArgs", "initialize_ray_cluster", "PoolingParams", + "configure_as_vllm_process", ] diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index afb350591e562..d04cbbc0a9eed 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -34,33 +34,6 @@ else: from torch.library import impl_abstract as register_fake -# activation ops -def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - torch.ops._C.gelu_and_mul(out, x) - - -def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - torch.ops._C.gelu_tanh_and_mul(out, x) - - -def fatrelu_and_mul(out: torch.Tensor, - x: torch.Tensor, - threshold: float = 0.0) -> None: - torch.ops._C.fatrelu_and_mul(out, x, threshold) - - -def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: - torch.ops._C.gelu_fast(out, x) - - -def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: - torch.ops._C.gelu_new(out, x) - - -def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: - torch.ops._C.gelu_quick(out, x) - - # page attention ops def paged_attention_v1( out: torch.Tensor, diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index f5dcaea79af93..737559bfe70ca 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -31,6 +31,10 @@ class AttentionType: class AttentionBackend(ABC): """Abstract class for attention backends.""" + # For some attention backends, we allocate an output tensor before + # calling the custom op. When piecewise cudagraph is enabled, this + # makes sure the output tensor is allocated inside the cudagraph. 
+ accept_output_buffer: bool = False @staticmethod @abstractmethod diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index 7089d59392c36..77cfa8490172b 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -89,8 +89,7 @@ class BlocksparseFlashAttentionBackend(AttentionBackend): @staticmethod def get_name() -> str: - # For attention layer compatibility - return "FLASH_ATTN" + return "BLOCK_SPARSE_FLASH_ATTN" @staticmethod def get_impl_cls() -> Type["BlocksparseFlashAttentionImpl"]: diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 23ea244f07dfe..48b3e8d177ec9 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -29,6 +29,8 @@ from vllm.vllm_flash_attn import (flash_attn_varlen_func, class FlashAttentionBackend(AttentionBackend): + accept_output_buffer: bool = True + @staticmethod def get_supported_head_sizes() -> List[int]: return [32, 64, 96, 128, 160, 192, 224, 256] diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index a11462b2068a5..6ca75fabdfc38 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -256,7 +256,12 @@ class FlashInferState(AttentionState): def begin_forward(self, model_input): assert not self._is_graph_capturing state = self - if model_input.attn_metadata.use_cuda_graph: + use_cuda_graph = model_input.attn_metadata.use_cuda_graph + is_decode = model_input.attn_metadata.num_prefills == 0 + # In case of multistep chunked-prefill, there might be prefill requests + # scheduled while CUDA graph mode is enabled. We don't run graph in that + # case. + if use_cuda_graph and is_decode: batch_size = model_input.input_tokens.shape[0] state = (self.runner.graph_runners[model_input.virtual_engine] [batch_size].attn_state) @@ -429,10 +434,24 @@ class FlashInferMetadata(AttentionMetadata): Update metadata in-place to advance one decode step. """ - assert not turn_prefills_into_decodes, \ - ("Chunked prefill is not supported with flashinfer yet." - "turn_prefills_into_decodes is a Multi-Step + Chunked-Prefill " - "specific parameter.") + if turn_prefills_into_decodes: + # When Multi-Step is enabled with Chunked-Prefill, prefills and + # decodes are scheduled together. In the first step, all the + # prefills turn into decodes. This update reflects that + # conversion. + assert self.num_decode_tokens + self.num_prefills == num_seqs + # Flashinfer doesn't support speculative decoding + chunked-prefill + # + multi-step scheduling yet. 
+ assert self.decode_query_len == 1 + self.num_decode_tokens += self.num_prefills + self.num_prefills = 0 + self.num_prefill_tokens = 0 + self.max_prefill_seq_len = 0 + self.max_query_len = 1 + + self.slot_mapping = self.slot_mapping[:num_seqs] + else: + assert self.seq_lens_tensor is not None assert num_seqs > 0 assert num_queries > 0 diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index f1b3598e60b54..e2403306950a3 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -101,7 +101,9 @@ class Attention(nn.Module): self.num_heads = num_heads self.head_size = head_size self.num_kv_heads = num_kv_heads + self.sliding_window = sliding_window self.backend = backend_name_to_enum(attn_backend.get_name()) + self.dtype = dtype # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how # torch.compile works by registering the attention as one giant @@ -110,17 +112,20 @@ class Attention(nn.Module): self.use_direct_call = not current_platform.is_cuda_alike( ) and not current_platform.is_cpu() - # For some attention backends, we allocate an output tensor before - # calling the custom op. When piecewise cudagraph is enabled, this - # makes sure the output tensor is allocated inside the cudagraph. - self.use_output = self.backend == _Backend.FLASH_ATTN or \ - self.backend == _Backend.FLASH_ATTN_VLLM_V1 + self.use_output = attn_backend.accept_output_buffer compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: raise ValueError(f"Duplicate layer name: {prefix}") compilation_config.static_forward_context[prefix] = self self.layer_name = prefix self.attn_type = attn_type + # use a placeholder kv cache tensor during init, which will be replaced + # by bind_kv_cache + # this variable will not be accessed if use_direct_call is True + self.kv_cache = [ + torch.tensor([]) for _ in range(get_current_vllm_config( + ).parallel_config.pipeline_parallel_size) + ] def forward( self, @@ -130,12 +135,7 @@ class Attention(nn.Module): kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, ) -> torch.Tensor: - - if self.use_direct_call: - return self.impl.forward(query, key, value, kv_cache, - attn_metadata, self._k_scale, - self._v_scale) - elif self.use_output: + if self.use_output: output = torch.empty_like(query) hidden_size = query.size(-1) # Reshape the query, key, and value tensors. 
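Aside on the `layer.py` change here: `unified_attention` and `unified_attention_with_output` no longer take `kv_cache` as an op argument. Each `Attention` module instead holds one placeholder tensor per pipeline-parallel rank in `self.kv_cache`, which `bind_kv_cache` later swaps for the real cache, and the op body resolves the tensor through the forward context. A minimal runnable sketch of that lookup pattern, assuming simplified stand-in classes (`ToyForwardContext` and `ToyAttentionLayer` are illustrative, not vLLM's real `ForwardContext`/`Attention`):

```python
# Minimal sketch of the kv-cache lookup pattern introduced above, with
# simplified stand-in classes rather than vLLM's real types.
from dataclasses import dataclass, field
from typing import Dict, List

import torch


@dataclass
class ToyAttentionLayer:
    # One placeholder tensor per pipeline-parallel (virtual engine) rank;
    # a binding step replaces each placeholder with the real cache tensor.
    kv_cache: List[torch.Tensor] = field(
        default_factory=lambda: [torch.tensor([])])


@dataclass
class ToyForwardContext:
    attn_layers: Dict[str, ToyAttentionLayer]
    virtual_engine: int = 0


def toy_unified_attention(query: torch.Tensor, layer_name: str,
                          ctx: ToyForwardContext) -> torch.Tensor:
    # The op no longer receives kv_cache as an argument: it looks the
    # tensor up through the forward context, so the cache never appears
    # among the traced graph inputs.
    layer = ctx.attn_layers[layer_name]
    kv_cache = layer.kv_cache[ctx.virtual_engine]
    assert kv_cache.numel() > 0, "kv cache must be bound before forward"
    # ... a real implementation would run the attention kernel here ...
    return query.clone()


ctx = ToyForwardContext(
    attn_layers={"layers.0.self_attn": ToyAttentionLayer()})
# The binding step (bind_kv_cache in this diff) swaps the placeholder
# for the allocated cache tensor in place.
ctx.attn_layers["layers.0.self_attn"].kv_cache[0] = torch.zeros(16)
out = toy_unified_attention(torch.randn(2, 8), "layers.0.self_attn", ctx)
```

Keeping the cache out of the op signature is also what lets the v0 path later in this diff drop the `vllm.unified_attention*` entries from `splitting_ops`: Inductor no longer sees the cache tensor, so its shape is not baked into the compiled graph.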
@@ -147,12 +147,19 @@ class Attention(nn.Module): key = key.view(-1, self.num_kv_heads, self.head_size) if value is not None: value = value.view(-1, self.num_kv_heads, self.head_size) - torch.ops.vllm.unified_attention_with_output( - query, key, value, output, kv_cache, self.layer_name) + if self.use_direct_call: + unified_attention_with_output(query, key, value, output, + self.layer_name) + else: + torch.ops.vllm.unified_attention_with_output( + query, key, value, output, self.layer_name) return output.view(-1, hidden_size) else: - return torch.ops.vllm.unified_attention(query, key, value, - kv_cache, self.layer_name) + if self.use_direct_call: + return unified_attention(query, key, value, self.layer_name) + else: + return torch.ops.vllm.unified_attention( + query, key, value, self.layer_name) def extra_repr(self) -> str: s = f"head_size={self.impl.head_size}" # type: ignore @@ -185,11 +192,11 @@ class MultiHeadAttention(nn.Module): kv_cache_dtype=None, block_size=16, is_attention_free=False) - attn_backend = backend_name_to_enum(attn_backend.get_name()) - if attn_backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}: - attn_backend = _Backend.XFORMERS + backend = backend_name_to_enum(attn_backend.get_name()) + if backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}: + backend = _Backend.XFORMERS - self.attn_backend = attn_backend if attn_backend in { + self.attn_backend = backend if backend in { _Backend.TORCH_SDPA, _Backend.XFORMERS } else _Backend.TORCH_SDPA @@ -223,19 +230,19 @@ class MultiHeadAttention(nn.Module): value, scale=self.scale) out = out.transpose(1, 2) - return out.view(bsz, q_len, -1) + return out.reshape(bsz, q_len, -1) def unified_attention( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - kv_cache: torch.Tensor, layer_name: str, ) -> torch.Tensor: forward_context: ForwardContext = get_forward_context() - attn_metadata = forward_context.dynamic_forward_context - self = forward_context.static_forward_context[layer_name] + attn_metadata = forward_context.attn_metadata + self = forward_context.attn_layers[layer_name] + kv_cache = self.kv_cache[forward_context.virtual_engine] return self.impl.forward(query, key, value, kv_cache, attn_metadata, self._k_scale, self._v_scale) @@ -244,7 +251,6 @@ def unified_attention_fake( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - kv_cache: torch.Tensor, layer_name: str, ) -> torch.Tensor: return torch.empty_like(query).contiguous() @@ -253,7 +259,7 @@ def unified_attention_fake( direct_register_custom_op( op_name="unified_attention", op_func=unified_attention, - mutates_args=["kv_cache"], + mutates_args=[], fake_impl=unified_attention_fake, dispatch_key=current_platform.dispatch_key, ) @@ -264,12 +270,12 @@ def unified_attention_with_output( key: torch.Tensor, value: torch.Tensor, output: torch.Tensor, - kv_cache: torch.Tensor, layer_name: str, ) -> None: forward_context: ForwardContext = get_forward_context() - attn_metadata = forward_context.dynamic_forward_context - self = forward_context.static_forward_context[layer_name] + attn_metadata = forward_context.attn_metadata + self = forward_context.attn_layers[layer_name] + kv_cache = self.kv_cache[forward_context.virtual_engine] self.impl.forward(query, key, value, @@ -285,7 +291,6 @@ def unified_attention_with_output_fake( key: torch.Tensor, value: torch.Tensor, output: torch.Tensor, - kv_cache: torch.Tensor, layer_name: str, ) -> None: return @@ -294,7 +299,7 @@ def unified_attention_with_output_fake( direct_register_custom_op( 
op_name="unified_attention_with_output", op_func=unified_attention_with_output, - mutates_args=["kv_cache", "output"], + mutates_args=["output"], fake_impl=unified_attention_with_output_fake, dispatch_key=current_platform.dispatch_key, ) diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 0ff007c87b1c9..81ea6eefb5410 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -14,16 +14,18 @@ from vllm.utils import STR_BACKEND_ENV_VAR, resolve_obj_by_qualname logger = init_logger(__name__) -def backend_name_to_enum(backend_name: str) -> _Backend: +def backend_name_to_enum(backend_name: str) -> Optional[_Backend]: + """ + Convert a string backend name to a _Backend enum value. + + Returns: + * _Backend: enum value if backend_name is a valid in-tree type + * None: otherwise it's an invalid in-tree type or an out-of-tree platform is + loaded. + """ assert backend_name is not None - - backend_members = _Backend.__members__ - if backend_name not in backend_members: - raise ValueError(f"Invalid attention backend '{backend_name}'. " - f"Available backends: {', '.join(backend_members)} " - "(case-sensitive).") - - return _Backend[backend_name] + return _Backend[backend_name] if backend_name in _Backend.__members__ else \ + None def get_env_variable_attn_backend() -> Optional[_Backend]: diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 87655530cead4..955c25f300512 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -25,23 +25,30 @@ from .pass_manager import PostGradPassManager logger = init_logger(__name__) +@dataclasses.dataclass +class InductorArtifact: + hash_str: str = "" + file_path: str = "" + + class InductorHashCache: """ Disk format: a Python list of tuples, each tuple is - (runtime_shape, graph_index, hash_str) + (runtime_shape, graph_index, hash_str, file_path) We use list of tuple for readability. In-memory format: a defaultdict of dict, where the key is runtime_shape, and the value is a dict of graph_index to hash_str. - The data is essentially `Dict[Optional[int], Dict[int, str]]`, + The data is essentially `Dict[Optional[int], Dict[int, InductorArtifact]]`, we don't use json here because json doesn't support int as key. TODO: better off-the-shelf solution to serialize the data? """ def __init__(self, cache_dir: str, disabled: bool = False): - self.cache: defaultdict = defaultdict(dict) + self.cache: Dict[Optional[int], + Dict[int, InductorArtifact]] = defaultdict(dict) self.disabled = disabled self.cache_dir = cache_dir self.cache_file_path = os.path.join(cache_dir, @@ -66,14 +73,25 @@ class InductorHashCache: # because it is a safe way to parse Python literals. # do not use eval(), it is unsafe. list_data = ast.literal_eval(data) - for runtime_shape, graph_index, hash_str in list_data: - self.cache[runtime_shape][graph_index] = hash_str + for item in list_data: + runtime_shape = item[0] + graph_index = item[1] + hash_str = item[2] + # for compatibility of old version, + # where we don't have file_path. + # NOTE: after running the new code, the file_path + # will be updated. 
+ file_path = "" if len(item) == 3 else item[3] + self.cache[runtime_shape][graph_index] = InductorArtifact( + hash_str=hash_str, file_path=file_path) def serialize(self) -> str: data = [] - for runtime_shape, graph_index_to_hash_str in self.cache.items(): - for graph_index, hash_str in graph_index_to_hash_str.items(): - data.append((runtime_shape, graph_index, hash_str)) + for runtime_shape, value in self.cache.items(): + for graph_index, inductor_artifact in value.items(): + data.append( + (runtime_shape, graph_index, inductor_artifact.hash_str, + inductor_artifact.file_path)) printer = pprint.PrettyPrinter(indent=4) return printer.pformat(data) @@ -90,13 +108,14 @@ class InductorHashCache: return runtime_shape in self.cache and graph_index in self.cache[ runtime_shape] - def __getitem__(self, key: Tuple[Optional[int], int]) -> str: + def __getitem__(self, key: Tuple[Optional[int], int]) -> InductorArtifact: if self.disabled: raise KeyError("cannot read from disabled cache") runtime_shape, graph_index = key return self.cache[runtime_shape][graph_index] - def __setitem__(self, key: Tuple[Optional[int], int], value: str): + def __setitem__(self, key: Tuple[Optional[int], int], + value: InductorArtifact): # setitem for disabled cache is fine, because we # don't actually write to the disk runtime_shape, graph_index = key @@ -181,7 +200,8 @@ def wrap_inductor(graph: fx.GraphModule, if (runtime_shape, graph_index) in cache_data: # we compiled this graph before # so we can directly lookup the compiled graph via hash - hash_str = cache_data[(runtime_shape, graph_index)] + inductor_artifact = cache_data[(runtime_shape, graph_index)] + hash_str = inductor_artifact.hash_str if graph_index == 0: # adds some info logging for the first graph logger.info( @@ -199,6 +219,7 @@ def wrap_inductor(graph: fx.GraphModule, "Inductor cache lookup failed. Please remove" f"the cache file {cache_data.cache_file_path} and try again." # noqa ) + inductor_artifact.file_path = inductor_compiled_graph.current_callable.__code__.co_filename # noqa # Inductor calling convention (function signature): # f(list) -> tuple @@ -224,19 +245,20 @@ def wrap_inductor(graph: fx.GraphModule, # the assumption is that we don't have nested Inductor compilation. # compiled_fx_graph_hash will only be called once, and we can hook # it to get the hash of the compiled graph directly. 
- from torch._inductor.codecache import compiled_fx_graph_hash + + inductor_artifact = InductorArtifact() + from torch._inductor.codecache import (FxGraphCache, + compiled_fx_graph_hash) + original_load = FxGraphCache.load + + def hijack_load(*args, **kwargs): + inductor_compiled_graph = original_load(*args, **kwargs) + inductor_artifact.file_path = inductor_compiled_graph.current_callable.__code__.co_filename # noqa + return inductor_compiled_graph def hijack_compiled_fx_graph_hash(*args, **kwargs): out = compiled_fx_graph_hash(*args, **kwargs) - # store the hash in the cache - nonlocal cache_data - cache_data[(runtime_shape, graph_index)] = out[0] - if graph_index == 0: - # adds some info logging for the first graph - logger.info("Cache the graph of shape %s for later use", - str(runtime_shape)) - logger.debug("store the %s-th graph for shape %s via hash %s", - graph_index, str(runtime_shape), out[0]) + inductor_artifact.hash_str = out[0] return out def _check_can_cache(*args, **kwargs): @@ -251,19 +273,45 @@ def wrap_inductor(graph: fx.GraphModule, def _get_shape_env() -> AlwaysHitShapeEnv: return AlwaysHitShapeEnv() - with patch(# for hijacking the hash of the compiled graph - "torch._inductor.codecache.compiled_fx_graph_hash", - hijack_compiled_fx_graph_hash), \ - patch(# for providing a dummy shape environment - "torch._inductor.codecache.FxGraphCache._get_shape_env", - _get_shape_env), \ - patch(# for forcing the graph to be cached - "torch._inductor.codecache.FxGraphCache._check_can_cache", - _check_can_cache): + with ExitStack() as stack: + if not cache_data.disabled: + # compilation cache is enabled, patch several functions + + # hijack to get the compiled graph itself + stack.enter_context( + patch("torch._inductor.codecache.FxGraphCache.load", + hijack_load)) + + # for hijacking the hash of the compiled graph + stack.enter_context( + patch("torch._inductor.codecache.compiled_fx_graph_hash", + hijack_compiled_fx_graph_hash)) + + # for providing a dummy shape environment + stack.enter_context( + patch( + "torch._inductor.codecache.FxGraphCache._get_shape_env", + _get_shape_env)) + + # for forcing the graph to be cached + stack.enter_context( + patch( + "torch._inductor.codecache.FxGraphCache._check_can_cache", + _check_can_cache)) + compiled_graph = compile_fx(graph, example_inputs, config_patches=current_config) - + # store the inductor_artifact in the cache + cache_data[(runtime_shape, graph_index)] = inductor_artifact + if graph_index == 0: + # adds some info logging for the first graph + logger.info("Cache the graph of shape %s for later use", + str(runtime_shape)) + logger.debug( + "store the %s-th graph for shape %s via hash %s from file %s", + graph_index, str(runtime_shape), inductor_artifact.hash_str, + inductor_artifact.file_path) # after compiling the last graph, record the end time if graph_index == num_graphs - 1: now = time.time() @@ -576,9 +624,13 @@ class VllmBackend: ] # index of tensors that have symbolic shapes (batch size) + # for weights and static buffers, they will have concrete shapes. + # symbolic shape only happens for input tensors. 
+ from torch.fx.experimental.symbolic_shapes import is_symbolic self.sym_tensor_indices = [ i for i, x in enumerate(fake_args) - if isinstance(x, torch._subclasses.fake_tensor.FakeTensor) + if isinstance(x, torch._subclasses.fake_tensor.FakeTensor) and \ + any(is_symbolic(d) for d in x.size()) ] # compiler managed cudagraph input buffers diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 10513111ea7f1..38f284794b8db 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -76,8 +76,8 @@ def support_torch_compile( During runtime, when we actually mark dimensions of tensors, it depends on the value of arguments: - - if it is a single integer, the corresponding dimension of the argument - will be marked as dynamic. + - if it is a single integer (can be negative), the corresponding dimension + of the argument will be marked as dynamic. - if it is `None`, ignored. - if it is `IntermediateTensors`, all the tensors in the intermediate tensors will be marked as dynamic. @@ -177,10 +177,20 @@ def _support_torch_compile( for k, dims in dynamic_arg_dims.items(): arg = bound_args.arguments.get(k) if arg is not None: + dims = [dims] if isinstance(dims, int) else dims if isinstance(arg, torch.Tensor): + # In case dims is specified with negative indexing + dims = [ + arg.ndim + dim if dim < 0 else dim for dim in dims + ] torch._dynamo.mark_dynamic(arg, dims) elif isinstance(arg, IntermediateTensors): for tensor in arg.tensors.values(): + # In case dims is specified with negative indexing + dims = [ + tensor.ndim + dim if dim < 0 else dim + for dim in dims + ] torch._dynamo.mark_dynamic(tensor, dims) else: raise ValueError( diff --git a/vllm/config.py b/vllm/config.py index 19609085cc960..4698a05020332 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -357,6 +357,10 @@ class ModelConfig: supported_tasks, task = self._resolve_task(task, self.hf_config) self.supported_tasks = supported_tasks self.task: Final = task + if self.task in ("draft", "generate"): + self.truncation_side = "left" + else: + self.truncation_side = "right" self.pooler_config = self._init_pooler_config(override_pooler_config) self.logits_processor_pattern = logits_processor_pattern @@ -553,7 +557,7 @@ class ModelConfig: optimized_quantization_methods = [ "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin", "awq_marlin", "fbgemm_fp8", "compressed_tensors", - "compressed-tensors", "experts_int8" + "compressed-tensors", "experts_int8", "quark" ] if self.quantization is not None: self.quantization = self.quantization.lower() @@ -603,10 +607,12 @@ class ModelConfig: self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, self.max_model_len) - if (self.hf_config.model_type == 'deepseek_v3' + MODEL_NOT_SUPPORT_CUDA_GRAPH = ['deepseek_v3', 'mllama'] + if (self.hf_config.model_type in MODEL_NOT_SUPPORT_CUDA_GRAPH and not self.enforce_eager): - logger.warning("CUDA graph is not supported for Deepseek V3 yet, " - "fallback to the eager mode.") + logger.warning( + "CUDA graph is not supported for %s yet, falling back to " + "eager mode.", self.hf_config.model_type) self.enforce_eager = True def _verify_bnb_config(self) -> None: @@ -729,9 +735,12 @@ class ModelConfig: if hasattr(self.hf_text_config, "model_type") and (self.hf_text_config.model_type in ('deepseek_v2', 'deepseek_v3')): - # FlashAttention supports only head_size 32, 64, 128, 256, - # we need to pad head_size 192 to 256 - return 256 + qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim", + 0) + 
qk_nope_head_dim = getattr(self.hf_text_config, "qk_nope_head_dim", + 0) + if qk_rope_head_dim and qk_nope_head_dim: + return qk_rope_head_dim + qk_nope_head_dim if self.is_attention_free: return 0 @@ -1294,8 +1303,11 @@ class ParallelConfig: from vllm.executor import ray_utils backend = "mp" ray_found = ray_utils.ray_is_available() - if (current_platform.is_cuda() - and cuda_device_count_stateless() < self.world_size): + if current_platform.is_neuron(): + # Neuron uses a single process to control multiple devices + backend = "uni" + elif (current_platform.is_cuda() + and cuda_device_count_stateless() < self.world_size): if not ray_found: raise ValueError("Unable to load Ray which is " "required for multi-node inference, " @@ -1328,13 +1340,15 @@ class ParallelConfig: from vllm.executor.executor_base import ExecutorBase from vllm.platforms import current_platform if self.distributed_executor_backend not in ( - "ray", "mp", None) and not (isinstance( + "ray", "mp", "uni", + "external_launcher", None) and not (isinstance( self.distributed_executor_backend, type) and issubclass( self.distributed_executor_backend, ExecutorBase)): raise ValueError( "Unrecognized distributed executor backend " f"{self.distributed_executor_backend}. Supported " - "values are 'ray', 'mp' or custom ExecutorBase subclass.") + "values are 'ray', 'mp', 'uni', 'external_launcher', or a" + " custom ExecutorBase subclass.") if self.use_ray: from vllm.executor import ray_utils ray_utils.assert_ray_available() @@ -1379,13 +1393,15 @@ class SchedulerConfig: is_multimodal_model: bool = False - # FIXME(woosuk & ywang96): Below are placeholder values. We need to - # calculate the actual values from the configurations. - # Multimodal encoder run compute budget, only used in V1 - max_num_encoder_input_tokens = 16384 + # NOTE: The following multimodal encoder budget will be initialized to + # max_num_batched_tokens and overridden in case max multimodal embedding + # size is larger. + # TODO (ywang96): Make these configurable. + # Multimodal encoder compute budget, only used in V1 + max_num_encoder_input_tokens: int = field(default=None) # type: ignore # Multimodal encoder cache size, only used in V1 - encoder_cache_size = 16384 + encoder_cache_size: int = field(default=None) # type: ignore # Whether to perform preemption by swapping or # recomputation. If not specified, we determine the mode as follows: @@ -1459,6 +1475,9 @@ class SchedulerConfig: _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, ) + self.max_num_encoder_input_tokens = self.max_num_batched_tokens + self.encoder_cache_size = self.max_num_batched_tokens + if self.enable_chunked_prefill: logger.info( "Chunked prefill is enabled with max_num_batched_tokens=%d.", @@ -2124,8 +2143,7 @@ class MultiModalConfig: limit_per_prompt: Mapping[str, int] = field(default_factory=dict) """ - The maximum number of multi-modal input instances allowed per prompt - for each :class:`~vllm.multimodal.MultiModalPlugin`. + The maximum number of input items allowed per prompt for each modality. """ def compute_hash(self) -> str: @@ -2780,7 +2798,6 @@ class CompilationConfig(BaseModel): compilation_time: float = PrivateAttr # Per-model forward context - # Mainly used to store attention cls # Map from layer name to the attention cls static_forward_context: Dict[str, Any] = PrivateAttr @@ -2845,17 +2862,8 @@ class CompilationConfig(BaseModel): "vllm.unified_attention_with_output", ] else: - # v0 can use full graph compilation without splitting, - # splitting is optional. - # right now we still need it.
kv cache shape - # will be included in the graph if we don't split - # the graph. - # TODO: hide kv cache in static forward context - # so that inductor does not see it. - self.splitting_ops = [ - "vllm.unified_attention", - "vllm.unified_attention_with_output", - ] + # v0 uses full graph compilation + self.splitting_ops = [] for k, v in self.inductor_passes.items(): if not isinstance(v, str): @@ -3157,7 +3165,8 @@ class VllmConfig: if self.compilation_config is None: self.compilation_config = CompilationConfig() - if envs.VLLM_USE_V1 and not self.model_config.enforce_eager: + if envs.VLLM_USE_V1 and self.model_config is not None and \ + not self.model_config.enforce_eager: # NOTE(woosuk): Currently, we use inductor because the piecewise # CUDA graphs do not work properly with the custom CUDA kernels. # FIXME(woosuk): Disable inductor to reduce the compilation time diff --git a/vllm/connections.py b/vllm/connections.py index e785a0b3ebd74..4c9f4f40cf640 100644 --- a/vllm/connections.py +++ b/vllm/connections.py @@ -29,7 +29,7 @@ class HTTPConnection: # required, so that the client is only accessible inside async event loop async def get_async_client(self) -> aiohttp.ClientSession: if self._async_client is None or not self.reuse_client: - self._async_client = aiohttp.ClientSession() + self._async_client = aiohttp.ClientSession(trust_env=True) return self._async_client diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py index 4ace03ff1184e..7780e2dfa317d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py @@ -35,6 +35,7 @@ class SimpleConnector(KVConnectorBase): ): self.config = config.kv_transfer_config + self.tp_size = config.parallel_config.tensor_parallel_size if self.config.kv_connector == "PyNcclConnector": from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import ( @@ -161,7 +162,7 @@ class SimpleConnector(KVConnectorBase): end_layer = model_executable.model.end_layer model_config = model_executable.model.config - num_heads = model_config.num_key_value_heads + num_heads = int(model_config.num_key_value_heads / self.tp_size) hidden_size = model_config.hidden_size num_attention_heads = model_config.num_attention_heads head_size = int(hidden_size / num_attention_heads) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index be7f16ef52a47..bf8b30cccd5f6 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -862,12 +862,14 @@ def init_model_parallel_group( ) -> GroupCoordinator: if use_custom_allreduce is None: use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE + from vllm.platforms import current_platform return GroupCoordinator( group_ranks=group_ranks, local_rank=local_rank, torch_distributed_backend=backend, - use_pynccl=True, - use_custom_allreduce=use_custom_allreduce, + use_pynccl=current_platform.is_cuda_alike(), + use_custom_allreduce=current_platform.is_cuda_alike() + and use_custom_allreduce, use_tpu_communicator=True, use_hpu_communicator=True, use_xpu_communicator=True, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 0850bab6bb7e1..a4f4c9558d056 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -238,7 +238,7 @@ class EngineArgs: choices=get_args(TaskOption), help='The task to use the model for. 
Each vLLM instance only ' 'supports one task, even if the same model can be used for ' - 'multiple tasks. When the model only supports one task, "auto" ' + 'multiple tasks. When the model only supports one task, ``"auto"`` ' 'can be used to select it; otherwise, you must specify explicitly ' 'which task to use.') parser.add_argument( @@ -250,7 +250,7 @@ class EngineArgs: parser.add_argument( '--skip-tokenizer-init', action='store_true', - help='Skip initialization of tokenizer and detokenizer') + help='Skip initialization of tokenizer and detokenizer.') parser.add_argument( '--revision', type=nullable_str, @@ -388,7 +388,7 @@ class EngineArgs: # Parallel arguments parser.add_argument( '--distributed-executor-backend', - choices=['ray', 'mp'], + choices=['ray', 'mp', 'uni', 'external_launcher'], default=EngineArgs.distributed_executor_backend, help='Backend to use for distributed model ' 'workers, either "ray" or "mp" (multiprocessing). If the product ' @@ -401,7 +401,7 @@ class EngineArgs: parser.add_argument( '--worker-use-ray', action='store_true', - help='Deprecated, use --distributed-executor-backend=ray.') + help='Deprecated, use ``--distributed-executor-backend=ray``.') parser.add_argument('--pipeline-parallel-size', '-pp', type=int, @@ -430,7 +430,7 @@ class EngineArgs: choices=[8, 16, 32, 64, 128], help='Token block size for contiguous chunks of ' 'tokens. This is ignored on neuron devices and ' - 'set to max-model-len. On CUDA devices, ' + 'set to ``--max-model-len``. On CUDA devices, ' 'only block sizes up to 32 are supported. ' 'On HPU devices, block size defaults to 128.') @@ -439,12 +439,12 @@ class EngineArgs: parser.add_argument( '--enable-prefix-caching', action=argparse.BooleanOptionalAction, default=EngineArgs.enable_prefix_caching, help="Enables automatic prefix caching. " - "Use --no-enable-prefix-caching to disable explicitly.", + "Use ``--no-enable-prefix-caching`` to disable explicitly.", ) parser.add_argument('--disable-sliding-window', action='store_true', help='Disables sliding window, ' - 'capping to sliding window size') + 'capping to sliding window size.') parser.add_argument('--use-v2-block-manager', action='store_true', default=True, @@ -538,7 +538,7 @@ class EngineArgs: default=None, type=json.loads, help='RoPE scaling configuration in JSON format. ' - 'For example, {"rope_type":"dynamic","factor":2.0}') + 'For example, ``{"rope_type":"dynamic","factor":2.0}``') parser.add_argument('--rope-theta', default=None, type=float, @@ -607,7 +607,7 @@ class EngineArgs: default=None, type=json.loads, help=('Overrides for the multimodal input mapping/processing, ' - 'e.g., image processor. For example: {"num_crops": 4}.')) + 'e.g., image processor. For example: ``{"num_crops": 4}``.')) parser.add_argument( '--disable-mm-preprocessor-cache', action='store_true', @@ -861,7 +861,7 @@ class EngineArgs: "of the provided names. The model name in the model " "field of a response will be the first name in this " "list. If not specified, the model name will be the " - "same as the `--model` argument. Noted that this name(s) " + "same as the ``--model`` argument. Note that this name(s) " "will also be used in `model_name` tag content of " "prometheus metrics, if multiple names provided, metrics " "tag will take the first one.") @@ -881,7 +881,7 @@ class EngineArgs: default=None, help="Valid choices are " + ",".join(ALLOWED_DETAILED_TRACE_MODULES) + - ". It makes sense to set this only if --otlp-traces-endpoint is" + ". It makes sense to set this only if ``--otlp-traces-endpoint`` is" " set.
If set, it will collect detailed traces for the specified " "modules. This involves use of possibly costly and or blocking " "operations and hence might have a performance impact.") @@ -908,13 +908,13 @@ class EngineArgs: type=json.loads, default=None, help="Override or set neuron device configuration. " - "e.g. {\"cast_logits_dtype\": \"bloat16\"}.'") + "e.g. ``{\"cast_logits_dtype\": \"bfloat16\"}``.") parser.add_argument( '--override-pooler-config', type=PoolerConfig.from_json, default=None, help="Override or set the pooling method for pooling models. " - "e.g. {\"pooling_type\": \"mean\", \"normalize\": false}.'") + "e.g. ``{\"pooling_type\": \"mean\", \"normalize\": false}``.") parser.add_argument('--compilation-config', '-O', diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 66a5089074ff5..08fef8250d483 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -18,9 +18,7 @@ from vllm.engine.async_timeout import asyncio_timeout from vllm.engine.llm_engine import LLMEngine, SchedulerOutputState from vllm.engine.metrics_types import StatLoggerBase from vllm.engine.protocol import EngineClient -from vllm.executor.executor_base import ExecutorAsyncBase -from vllm.executor.gpu_executor import GPUExecutorAsync -from vllm.executor.ray_utils import initialize_ray_cluster +from vllm.executor.executor_base import ExecutorBase from vllm.inputs import PromptType from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger @@ -620,69 +618,9 @@ class AsyncLLMEngine(EngineClient): rt.new_requests_event.set() @classmethod - def _get_executor_cls( - cls, engine_config: VllmConfig) -> Type[ExecutorAsyncBase]: - distributed_executor_backend = ( - engine_config.parallel_config.distributed_executor_backend) - if isinstance(distributed_executor_backend, type): - if not issubclass(distributed_executor_backend, ExecutorAsyncBase): - raise TypeError( - "distributed_executor_backend must be a subclass of " f"ExecutorAsyncBase.
Got {distributed_executor_backend}.") - executor_class = distributed_executor_backend - elif engine_config.device_config.device_type == "neuron": - from vllm.executor.neuron_executor import NeuronExecutorAsync - executor_class = NeuronExecutorAsync - elif engine_config.device_config.device_type == "tpu": - if distributed_executor_backend == "ray": - from vllm.executor.ray_tpu_executor import RayTPUExecutorAsync - executor_class = RayTPUExecutorAsync - else: - assert distributed_executor_backend is None - from vllm.executor.tpu_executor import TPUExecutorAsync - executor_class = TPUExecutorAsync - elif engine_config.device_config.device_type == "cpu": - from vllm.executor.cpu_executor import CPUExecutorAsync - executor_class = CPUExecutorAsync - elif engine_config.device_config.device_type == "hpu": - if distributed_executor_backend == "ray": - initialize_ray_cluster(engine_config.parallel_config) - from vllm.executor.ray_hpu_executor import RayHPUExecutorAsync - executor_class = RayHPUExecutorAsync - else: - from vllm.executor.hpu_executor import HPUExecutorAsync - executor_class = HPUExecutorAsync - elif engine_config.device_config.device_type == "openvino": - assert distributed_executor_backend is None, ( - "Distributed execution is not supported with " - "the OpenVINO backend.") - from vllm.executor.openvino_executor import OpenVINOExecutorAsync - executor_class = OpenVINOExecutorAsync - elif engine_config.device_config.device_type == "xpu": - if distributed_executor_backend is None: - from vllm.executor.xpu_executor import XPUExecutorAsync - executor_class = XPUExecutorAsync - elif distributed_executor_backend == "ray": - from vllm.executor.ray_xpu_executor import RayXPUExecutorAsync - executor_class = RayXPUExecutorAsync - elif distributed_executor_backend == "mp": - from vllm.executor.multiproc_xpu_executor import ( - MultiprocessingXPUExecutorAsync) - executor_class = MultiprocessingXPUExecutorAsync - else: - raise RuntimeError( - "Not supported distributed execution model on XPU device.") - elif distributed_executor_backend == "ray": - from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync - executor_class = RayGPUExecutorAsync - elif distributed_executor_backend == "mp": - from vllm.executor.multiproc_gpu_executor import ( - MultiprocessingGPUExecutorAsync) - executor_class = MultiprocessingGPUExecutorAsync - else: - from vllm.executor.gpu_executor import GPUExecutorAsync - executor_class = GPUExecutorAsync - return executor_class + def _get_executor_cls(cls, + engine_config: VllmConfig) -> Type[ExecutorBase]: + return LLMEngine._get_executor_cls(engine_config) @classmethod def from_engine_args( @@ -700,9 +638,6 @@ class AsyncLLMEngine(EngineClient): executor_class = cls._get_executor_cls(engine_config) - if executor_class.uses_ray: - initialize_ray_cluster(engine_config.parallel_config) - # Create the async LLM engine. 
engine = cls( vllm_config=engine_config, @@ -1242,20 +1177,13 @@ class AsyncLLMEngine(EngineClient): self.engine.remove_logger(logger_name=logger_name) async def start_profile(self) -> None: - # using type instead of isinstance to check to avoid capturing - # inherited classes - if type(self.engine.model_executor) == GPUExecutorAsync: # noqa: E721 - self.engine.model_executor.start_profile() - else: - self.engine.model_executor._run_workers("start_profile") + self.engine.start_profile() async def stop_profile(self) -> None: - # using type instead of isinstance to check to avoid capturing - # inherited classes - if type(self.engine.model_executor) == GPUExecutorAsync: # noqa: E721 - self.engine.model_executor.stop_profile() - else: - self.engine.model_executor._run_workers("stop_profile") + self.engine.stop_profile() + + async def add_lora(self, lora_request: LoRARequest) -> None: + self.engine.add_lora(lora_request) # TODO(v1): Remove this class proxy when V1 goes default. diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 1db3e59ff3bae..6a6b4a14a4c49 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -28,8 +28,6 @@ from vllm.engine.output_processor.util import create_output_by_sequence_group from vllm.entrypoints.openai.logits_processors import ( get_logits_processors as get_openai_logits_processors) from vllm.executor.executor_base import ExecutorBase -from vllm.executor.gpu_executor import GPUExecutor -from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, PromptType, SingletonInputsAdapter) from vllm.inputs.parse import is_encoder_decoder_inputs, is_token_prompt @@ -442,64 +440,31 @@ class LLMEngine: raise TypeError( "distributed_executor_backend must be a subclass of " f"ExecutorBase. 
Got {distributed_executor_backend}.") - if distributed_executor_backend.uses_ray: # type: ignore - initialize_ray_cluster(engine_config.parallel_config) executor_class = distributed_executor_backend - elif engine_config.device_config.device_type == "neuron": - from vllm.executor.neuron_executor import NeuronExecutor - executor_class = NeuronExecutor - elif engine_config.device_config.device_type == "tpu": + elif engine_config.parallel_config.world_size > 1: if distributed_executor_backend == "ray": - initialize_ray_cluster(engine_config.parallel_config) - from vllm.executor.ray_tpu_executor import RayTPUExecutor - executor_class = RayTPUExecutor - else: - assert distributed_executor_backend is None - from vllm.executor.tpu_executor import TPUExecutor - executor_class = TPUExecutor - elif engine_config.device_config.device_type == "cpu": - from vllm.executor.cpu_executor import CPUExecutor - executor_class = CPUExecutor - elif engine_config.device_config.device_type == "hpu": - if distributed_executor_backend == "ray": - initialize_ray_cluster(engine_config.parallel_config) - from vllm.executor.ray_hpu_executor import RayHPUExecutor - executor_class = RayHPUExecutor - else: - from vllm.executor.hpu_executor import HPUExecutor - executor_class = HPUExecutor - elif engine_config.device_config.device_type == "openvino": - from vllm.executor.openvino_executor import OpenVINOExecutor - executor_class = OpenVINOExecutor - elif engine_config.device_config.device_type == "xpu": - if distributed_executor_backend == "ray": - initialize_ray_cluster(engine_config.parallel_config) - from vllm.executor.ray_xpu_executor import RayXPUExecutor - executor_class = RayXPUExecutor + from vllm.executor.ray_distributed_executor import ( + RayDistributedExecutor) + executor_class = RayDistributedExecutor elif distributed_executor_backend == "mp": - # FIXME(kunshang): - # spawn needs calling `if __name__ == '__main__':`` - # fork is not supported for xpu start new process. - logger.error( - "Both start methods (spawn and fork) have issue " - "on XPU if you use mp backend, Please try ray instead.") - else: - from vllm.executor.xpu_executor import XPUExecutor - executor_class = XPUExecutor - elif distributed_executor_backend == "ray": - initialize_ray_cluster(engine_config.parallel_config) - from vllm.executor.ray_gpu_executor import RayGPUExecutor - executor_class = RayGPUExecutor - elif distributed_executor_backend == "mp": - from vllm.executor.multiproc_gpu_executor import ( - MultiprocessingGPUExecutor) - assert not envs.VLLM_USE_RAY_SPMD_WORKER, ( - "multiprocessing distributed executor backend does not " - "support VLLM_USE_RAY_SPMD_WORKER=1") - executor_class = MultiprocessingGPUExecutor + from vllm.executor.mp_distributed_executor import ( + MultiprocessingDistributedExecutor) + assert not envs.VLLM_USE_RAY_SPMD_WORKER, ( + "multiprocessing distributed executor backend does not " + "support VLLM_USE_RAY_SPMD_WORKER=1") + executor_class = MultiprocessingDistributedExecutor + elif distributed_executor_backend == "uni": + # JAX-style, single-process, multi-device executor. 
+ from vllm.executor.uniproc_executor import UniProcExecutor + executor_class = UniProcExecutor + elif distributed_executor_backend == "external_launcher": + # executor with external launcher + from vllm.executor.uniproc_executor import ( # noqa + ExecutorWithExternalLauncher) + executor_class = ExecutorWithExternalLauncher else: - from vllm.executor.gpu_executor import GPUExecutor - executor_class = GPUExecutor + from vllm.executor.uniproc_executor import UniProcExecutor + executor_class = UniProcExecutor return executor_class @classmethod @@ -724,7 +689,9 @@ class LLMEngine: :class:`~vllm.PoolingParams` for pooling. arrival_time: The arrival time of the request. If None, we use the current monotonic time. + lora_request: The LoRA request to add. trace_headers: OpenTelemetry trace headers. + prompt_adapter_request: The prompt adapter request to add. priority: The priority of the request. Only applicable with priority scheduling. @@ -1845,27 +1812,17 @@ class LLMEngine: def list_prompt_adapters(self) -> List[int]: return self.model_executor.list_prompt_adapters() + def start_profile(self) -> None: + self.model_executor.start_profile() + + def stop_profile(self) -> None: + self.model_executor.stop_profile() + def check_health(self) -> None: if self.tokenizer: self.tokenizer.check_health() self.model_executor.check_health() - def start_profile(self) -> None: - # using type instead of isinstance to check to avoid capturing - # inherited classes (MultiprocessingGPUExecutor) - if type(self.model_executor) == GPUExecutor: # noqa: E721 - self.model_executor.start_profile() - else: - self.model_executor._run_workers("start_profile") - - def stop_profile(self) -> None: - # using type instead of isinstance to check to avoid capturing - # inherited classes (MultiprocessingGPUExecutor) - if type(self.model_executor) == GPUExecutor: # noqa: E721 - self.model_executor.stop_profile() - else: - self.model_executor._run_workers("stop_profile") - def is_tracing_enabled(self) -> bool: return self.tracer is not None diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py index 420f540d0b5f4..7132f9840001a 100644 --- a/vllm/engine/multiprocessing/__init__.py +++ b/vllm/engine/multiprocessing/__init__.py @@ -1,4 +1,5 @@ -from dataclasses import dataclass +import uuid +from dataclasses import dataclass, field from enum import Enum from typing import List, Mapping, Optional, Union, overload @@ -120,10 +121,23 @@ class RPCUProfileRequest(Enum): STOP_PROFILE = 2 -RPC_REQUEST_T = Union[RPCProcessRequest, RPCAbortRequest, RPCStartupRequest, - RPCUProfileRequest] +@dataclass +class RPCLoadAdapterRequest: + lora_request: LoRARequest + # Set the default value of request_id to a new UUID + request_id: str = field(default_factory=lambda: str(uuid.uuid4())) -REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCError] + +@dataclass +class RPCAdapterLoadedResponse: + request_id: str + + +RPC_REQUEST_T = Union[RPCProcessRequest, RPCAbortRequest, RPCStartupRequest, + RPCUProfileRequest, RPCLoadAdapterRequest] + +REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCAdapterLoadedResponse, + RPCError] def ENGINE_DEAD_ERROR( diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 0a046c71e86e8..a9ab899535180 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -25,8 +25,10 @@ from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT, IPC_HEALTH_EXT, IPC_INPUT_EXT, IPC_OUTPUT_EXT, RPC_REQUEST_T, 
VLLM_RPC_SUCCESS_STR, RPCAbortRequest, - RPCError, RPCProcessRequest, - RPCStartupRequest, RPCStartupResponse, + RPCAdapterLoadedResponse, RPCError, + RPCLoadAdapterRequest, + RPCProcessRequest, RPCStartupRequest, + RPCStartupResponse, RPCUProfileRequest) from vllm.engine.protocol import EngineClient # yapf: enable @@ -240,17 +242,22 @@ class MQLLMEngineClient(EngineClient): queue = self.output_queues.get(request_id) if queue is not None: queue.put_nowait(exception) + # Put each output into the appropriate queue. + elif isinstance(request_outputs, RPCAdapterLoadedResponse): + self._add_output(request_outputs) else: - # Put each output into the appropriate steam. for request_output in request_outputs: - queue = self.output_queues.get( - request_output.request_id) - if queue is not None: - queue.put_nowait(request_output) + self._add_output(request_output) except asyncio.CancelledError: logger.debug("Shutting down MQLLMEngineClient output handler.") + def _add_output(self, request_output: Union[RequestOutput, + RPCAdapterLoadedResponse]): + queue = self.output_queues.get(request_output.request_id) + if queue is not None: + queue.put_nowait(request_output) + async def setup(self): """Setup the client before it starts sending server requests.""" @@ -659,3 +666,24 @@ class MQLLMEngineClient(EngineClient): await self._send_one_way_rpc_request( request=RPCUProfileRequest.STOP_PROFILE, socket=self.input_socket) + + async def add_lora(self, lora_request: LoRARequest) -> None: + """Load a new LoRA adapter into the engine for future requests.""" + # Uses the same I/O as generate requests + request = RPCLoadAdapterRequest(lora_request) + + # Create output queue for this request. + queue: asyncio.Queue[Union[None, BaseException]] = asyncio.Queue() + self.output_queues[request.request_id] = queue + + # Send the request + request_bytes = pickle.dumps(request) + await self.input_socket.send_multipart((request_bytes, ), copy=False) + + # Wait for the response + request_output = await queue.get() + self.output_queues.pop(request.request_id) + + # Raise on error, otherwise happily return None + if isinstance(request_output, BaseException): + raise request_output diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 49a90b321dac4..3aa9d30549f36 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -14,11 +14,12 @@ from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT, IPC_HEALTH_EXT, IPC_INPUT_EXT, IPC_OUTPUT_EXT, REQUEST_OUTPUTS_T, VLLM_RPC_SUCCESS_STR, RPCAbortRequest, - RPCError, RPCProcessRequest, - RPCStartupRequest, RPCStartupResponse, + RPCAdapterLoadedResponse, RPCError, + RPCLoadAdapterRequest, + RPCProcessRequest, RPCStartupRequest, + RPCStartupResponse, RPCUProfileRequest) # yapf: enable -from vllm.executor.gpu_executor import GPUExecutor from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.usage.usage_lib import UsageContext @@ -234,6 +235,8 @@ class MQLLMEngine: self.start_profile() else: self.stop_profile() + elif isinstance(request, RPCLoadAdapterRequest): + self._handle_load_adapter_request(request) else: raise ValueError("Unknown RPCRequest Type: " f"{type(request)}") @@ -284,6 +287,20 @@ class MQLLMEngine: if self.log_requests: logger.info("Aborted request %s.", request.request_id) + def _handle_load_adapter_request(self, request: RPCLoadAdapterRequest): + try: + self.engine.add_lora(request.lora_request) + except BaseException as e: + # Send back an
error if the adapter fails to load + rpc_err = RPCError(request_id=request.request_id, + is_engine_errored=False, + exception=e) + self._send_outputs(rpc_err) + return + # Otherwise, send back the successful load message + self._send_outputs( + RPCAdapterLoadedResponse(request_id=request.request_id)) + def _health_check(self): # Send unhealthy if engine has already errored if self._errored_with is not None: @@ -296,7 +313,11 @@ class MQLLMEngine: self._send_unhealthy(e) def _send_outputs(self, outputs: REQUEST_OUTPUTS_T): - """Send List of RequestOutput to RPCClient.""" + """Send outputs back to the engine client. These can be: + - Exceptions + - A list of generation outputs + - A response from loading a lora adapter + """ if outputs: try: from ray.exceptions import RayTaskError @@ -335,16 +356,10 @@ class MQLLMEngine: self._errored_with = e def start_profile(self) -> None: - if type(self.engine.model_executor) is GPUExecutor: - self.engine.model_executor.start_profile() - else: - self.engine.model_executor._run_workers("start_profile") + self.engine.start_profile() def stop_profile(self) -> None: - if type(self.engine.model_executor) is GPUExecutor: - self.engine.model_executor.stop_profile() - else: - self.engine.model_executor._run_workers("stop_profile") + self.engine.stop_profile() def signal_handler(*_) -> None: diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index da3185f33dbe9..55c56abea0da3 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -102,9 +102,9 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor): Args: seq_group: the output is associated with this :class:`SequenceGroup` - output: the :class:`SequenceGroupOutput` for a single scheduler step + outputs: the :class:`SequenceGroupOutput` for a single scheduler step """ - assert len(outputs) == 1, ("Single step should only has 1 output.") + assert len(outputs) == 1, "Single step should only have 1 output." output = outputs[0] assert isinstance(output, CompletionSequenceGroupOutput) single_step_process_prompt_logprob(self, seq_group, output) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index a066836b92708..f05ff62c4766b 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -270,3 +270,8 @@ class EngineClient(ABC): async def stop_profile(self) -> None: """Start profiling the engine""" ... + + @abstractmethod + async def add_lora(self, lora_request: LoRARequest) -> None: + """Load a new LoRA adapter into the engine for future requests.""" + ...
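As a reviewer aid, here is a minimal sketch (not part of the patch) of how the new `add_lora` entry point on `EngineClient` is meant to be exercised; the adapter name and path below are hypothetical placeholders:

from vllm.lora.request import LoRARequest

async def preload_adapter(client):
    # `client` is any EngineClient implementation, e.g. MQLLMEngineClient.
    # The request is pickled onto the same input socket as generate requests;
    # the engine answers with RPCAdapterLoadedResponse on success, or the
    # engine-side exception is re-raised here.
    await client.add_lora(
        LoRARequest(lora_name="sql-lora",            # hypothetical name
                    lora_int_id=1,
                    lora_path="/path/to/sql-lora"))  # hypothetical path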
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 923c7459f6948..beedf5d16ab86 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -403,8 +403,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): if model_type.startswith("llava"): return self._cached_token_str(self._tokenizer, hf_config.image_token_index) - if model_type in ("chameleon", "internvl_chat", "NVLM_D", - "h2ovl_chat"): + if model_type in ("chameleon", "deepseek_vl_v2", "internvl_chat", + "NVLM_D", "h2ovl_chat"): return "<image>" if model_type == "mllama": return "<|image|>" diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index acb4db85632a8..27386daa4bbc9 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1,11 +1,13 @@ import itertools import warnings from contextlib import contextmanager -from typing import (Any, ClassVar, Dict, List, Optional, Sequence, Tuple, Type, - Union, cast, overload) +from typing import (Any, Callable, ClassVar, Dict, List, Optional, Sequence, + Tuple, Type, Union, cast, overload) +import cloudpickle +import torch.nn as nn from tqdm import tqdm -from typing_extensions import deprecated +from typing_extensions import TypeVar, deprecated from vllm import envs from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, @@ -41,6 +43,8 @@ from vllm.utils import Counter, deprecate_args, deprecate_kwargs, is_list_of logger = init_logger(__name__) +_R = TypeVar("_R", default=Any) + class LLM: """An LLM for generating texts from given prompts and sampling parameters. @@ -186,6 +190,13 @@ class LLM: if "disable_log_stats" not in kwargs: kwargs["disable_log_stats"] = True + if "worker_cls" in kwargs: + worker_cls = kwargs["worker_cls"] + # if the worker_cls is not a qualified string name, + # we serialize it using cloudpickle to avoid pickling issues + if isinstance(worker_cls, type): + kwargs["worker_cls"] = cloudpickle.dumps(worker_cls) + if compilation_config is not None: if isinstance(compilation_config, (int, dict)): compilation_config_instance = CompilationConfig.from_cli( @@ -455,6 +466,44 @@ class LLM: outputs = self._run_engine(use_tqdm=use_tqdm) return self.engine_class.validate_outputs(outputs, RequestOutput) + def collective_rpc(self, + method: Union[str, Callable[..., _R]], + timeout: Optional[float] = None, + args: Tuple = (), + kwargs: Optional[Dict[str, Any]] = None) -> List[_R]: + """ + Execute an RPC call on all workers. + + Args: + method: Name of the worker method to execute, or a callable that + is serialized and sent to all workers to execute. + + If the method is a callable, it should accept an additional + `self` argument, in addition to the arguments passed in `args` + and `kwargs`. The `self` argument will be the worker object. + timeout: Maximum time in seconds to wait for execution. Raises a + :exc:`TimeoutError` on timeout. `None` means wait indefinitely. + args: Positional arguments to pass to the worker method. + kwargs: Keyword arguments to pass to the worker method. + + Returns: + A list containing the results from each worker. + + Note: + It is recommended to use this API to only pass control messages, + and set up data-plane communication to pass data. + """ + executor = self.llm_engine.model_executor + return executor.collective_rpc(method, timeout, args, kwargs) + + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + """ + Run a function directly on the model inside each worker, + returning the result for each of them.
+ """ + executor = self.llm_engine.model_executor + return executor.apply_model(func) + def beam_search( self, prompts: List[Union[TokensPrompt, TextPrompt]], diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index bc1471e1f534d..1aeefe86cd05e 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -14,7 +14,7 @@ from argparse import Namespace from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus -from typing import AsyncIterator, Optional, Set, Tuple +from typing import AsyncIterator, Dict, Optional, Set, Tuple, Union import uvloop from fastapi import APIRouter, FastAPI, HTTPException, Request @@ -420,6 +420,8 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): "use the Pooling API (`/pooling`) instead.") res = await fallback_handler.create_pooling(request, raw_request) + + generator: Union[ErrorResponse, EmbeddingResponse] if isinstance(res, PoolingResponse): generator = EmbeddingResponse( id=res.id, @@ -494,7 +496,7 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): return await create_score(request, raw_request) -TASK_HANDLERS = { +TASK_HANDLERS: Dict[str, Dict[str, tuple]] = { "generate": { "messages": (ChatCompletionRequest, create_chat_completion), "default": (CompletionRequest, create_completion), @@ -652,7 +654,7 @@ def build_app(args: Namespace) -> FastAPI: module_path, object_name = middleware.rsplit(".", 1) imported = getattr(importlib.import_module(module_path), object_name) if inspect.isclass(imported): - app.add_middleware(imported) + app.add_middleware(imported) # type: ignore[arg-type] elif inspect.iscoroutinefunction(imported): app.middleware("http")(imported) else: @@ -662,7 +664,7 @@ def build_app(args: Namespace) -> FastAPI: return app -def init_app_state( +async def init_app_state( engine_client: EngineClient, model_config: ModelConfig, state: State, @@ -690,12 +692,13 @@ def init_app_state( logger.info("Using supplied chat template:\n%s", resolved_chat_template) state.openai_serving_models = OpenAIServingModels( + engine_client=engine_client, model_config=model_config, base_model_paths=base_model_paths, lora_modules=args.lora_modules, prompt_adapters=args.prompt_adapters, ) - # TODO: The chat template is now broken for lora adapters :( + await state.openai_serving_models.init_static_loras() state.openai_serving_chat = OpenAIServingChat( engine_client, model_config, @@ -794,7 +797,7 @@ async def run_server(args, **uvicorn_kwargs) -> None: app = build_app(args) model_config = await engine_client.get_model_config() - init_app_state(engine_client, model_config, app.state, args) + await init_app_state(engine_client, model_config, app.state, args) shutdown_task = await serve_http( app, diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 22206ef8dbfe6..35445449463e9 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -79,29 +79,29 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument("--host", type=nullable_str, default=None, - help="host name") - parser.add_argument("--port", type=int, default=8000, help="port number") + help="Host name.") + parser.add_argument("--port", type=int, default=8000, help="Port number.") parser.add_argument( "--uvicorn-log-level", type=str, default="info", choices=['debug', 'info', 'warning', 'error', 'critical', 'trace'], - help="log 
level for uvicorn") + help="Log level for uvicorn.") parser.add_argument("--allow-credentials", action="store_true", - help="allow credentials") + help="Allow credentials.") parser.add_argument("--allowed-origins", type=json.loads, default=["*"], - help="allowed origins") + help="Allowed origins.") parser.add_argument("--allowed-methods", type=json.loads, default=["*"], - help="allowed methods") + help="Allowed methods.") parser.add_argument("--allowed-headers", type=json.loads, default=["*"], - help="allowed headers") + help="Allowed headers.") parser.add_argument("--api-key", type=nullable_str, default=None, @@ -115,10 +115,10 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: action=LoRAParserAction, help="LoRA module configurations in either 'name=path' format" "or JSON format. " - "Example (old format): 'name=path' " + "Example (old format): ``'name=path'`` " "Example (new format): " - "'{\"name\": \"name\", \"local_path\": \"path\", " - "\"base_model_name\": \"id\"}'") + "``{\"name\": \"name\", \"local_path\": \"path\", " + "\"base_model_name\": \"id\"}``") parser.add_argument( "--prompt-adapters", type=nullable_str, @@ -132,7 +132,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=None, help="The file path to the chat template, " "or the template in single-line form " - "for the specified model") + "for the specified model.") parser.add_argument( '--chat-template-content-format', type=str, @@ -141,38 +141,39 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help='The format to render message content within a chat template.' '\n\n' '* "string" will render the content as a string. ' - 'Example: "Hello World"\n' + 'Example: ``"Hello World"``\n' '* "openai" will render the content as a list of dictionaries, ' 'similar to OpenAI schema. ' - 'Example: [{"type": "text", "text": "Hello world!"}]') + 'Example: ``[{"type": "text", "text": "Hello world!"}]``') parser.add_argument("--response-role", type=nullable_str, default="assistant", help="The role name to return if " - "`request.add_generation_prompt=true`.") + "``request.add_generation_prompt=true``.") parser.add_argument("--ssl-keyfile", type=nullable_str, default=None, - help="The file path to the SSL key file") + help="The file path to the SSL key file.") parser.add_argument("--ssl-certfile", type=nullable_str, default=None, - help="The file path to the SSL cert file") + help="The file path to the SSL cert file.") parser.add_argument("--ssl-ca-certs", type=nullable_str, default=None, - help="The CA certificates file") + help="The CA certificates file.") parser.add_argument( "--ssl-cert-reqs", type=int, default=int(ssl.CERT_NONE), - help="Whether client certificate is required (see stdlib ssl module's)" + help="Whether client certificate is required (see stdlib ssl module's documentation)." ) parser.add_argument( "--root-path", type=nullable_str, default=None, - help="FastAPI root_path when app is behind a path based routing proxy") + help="FastAPI root_path when app is behind a path-based routing proxy." + ) parser.add_argument( "--middleware", type=nullable_str, @@ -182,15 +183,15 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "We accept multiple --middleware arguments. " "The value should be an import path. " "If a function is provided, vLLM will add it to the server " - "using @app.middleware('http'). " + "using ``@app.middleware('http')``.
" "If a class is provided, vLLM will add it to the server " - "using app.add_middleware(). ") + "using ``app.add_middleware()``. ") parser.add_argument( "--return-tokens-as-token-ids", action="store_true", - help="When --max-logprobs is specified, represents single tokens as " - "strings of the form 'token_id:{token_id}' so that tokens that " - "are not JSON-encodable can be identified.") + help="When ``--max-logprobs`` is specified, represents single tokens " + " as strings of the form 'token_id:{token_id}' so that tokens " + "that are not JSON-encodable can be identified.") parser.add_argument( "--disable-frontend-multiprocessing", action="store_true", @@ -205,9 +206,8 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "--enable-auto-tool-choice", action="store_true", default=False, - help= - "Enable auto tool choice for supported models. Use --tool-call-parser" - " to specify which parser to use") + help="Enable auto tool choice for supported models. Use " + "``--tool-call-parser`` to specify which parser to use.") valid_tool_parsers = ToolParserManager.tool_parsers.keys() parser.add_argument( @@ -219,7 +219,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help= "Select the tool call parser depending on the model that you're using." " This is used to parse the model-generated tool call into OpenAI API " - "format. Required for --enable-auto-tool-choice.") + "format. Required for ``--enable-auto-tool-choice``.") parser.add_argument( "--tool-parser-plugin", @@ -228,7 +228,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help= "Special the tool parser plugin write to parse the model-generated tool" " into OpenAI API format, the name register in this plugin can be used " - "in --tool-call-parser.") + "in ``--tool-call-parser``.") parser = AsyncEngineArgs.add_cli_args(parser) @@ -243,7 +243,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "--disable-fastapi-docs", action='store_true', default=False, - help="Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint" + help="Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint." ) parser.add_argument( "--enable-prompt-tokens-details", diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 822c0f5f7c211..f8f136f9d5024 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -215,6 +215,7 @@ async def main(args): # Create the openai serving objects. 
openai_serving_models = OpenAIServingModels( + engine_client=engine, model_config=model_config, base_model_paths=base_model_paths, lora_modules=None, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 88859255f202a..3da447be06430 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -203,15 +203,19 @@ class OpenAIServing: ) -> TextTokensPrompt: token_num = len(input_ids) - # Note: EmbeddingRequest doesn't have max_tokens - if isinstance(request, - (EmbeddingChatRequest, EmbeddingCompletionRequest)): + # Note: EmbeddingRequest and ScoreRequest don't have max_tokens + if isinstance( + request, + (EmbeddingChatRequest, EmbeddingCompletionRequest, ScoreRequest)): + + operation = "score" if isinstance(request, ScoreRequest) \ + else "embedding generation" if token_num > self.max_model_len: raise ValueError( f"This model's maximum context length is " f"{self.max_model_len} tokens. However, you requested " - f"{token_num} tokens in the input for embedding " - f"generation. Please reduce the length of the input.") + f"{token_num} tokens in the input for {operation}. " + f"Please reduce the length of the input.") return TextTokensPrompt(prompt=input_text, prompt_token_ids=input_ids) diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index 26966896bc272..fc422f0917bd5 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -5,15 +5,19 @@ from http import HTTPStatus from typing import List, Optional, Union from vllm.config import ModelConfig +from vllm.engine.protocol import EngineClient from vllm.entrypoints.openai.protocol import (ErrorResponse, LoadLoraAdapterRequest, ModelCard, ModelList, ModelPermission, UnloadLoraAdapterRequest) +from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.utils import AtomicCounter +logger = init_logger(__name__) + @dataclass class BaseModelPath: @@ -45,6 +49,7 @@ class OpenAIServingModels: def __init__( self, + engine_client: EngineClient, model_config: ModelConfig, base_model_paths: List[BaseModelPath], *, @@ -55,20 +60,11 @@ class OpenAIServingModels: self.base_model_paths = base_model_paths self.max_model_len = model_config.max_model_len + self.engine_client = engine_client + self.static_lora_modules = lora_modules + self.lora_requests: List[LoRARequest] = [] self.lora_id_counter = AtomicCounter(0) - self.lora_requests = [] - if lora_modules is not None: - self.lora_requests = [ - LoRARequest(lora_name=lora.name, - lora_int_id=i, - lora_path=lora.path, - base_model_name=lora.base_model_name - if lora.base_model_name - and self.is_base_model(lora.base_model_name) else - self.base_model_paths[0].name) - for i, lora in enumerate(lora_modules, start=1) - ] self.prompt_adapter_requests = [] if prompt_adapters is not None: @@ -84,6 +80,19 @@ class OpenAIServingModels: prompt_adapter_local_path=prompt_adapter.local_path, prompt_adapter_num_virtual_tokens=num_virtual_tokens)) + async def init_static_loras(self): + """Loads all static LoRA modules.
+ Raises if any fail to load""" + if self.static_lora_modules is None: + return + for lora in self.static_lora_modules: + load_request = LoadLoraAdapterRequest(lora_path=lora.path, + lora_name=lora.name) + load_result = await self.load_lora_adapter( + request=load_request, base_model_name=lora.base_model_name) + if isinstance(load_result, ErrorResponse): + raise ValueError(load_result.message) + def is_base_model(self, model_name): return any(model.name == model_name for model in self.base_model_paths) @@ -129,17 +138,39 @@ class OpenAIServingModels: async def load_lora_adapter( self, - request: LoadLoraAdapterRequest) -> Union[ErrorResponse, str]: + request: LoadLoraAdapterRequest, + base_model_name: Optional[str] = None + ) -> Union[ErrorResponse, str]: error_check_ret = await self._check_load_lora_adapter_request(request) if error_check_ret is not None: return error_check_ret lora_name, lora_path = request.lora_name, request.lora_path unique_id = self.lora_id_counter.inc(1) - self.lora_requests.append( - LoRARequest(lora_name=lora_name, - lora_int_id=unique_id, - lora_path=lora_path)) + lora_request = LoRARequest(lora_name=lora_name, + lora_int_id=unique_id, + lora_path=lora_path) + if base_model_name is not None and self.is_base_model(base_model_name): + lora_request.base_model_name = base_model_name + + # Validate that the adapter can be loaded into the engine + # This will also pre-load it for incoming requests + try: + await self.engine_client.add_lora(lora_request) + except BaseException as e: + error_type = "BadRequestError" + status_code = HTTPStatus.BAD_REQUEST + if isinstance(e, ValueError) and "No adapter found" in str(e): + error_type = "NotFoundError" + status_code = HTTPStatus.NOT_FOUND + + return create_error_response(message=str(e), + err_type=error_type, + status_code=status_code) + + self.lora_requests.append(lora_request) + logger.info("Loaded new LoRA adapter: name '%s', path '%s'", lora_name, + lora_path) return f"Success: LoRA adapter '{lora_name}' added successfully." async def unload_lora_adapter( @@ -155,6 +186,7 @@ class OpenAIServingModels: lora_request for lora_request in self.lora_requests if lora_request.lora_name != lora_name ] + logger.info("Removed LoRA adapter: name '%s'", lora_name) return f"Success: LoRA adapter '{lora_name}' removed successfully." async def _check_load_lora_adapter_request( @@ -195,8 +227,8 @@ class OpenAIServingModels: return create_error_response( message= f"The lora adapter '{request.lora_name}' cannot be found.", - err_type="InvalidUserInput", - status_code=HTTPStatus.BAD_REQUEST) + err_type="NotFoundError", + status_code=HTTPStatus.NOT_FOUND) return None diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 5d3e7139d7a17..381edf8fac49e 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -101,6 +101,38 @@ class OpenAIServingScores(OpenAIServing): if not self.model_config.is_cross_encoder: raise ValueError("Model is not cross encoder.") + if truncate_prompt_tokens is not None and \ + truncate_prompt_tokens > self.max_model_len: + raise ValueError( + f"truncate_prompt_tokens value ({truncate_prompt_tokens}) " + f"is greater than max_model_len ({self.max_model_len})." 
+ f" Please, select a smaller truncation size.") + + input_pairs = make_pairs(request.text_1, request.text_2) + for q, t in input_pairs: + request_prompt = f"{q}{tokenizer.sep_token}{t}" + + tokenization_kwargs: Dict[str, Any] = {} + if truncate_prompt_tokens is not None: + tokenization_kwargs["truncation"] = True + tokenization_kwargs["max_length"] = truncate_prompt_tokens + + tokenize_async = make_async(tokenizer.__call__, + executor=self._tokenizer_executor) + prompt_inputs = await tokenize_async(text=q, + text_pair=t, + **tokenization_kwargs) + + input_ids = prompt_inputs["input_ids"] + text_token_prompt = \ + self._validate_input(request, input_ids, request_prompt) + engine_prompt = TokensPrompt( + prompt_token_ids=text_token_prompt["prompt_token_ids"], + token_type_ids=prompt_inputs.get("token_type_ids")) + + request_prompts.append(request_prompt) + engine_prompts.append(engine_prompt) + except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -108,28 +140,6 @@ class OpenAIServingScores(OpenAIServing): # Schedule the request and get the result generator. generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] - input_pairs = make_pairs(request.text_1, request.text_2) - - for q, t in input_pairs: - request_prompt = f"{q}{tokenizer.sep_token}{t}" - - tokenization_kwargs: Dict[str, Any] = {} - if truncate_prompt_tokens is not None: - tokenization_kwargs["truncation"] = True - tokenization_kwargs["max_length"] = truncate_prompt_tokens - - tokenize_async = make_async(tokenizer.__call__, - executor=self._tokenizer_executor) - prompt_inputs = await tokenize_async(text=q, - text_pair=t, - **tokenization_kwargs) - engine_prompt = TokensPrompt( - prompt_token_ids=prompt_inputs["input_ids"], - token_type_ids=prompt_inputs.get("token_type_ids")) - - request_prompts.append(request_prompt) - engine_prompts.append(engine_prompt) - try: pooling_params = request.to_pooling_params() diff --git a/vllm/envs.py b/vllm/envs.py index c4a568c680db0..b7b597ea15af3 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -45,7 +45,7 @@ if TYPE_CHECKING: VLLM_USE_RAY_SPMD_WORKER: bool = False VLLM_USE_RAY_COMPILED_DAG: bool = False VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: bool = True - VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = True + VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False VLLM_WORKER_MULTIPROC_METHOD: str = "fork" VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets") VLLM_IMAGE_FETCH_TIMEOUT: int = 5 @@ -340,11 +340,11 @@ environment_variables: Dict[str, Callable[[], Any]] = { lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL", "1")) ), - # If the env var is set, it enables GPU communication overlap in - # Ray's compiled DAG. This flag is ignored if + # If the env var is set, it enables GPU communication overlap + # (experimental feature) in Ray's compiled DAG. This flag is ignored if # VLLM_USE_RAY_COMPILED_DAG is not set. "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM": - lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "1")) + lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "0")) ), # Use dedicated multiprocess context for workers. 
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py deleted file mode 100644 index c7f018d9a203e..0000000000000 --- a/vllm/executor/cpu_executor.py +++ /dev/null @@ -1,302 +0,0 @@ -import os -from functools import partial -from typing import Any, Awaitable, List, Optional, Set, Tuple, Union - -from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase -from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, - ResultHandler, WorkerMonitor) -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sequence import ExecuteModelRequest -from vllm.utils import get_distributed_init_method, get_open_port, make_async -from vllm.worker.worker_base import WorkerWrapperBase - -logger = init_logger(__name__) - - -class CPUExecutor(ExecutorBase): - - uses_ray: bool = False - - def _init_executor(self) -> None: - assert self.device_config.device_type == "cpu" - # Reminder: Please update docs/source/features/compatibility_matrix.md - # If the feature combo become valid - assert self.lora_config is None, "cpu backend doesn't support LoRA" - - # - # Environment variables for CPU executor - # - - # Disable torch async compiling which won't work with daemonic processes - os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" - - # Intel OpenMP setting - ld_prealod_str = os.getenv("LD_PRELOAD", "") - if "libiomp5.so" in ld_prealod_str: - # The time(milliseconds) that a thread should wait after - # completing the execution of a parallel region, before sleeping. - os.environ['KMP_BLOCKTIME'] = "1" - # Prevents the CPU to run into low performance state - os.environ['KMP_TPAUSE'] = "0" - # Provides fine granularity parallelism - os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist" - os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist" - os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist" - - # To hint IPEX uses shared memory based AllReduce - os.environ["LOCAL_WORLD_SIZE"] = str( - self.parallel_config.tensor_parallel_size) - - # Multiprocessing-based executor does not support multi-node setting. - # Since it only works for single node, we can use the loopback address - # 127.0.0.1 for communication. 
- ip = "127.0.0.1" - port = get_open_port() - self.distributed_init_method = get_distributed_init_method(ip, port) - - is_async = isinstance(self, CPUExecutorAsync) - - world_size = self.parallel_config.tensor_parallel_size - result_handler = ResultHandler() - self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None - self.workers = [] - - if is_async: - self.workers = [ - ProcessWorkerWrapper( - result_handler, - partial( - self._create_worker, - rank=rank, - local_rank=rank, - )) for rank in range(0, world_size) - ] - self.driver_worker = self.workers[0] - self.workers = self.workers[1:] - self.driver_method_invoker = _async_driver_method_invoker - else: - self.driver_worker = self._create_worker() - self.driver_method_invoker = _driver_method_invoker - - if world_size != 1: - self.workers = [ - ProcessWorkerWrapper( - result_handler, - partial( - self._create_worker, - rank=rank, - local_rank=rank, - )) for rank in range(1, world_size) - ] - - self.worker_monitor = None - if world_size != 1 or is_async: - if is_async: - async_worker_list = self.workers + [self.driver_worker] - else: - async_worker_list = self.workers - self.worker_monitor = WorkerMonitor(async_worker_list, - result_handler) - result_handler.start() - self.worker_monitor.start() - - self._run_workers("init_device") - self._run_workers("load_model") - - def _create_worker( - self, - local_rank: int = 0, - rank: int = 0, - ): - - wrapper = WorkerWrapperBase(vllm_config=self.vllm_config) - - assert self.distributed_init_method is not None - - kwargs = dict( - vllm_config=self.vllm_config, - local_rank=local_rank, - rank=rank, - distributed_init_method=self.distributed_init_method, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=rank == 0, - ) - wrapper.init_worker(**kwargs) - - return wrapper.worker - - def _run_workers( - self, - method: str, - *args, - async_run_remote_workers_only: bool = False, - max_concurrent_workers: Optional[int] = None, - **kwargs, - ) -> Any: - """Runs the given method on all workers. - - Args: - async_run_remote_workers_only: If True the method will be run only - in the remote workers, not the driver worker. It will also be - run asynchronously and return a list of futures rather than - blocking on the results. - """ - - if max_concurrent_workers: - raise NotImplementedError( - "max_concurrent_workers is not supported yet.") - - # Start the workers first. - worker_outputs = [ - worker.execute_method(method, *args, **kwargs) - for worker in self.workers - ] - - if async_run_remote_workers_only: - # Just return futures - return worker_outputs - - driver_worker_output = self.driver_method_invoker( - self.driver_worker, method, *args, **kwargs) - - # Get the results of the workers. - return [driver_worker_output - ] + [output.get() for output in worker_outputs] - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Determine the number of available KV blocks by invoking the - underlying worker. - """ - return self.driver_method_invoker(self.driver_worker, - "determine_num_available_blocks") - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - """Initialize the KV cache by invoking the underlying worker. - """ - # NOTE: We log here to avoid multiple logs when number of workers is - # greater than one. We could log in the engine, but not all executors - # have GPUs. - # NOTE: `cpu block` for CPU backend is located on CPU memory but is - # referred as `gpu block`. 
Because we want to reuse the existing block - # management procedure. - logger.info("# CPU blocks: %d", num_gpu_blocks) - - self._run_workers("initialize_cache", - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=num_cpu_blocks) - - def execute_model( - self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - if (self.parallel_config.tensor_parallel_size > 1 - and self.parallel_worker_tasks is None): - self.parallel_worker_tasks = self._run_workers( - "start_worker_execution_loop", - async_run_remote_workers_only=True, - ) - output = self.driver_method_invoker(self.driver_worker, - "execute_model", execute_model_req) - return output - - def stop_remote_worker_execution_loop(self) -> None: - if self.parallel_worker_tasks is None: - return - """ - Passing None will cause the driver to stop the model execution - loop running in each of the remote workers. - """ - self.driver_method_invoker(self.driver_worker, "execute_model", None) - parallel_worker_tasks = self.parallel_worker_tasks - self.parallel_worker_tasks = None - # Ensure that workers exit model loop cleanly - # (this will raise otherwise) - self._wait_for_tasks_completion(parallel_worker_tasks) - - def add_lora(self, lora_request: LoRARequest) -> bool: - return all(self._run_workers("add_lora", lora_request)) - - def remove_lora(self, lora_id: int) -> bool: - return all(self._run_workers("remove_lora", lora_id)) - - def pin_lora(self, lora_id: int) -> bool: - assert lora_id > 0, "lora_id must be greater than 0." - return all(self._run_workers( - "pin_lora", - lora_id=lora_id, - )) - - def list_loras(self) -> Set[int]: - return self.driver_method_invoker(self.driver_worker, "list_loras") - - def add_prompt_adapter( - self, prompt_adapter_request: PromptAdapterRequest) -> bool: - return all( - self._run_workers( - "add_prompt_adapter", - prompt_adapter_request, - )) - - def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - return all( - self._run_workers( - "remove_prompt_adapter", - prompt_adapter_id, - )) - - def list_prompt_adapters(self) -> Set[int]: - return self.driver_method_invoker(self.driver_worker, - "list_prompt_adapters") - - def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - return all(self._run_workers( - "pin_prompt_adapter", - prompt_adapter_id, - )) - - def check_health(self) -> None: - """Raises an error if engine is unhealthy.""" - if self.worker_monitor is not None and not self.worker_monitor.is_alive( - ): - raise RuntimeError("Worker processes are not running") - - def shutdown(self): - if (worker_monitor := getattr(self, "worker_monitor", - None)) is not None: - worker_monitor.close() - - def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: - """Wait for futures returned from _run_workers() with - async_run_remote_workers_only to complete.""" - for result in parallel_worker_tasks: - result.get() - - def start_profile(self) -> None: - self.driver_method_invoker(self.driver_worker, "start_profile") - - def stop_profile(self) -> None: - self.driver_method_invoker(self.driver_worker, "stop_profile") - - -class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase): - - async def execute_model_async( - self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - output = await make_async(self.execute_model - )(execute_model_req=execute_model_req, ) - return output - - async def check_health_async(self) -> None: - self.check_health() - - -def _driver_method_invoker(driver, method: str, *args, **kwargs): - return getattr(driver, method)(*args, 
**kwargs) - - -def _async_driver_method_invoker(driver, method: str, *args, **kwargs): - return driver.execute_method(method, *args, **kwargs).get() diff --git a/vllm/executor/distributed_gpu_executor.py b/vllm/executor/distributed_gpu_executor.py deleted file mode 100644 index deb7cb1c97ef5..0000000000000 --- a/vllm/executor/distributed_gpu_executor.py +++ /dev/null @@ -1,212 +0,0 @@ -import asyncio -from abc import abstractmethod -from typing import Any, Awaitable, Dict, List, Optional, Set, Tuple, Union - -from vllm.executor.executor_base import ExecutorAsyncBase -from vllm.executor.gpu_executor import GPUExecutor -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest - -logger = init_logger(__name__) - - -class DistributedGPUExecutor(GPUExecutor): - """Abstract superclass of multi-GPU executor implementations.""" - - def __init__(self, *args, **kwargs): - # This is non-None when the execute model loop is running - # in the parallel workers. It's a coroutine in the AsyncLLMEngine case. - self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None - # Updated by implementations that require additional args to be passed - # to the _run_workers execute_model call - self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {} - - super().__init__(*args, **kwargs) - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Determine the number of available KV blocks. - - This invokes `determine_num_available_blocks` on each worker and takes - the min of the results, guaranteeing that the selected cache sizes are - compatible with all workers. - - Returns: - - tuple[num_gpu_blocks, num_cpu_blocks] - """ - # Get the maximum number of blocks that can be allocated on GPU and CPU. - num_blocks = self._run_workers("determine_num_available_blocks", ) - - # Since we use a shared centralized controller, we take the minimum - # number of blocks across all workers to make sure all the memory - # operators can be applied to all workers. - num_gpu_blocks = min(b[0] for b in num_blocks) - num_cpu_blocks = min(b[1] for b in num_blocks) - - return num_gpu_blocks, num_cpu_blocks - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - """Initialize the KV cache in all workers. - """ - - # NOTE: We log here to avoid multiple logs when number of workers is - # greater than one. We could log in the engine, but not all executors - # have GPUs. - logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, - num_cpu_blocks) - max_concurrency = (num_gpu_blocks * self.cache_config.block_size / - self.model_config.max_model_len) - logger.info("Maximum concurrency for %s tokens per request: %.2fx", - self.model_config.max_model_len, max_concurrency) - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - self._run_workers("initialize_cache", - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=num_cpu_blocks) - - def execute_model( - self, - execute_model_req: ExecuteModelRequest, - ) -> List[SamplerOutput]: - if self.parallel_worker_tasks is None: - self.parallel_worker_tasks = self._run_workers( - "start_worker_execution_loop", - async_run_tensor_parallel_workers_only=True, - **self.extra_execute_model_run_workers_kwargs) - - # Only the driver worker returns the sampling results. 
- driver_outputs = self._driver_execute_model(execute_model_req) - assert driver_outputs is not None - return driver_outputs - - def stop_remote_worker_execution_loop(self) -> None: - if self.parallel_worker_tasks is None: - return - - self._driver_execute_model(execute_model_req=None) - parallel_worker_tasks = self.parallel_worker_tasks - self.parallel_worker_tasks = None - # Ensure that workers exit model loop cleanly - # (this will raise otherwise) - self._wait_for_tasks_completion(parallel_worker_tasks) - - def add_lora(self, lora_request: LoRARequest) -> bool: - assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." - return self._run_workers( - "add_lora", - lora_request=lora_request, - ) - - def remove_lora(self, lora_id: int) -> bool: - assert lora_id > 0, "lora_id must be greater than 0." - return self._run_workers( - "remove_lora", - lora_id=lora_id, - ) - - def pin_lora(self, lora_id: int) -> bool: - assert lora_id > 0, "lora_id must be greater than 0." - return self._run_workers( - "pin_lora", - lora_id=lora_id, - ) - - def list_loras(self) -> Set[int]: - return self._run_workers("list_loras") - - def save_sharded_state( - self, - path: str, - pattern: Optional[str] = None, - max_size: Optional[int] = None, - ) -> None: - self._run_workers("save_sharded_state", - path=path, - pattern=pattern, - max_size=max_size) - - @abstractmethod - def _driver_execute_model( - self, execute_model_req: Optional[ExecuteModelRequest] - ) -> Optional[List[SamplerOutput]]: - """Run execute_model in the driver worker. - - Passing None will cause the driver to stop the model execution loop - running in each of the remote workers. In this case, this method - returns None. Otherwise, this method returns the model output. - """ - raise NotImplementedError - - @abstractmethod - def _run_workers( - self, - method: str, - *args, - async_run_tensor_parallel_workers_only: bool = False, - max_concurrent_workers: Optional[int] = None, - **kwargs, - ) -> Any: - """Runs the given method on all workers. - - Args: - async_run_tensor_parallel_workers_only: If True the method will be - run only in the remote TP workers, not the driver worker. - It will also be run asynchronously and return a list of futures - rather than blocking on the results. - """ - raise NotImplementedError - - @abstractmethod - def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: - """Wait for futures returned from _run_workers() with - async_run_remote_workers_only to complete.""" - raise NotImplementedError - - -class DistributedGPUExecutorAsync(DistributedGPUExecutor, ExecutorAsyncBase): - - async def execute_model_async( - self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - if self.parallel_worker_tasks is None: - # Start model execution loop running in the parallel workers - self.parallel_worker_tasks = asyncio.create_task( - self._start_worker_execution_loop()) - - # Only the driver worker returns the sampling results. 
- return await self._driver_execute_model_async(execute_model_req) - - async def stop_remote_worker_execution_loop_async(self) -> None: - if self.parallel_worker_tasks is None: - return - - await self._driver_execute_model_async() - parallel_worker_tasks = self.parallel_worker_tasks - self.parallel_worker_tasks = None - # Ensure that workers exit model loop cleanly - # (this will raise otherwise) - await parallel_worker_tasks - - @abstractmethod - async def _driver_execute_model_async( - self, - execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> List[SamplerOutput]: - """Execute the model asynchronously in the driver worker. - - Passing None will cause the driver to stop the model execution - loop running in each of the remote workers. - """ - raise NotImplementedError - - @abstractmethod - async def _start_worker_execution_loop(self): - """Run execution loop on all workers. It guarantees all workers run - the loop or None of them is running the loop. Loop can be stopped by - `stop_remote_worker_execution_loop`. - The API is idempotent (guarantee only 1 loop run at any moment).""" - raise NotImplementedError diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 9cba189dd57f9..859e105f15d97 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -1,18 +1,31 @@ +import asyncio from abc import ABC, abstractmethod -from typing import List, Optional, Set, Tuple +from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple, + Union) + +import torch.nn as nn +from typing_extensions import TypeVar from vllm.config import VllmConfig +from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.platforms import current_platform from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sequence import ExecuteModelRequest +from vllm.sequence import ExecuteModelRequest, PoolerOutput +from vllm.utils import make_async +from vllm.worker.worker_base import WorkerBase + +logger = init_logger(__name__) + +_R = TypeVar("_R", default=Any) class ExecutorBase(ABC): """Base class for all executors. - An executor is responsible for executing the model on a specific device - type (e.g., CPU, GPU, Neuron, etc.). Or it can be a distributed executor + An executor is responsible for executing the model on one device, + or it can be a distributed executor that can execute the model on multiple devices. """ @@ -37,9 +50,38 @@ class ExecutorBase(ABC): @abstractmethod def _init_executor(self) -> None: - pass + raise NotImplementedError @abstractmethod + def collective_rpc(self, + method: Union[str, Callable[..., _R]], + timeout: Optional[float] = None, + args: Tuple = (), + kwargs: Optional[Dict[str, Any]] = None) -> List[_R]: + """ + Execute an RPC call on all workers. + + Args: + method: Name of the worker method to execute, or a callable that + is serialized and sent to all workers to execute. + + If the method is a callable, it should accept an additional + `self` argument, in addition to the arguments passed in `args` + and `kwargs`. The `self` argument will be the worker object. + timeout: Maximum time in seconds to wait for execution. Raises a + :exc:`TimeoutError` on timeout. `None` means wait indefinitely. + args: Positional arguments to pass to the worker method. + kwargs: Keyword arguments to pass to the worker method. + + Returns: + A list containing the results from each worker. 
+
+        Note:
+            It is recommended to use this API only to pass control messages,
+            and to set up data-plane communication to pass data.
+        """
+        raise NotImplementedError
+
     def determine_num_available_blocks(self) -> Tuple[int, int]:
         """Determine the number of available blocks for the GPU KV cache and
         swappable CPU KV cache.
@@ -53,58 +95,114 @@ class ExecutorBase(ABC):
         num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
         appended to.
         """
-        raise NotImplementedError
+        results = self.collective_rpc("determine_num_available_blocks")
+        num_gpu_blocks = min(r[0] for r in results)
+        num_cpu_blocks = min(r[1] for r in results)
+        return num_gpu_blocks, num_cpu_blocks
 
-    @abstractmethod
-    def initialize_cache(self, num_gpu_blocks: int,
-                         num_cpu_blocks: int) -> None:
-        """Initialize the KV cache with the given size in blocks.
+    def initialize_cache(self, num_gpu_blocks: int,
+                         num_cpu_blocks: int) -> None:
+        """Initialize the KV cache by invoking the underlying worker.
         """
-        raise NotImplementedError
+        # NOTE: This is logged in the executor because there can be >1 worker.
+        logger.info("# %s blocks: %d, # CPU blocks: %d",
+                    current_platform.dispatch_key, num_gpu_blocks,
+                    num_cpu_blocks)
+        max_concurrency = (num_gpu_blocks * self.cache_config.block_size /
+                           self.model_config.max_model_len)
+        logger.info("Maximum concurrency for %s tokens per request: %.2fx",
+                    self.model_config.max_model_len, max_concurrency)
+
+        self.cache_config.num_gpu_blocks = num_gpu_blocks
+        self.cache_config.num_cpu_blocks = num_cpu_blocks
+
+        self.collective_rpc("initialize_cache",
+                            args=(num_gpu_blocks, num_cpu_blocks))
+
+    def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
+        """
+        Run a function directly on the model inside each worker,
+        returning the result for each of them.
+        """
+
+        def rpc_func(worker: WorkerBase) -> _R:
+            return func(worker.get_model())
+
+        return self.collective_rpc(rpc_func)
 
-    @abstractmethod
     def execute_model(
         self, execute_model_req: ExecuteModelRequest
-    ) -> Optional[List[SamplerOutput]]:
-        """Executes at least one model step on the given sequences."""
-        raise NotImplementedError
+    ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]:
+        output = self.collective_rpc("execute_model",
+                                     args=(execute_model_req, ))
+        return output[0]
 
     def stop_remote_worker_execution_loop(self) -> None:
         """Releases parallel workers from model loop."""
         return
 
-    @abstractmethod
     def add_lora(self, lora_request: LoRARequest) -> bool:
-        raise NotImplementedError
+        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
+        return all(self.collective_rpc("add_lora", args=(lora_request, )))
 
-    @abstractmethod
     def remove_lora(self, lora_id: int) -> bool:
-        raise NotImplementedError
+        assert lora_id > 0, "lora_id must be greater than 0."
+        return all(self.collective_rpc("remove_lora", args=(lora_id, )))
 
-    @abstractmethod
     def pin_lora(self, lora_id: int) -> bool:
-        raise NotImplementedError  # type: ignore
+        assert lora_id > 0, "lora_id must be greater than 0."
+        return all(self.collective_rpc("pin_lora", args=(lora_id, )))
 
-    @abstractmethod
     def list_loras(self) -> Set[int]:
-        raise NotImplementedError
+        sets = self.collective_rpc("list_loras")
+        for s in sets:
+            assert s == sets[0], "All workers should have the same LoRAs."
+        return sets[0]
 
-    @abstractmethod
     def add_prompt_adapter(
             self, prompt_adapter_request: PromptAdapterRequest) -> bool:
-        raise NotImplementedError
+        assert prompt_adapter_request.prompt_adapter_id > 0, \
+            "prompt_adapter_id must be greater than 0."
+ return all( + self.collective_rpc("add_prompt_adapter", + args=(prompt_adapter_request, ))) - @abstractmethod def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError + assert prompt_adapter_id > 0, \ + "prompt_adapter_id must be greater than 0." + return all( + self.collective_rpc("remove_prompt_adapter", + args=(prompt_adapter_id, ))) - @abstractmethod def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError # type: ignore + assert prompt_adapter_id > 0, \ + "prompt_adapter_id must be greater than 0." + return all( + self.collective_rpc("pin_prompt_adapter", + args=(prompt_adapter_id, ))) - @abstractmethod def list_prompt_adapters(self) -> Set[int]: - raise NotImplementedError + sets = self.collective_rpc("list_prompt_adapters") + for s in sets: + assert (s == sets[0] + ), "All workers should have the same prompt adapters." + return sets[0] + + def start_profile(self) -> None: + self.collective_rpc("start_profile") + + def stop_profile(self) -> None: + self.collective_rpc("stop_profile") + + def save_sharded_state( + self, + path: str, + pattern: Optional[str] = None, + max_size: Optional[int] = None, + ) -> None: + self.collective_rpc("save_sharded_state", + kwargs=dict(path=path, + pattern=pattern, + max_size=max_size)) @abstractmethod def check_health(self) -> None: @@ -119,15 +217,12 @@ class ExecutorBase(ABC): def __del__(self): self.shutdown() - -class ExecutorAsyncBase(ExecutorBase): - - @abstractmethod async def execute_model_async( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: """Executes one model step on the given sequences.""" - raise NotImplementedError + output = await make_async(self.execute_model)(execute_model_req) + return output async def stop_remote_worker_execution_loop_async(self) -> None: """Releases parallel workers from model loop.""" @@ -137,3 +232,128 @@ class ExecutorAsyncBase(ExecutorBase): """Checks if the executor is healthy. If not, it should raise an exception.""" self.check_health() + + +class DistributedExecutorBase(ExecutorBase): + """Abstract superclass of distributed executor implementations.""" + + def __init__(self, *args, **kwargs): + # This is non-None when the execute model loop is running + # in the parallel workers. It's a coroutine in the AsyncLLMEngine case. + self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None + + super().__init__(*args, **kwargs) + + def execute_model( + self, + execute_model_req: ExecuteModelRequest, + ) -> List[SamplerOutput]: + # TODO: unify into collective_rpc + if self.parallel_worker_tasks is None: + self.parallel_worker_tasks = self._run_workers( + "start_worker_execution_loop", + async_run_tensor_parallel_workers_only=True) + + # Only the driver worker returns the sampling results. + driver_outputs = self._driver_execute_model(execute_model_req) + assert driver_outputs is not None + return driver_outputs + + def stop_remote_worker_execution_loop(self) -> None: + if self.parallel_worker_tasks is None: + return + + self._driver_execute_model(execute_model_req=None) + parallel_worker_tasks = self.parallel_worker_tasks + self.parallel_worker_tasks = None + # Ensure that workers exit model loop cleanly + # (this will raise otherwise) + self._wait_for_tasks_completion(parallel_worker_tasks) + + @abstractmethod + def _driver_execute_model( + self, execute_model_req: Optional[ExecuteModelRequest] + ) -> Optional[List[SamplerOutput]]: + """Run execute_model in the driver worker. 
+
+        Passing None will cause the driver to stop the model execution loop
+        running in each of the remote workers. In this case, this method
+        returns None. Otherwise, this method returns the model output.
+        """
+        raise NotImplementedError
+
+    def collective_rpc(self,
+                       method: Union[str, Callable],
+                       timeout: Optional[float] = None,
+                       args: Tuple = (),
+                       kwargs: Optional[Dict] = None) -> List[Any]:
+        # NOTE: `timeout` is not yet honored here; _run_workers blocks
+        # until all workers have returned.
+        return self._run_workers(method, *args, **(kwargs or {}))
+
+    @abstractmethod
+    def _run_workers(
+        self,
+        method: Union[str, Callable],
+        *args,
+        async_run_tensor_parallel_workers_only: bool = False,
+        max_concurrent_workers: Optional[int] = None,
+        **kwargs,
+    ) -> Any:
+        """Runs the given method on all workers.
+
+        Args:
+            async_run_tensor_parallel_workers_only: If True the method will be
+                run only in the remote TP workers, not the driver worker.
+                It will also be run asynchronously and return a list of futures
+                rather than blocking on the results.
+
+        # TODO: simplify and merge with collective_rpc
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
+        """Wait for futures returned from _run_workers() with
+        async_run_remote_workers_only to complete."""
+        raise NotImplementedError
+
+    async def execute_model_async(
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        if self.parallel_worker_tasks is None:
+            # Start model execution loop running in the parallel workers
+            self.parallel_worker_tasks = asyncio.create_task(
+                self._start_worker_execution_loop())
+
+        # Only the driver worker returns the sampling results.
+        return await self._driver_execute_model_async(execute_model_req)
+
+    async def stop_remote_worker_execution_loop_async(self) -> None:
+        if self.parallel_worker_tasks is None:
+            return
+
+        await self._driver_execute_model_async()
+        parallel_worker_tasks = self.parallel_worker_tasks
+        self.parallel_worker_tasks = None
+        # Ensure that workers exit model loop cleanly
+        # (this will raise otherwise)
+        await parallel_worker_tasks
+
+    @abstractmethod
+    async def _driver_execute_model_async(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None,
+    ) -> List[SamplerOutput]:
+        """Execute the model asynchronously in the driver worker.
+
+        Passing None will cause the driver to stop the model execution
+        loop running in each of the remote workers.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    async def _start_worker_execution_loop(self):
+        """Run execution loop on all workers. It guarantees that either all
+        workers run the loop or none of them does. The loop can be stopped
+        by `stop_remote_worker_execution_loop`.
+        The API is idempotent (at most one loop runs at any moment)."""
+        raise NotImplementedError
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
deleted file mode 100644
index 7fa34456028dd..0000000000000
--- a/vllm/executor/gpu_executor.py
+++ /dev/null
@@ -1,145 +0,0 @@
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
-
-from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.prompt_adapter.request import PromptAdapterRequest
-from vllm.sequence import ExecuteModelRequest, PoolerOutput
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
-                        make_async)
-from vllm.worker.worker_base import WorkerWrapperBase
-
-logger = init_logger(__name__)
-
-
-def create_worker(**kwargs):
-    vllm_config = kwargs.get("vllm_config")
-    wrapper = WorkerWrapperBase(vllm_config=vllm_config)
-    wrapper.init_worker(**kwargs)
-    return wrapper.worker
-
-
-class GPUExecutor(ExecutorBase):
-
-    uses_ray: bool = False
-
-    def _init_executor(self) -> None:
-        """Initialize the worker and load the model.
-        """
-        assert self.parallel_config.world_size == 1, (
-            "GPUExecutor only supports single GPU.")
-
-        self.driver_worker = self._create_worker()
-        self.driver_worker.init_device()
-        self.driver_worker.load_model()
-
-    def _get_worker_kwargs(
-            self,
-            local_rank: int = 0,
-            rank: int = 0,
-            distributed_init_method: Optional[str] = None) -> Dict[str, Any]:
-        """Return worker init args for a given rank."""
-        if distributed_init_method is None:
-            distributed_init_method = get_distributed_init_method(
-                get_ip(), get_open_port())
-        return dict(
-            vllm_config=self.vllm_config,
-            local_rank=local_rank,
-            rank=rank,
-            distributed_init_method=distributed_init_method,
-            is_driver_worker=(not self.parallel_config)
-            or (rank % self.parallel_config.tensor_parallel_size == 0),
-        )
-
-    def _create_worker(self,
-                       local_rank: int = 0,
-                       rank: int = 0,
-                       distributed_init_method: Optional[str] = None):
-        return create_worker(**self._get_worker_kwargs(
-            local_rank=local_rank,
-            rank=rank,
-            distributed_init_method=distributed_init_method))
-
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
-        """Determine the number of available KV blocks by invoking the
-        underlying worker.
-        """
-        return self.driver_worker.determine_num_available_blocks()
-
-    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
-        """Initialize the KV cache by invoking the underlying worker.
-        """
-        # NOTE: This is logged in the executor because there can be >1 worker
-        # with other executors. We could log in the engine level, but work
-        # remains to abstract away the device for non-GPU configurations.
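The `collective_rpc`/`apply_model` surface added to `ExecutorBase` above replaces most of the per-method plumbing that the deleted single-device executors carried. A minimal usage sketch, assuming an already-constructed `executor` instance (the instance name and the parameter-counting function are hypothetical; the signatures match the definitions above):

import torch.nn as nn

def count_parameters(model: nn.Module) -> int:
    # Runs inside each worker process, next to the loaded weights.
    return sum(p.numel() for p in model.parameters())

# Callable RPC: one result per worker (e.g. four results with TP=4).
param_counts = executor.apply_model(count_parameters)

# Name-based RPC: invoke a worker method by name on every worker,
# then reduce, mirroring determine_num_available_blocks() above.
results = executor.collective_rpc("determine_num_available_blocks")
num_gpu_blocks = min(r[0] for r in results)
num_cpu_blocks = min(r[1] for r in results)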
- logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, - num_cpu_blocks) - max_concurrency = (num_gpu_blocks * self.cache_config.block_size / - self.model_config.max_model_len) - logger.info("Maximum concurrency for %s tokens per request: %.2fx", - self.model_config.max_model_len, max_concurrency) - - self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - - def execute_model( - self, execute_model_req: ExecuteModelRequest - ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]: - output = self.driver_worker.execute_model(execute_model_req) - return output - - def add_lora(self, lora_request: LoRARequest) -> bool: - assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." - return self.driver_worker.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - assert lora_id > 0, "lora_id must be greater than 0." - return self.driver_worker.remove_lora(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - assert lora_id > 0, "lora_id must be greater than 0." - return self.driver_worker.pin_lora(lora_id) - - def list_loras(self) -> Set[int]: - return self.driver_worker.list_loras() - - def add_prompt_adapter( - self, prompt_adapter_request: PromptAdapterRequest) -> bool: - assert prompt_adapter_request.prompt_adapter_id > 0, \ - "prompt_adapter_id must be greater than 0." - return self.driver_worker.add_prompt_adapter(prompt_adapter_request) - - def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - assert prompt_adapter_id > 0, \ - "prompt_adapter_id must be greater than 0." - return self.driver_worker.remove_prompt_adapter(prompt_adapter_id) - - def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - assert prompt_adapter_id > 0, \ - "prompt_adapter_id must be greater than 0." - return self.driver_worker.pin_prompt_adapter(prompt_adapter_id) - - def list_prompt_adapters(self) -> Set[int]: - return self.driver_worker.list_prompt_adapters() - - def check_health(self) -> None: - # GPUExecutor will always be healthy as long as - # it's running. - return - - def start_profile(self) -> None: - self.driver_worker.start_profile() - - def stop_profile(self) -> None: - self.driver_worker.stop_profile() - - -class GPUExecutorAsync(GPUExecutor, ExecutorAsyncBase): - - async def execute_model_async( - self, - execute_model_req: ExecuteModelRequest, - ) -> List[Union[SamplerOutput, PoolerOutput]]: - output = await make_async(self.driver_worker.execute_model - )(execute_model_req=execute_model_req) - return output diff --git a/vllm/executor/hpu_executor.py b/vllm/executor/hpu_executor.py deleted file mode 100644 index c9b7bfa71edfa..0000000000000 --- a/vllm/executor/hpu_executor.py +++ /dev/null @@ -1,202 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company -############################################################################### - -import contextlib -import os -from typing import Any, Dict, List, Optional, Set, Tuple - -from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sequence import ExecuteModelRequest -from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, - make_async) -from vllm.worker.worker_base import WorkerWrapperBase - -logger = init_logger(__name__) - - -class HPUExecutor(ExecutorBase): - - uses_ray: bool = False - - def _init_executor(self) -> None: - """Initialize the worker and load the model.""" - self._init_worker() - - def _get_worker_kwargs( - self, - local_rank: int = 0, - rank: int = 0, - distributed_init_method: Optional[str] = None) -> Dict[str, Any]: - """Return worker init args for a given rank.""" - if distributed_init_method is None: - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - return dict( - vllm_config=self.vllm_config, - local_rank=local_rank, - rank=rank, - distributed_init_method=distributed_init_method, - is_driver_worker=rank == 0, - ) - - def _create_worker(self, - local_rank: int = 0, - rank: int = 0, - distributed_init_method: Optional[str] = None): - wrapper = WorkerWrapperBase(vllm_config=self.vllm_config) - wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank, - distributed_init_method)) - return wrapper.worker - - def _init_worker(self): - assert self.parallel_config.world_size == 1, ( - "GPUExecutor only supports single GPU.") - - self.driver_worker = self._create_worker() - self.driver_worker.init_device() - self.driver_worker.load_model() - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Determine the number of available KV blocks by invoking the - underlying worker. - """ - return self.driver_worker.determine_num_available_blocks() - - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: - """Initialize the KV cache by invoking the underlying worker. - """ - # NOTE: This is logged in the executor because there can be >1 worker - # with other executors. We could log in the engine level, but work - # remains to abstract away the device for non-GPU configurations. - logger.info("# HPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, - num_cpu_blocks) - from vllm_hpu_extension.profiler import HabanaMemoryProfiler - with HabanaMemoryProfiler() as cache_init_m: - self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - msg = f"init_cache_engine took {cache_init_m.get_summary_string()}" - logger.info(msg) - - def finish_measurements(self): - self.driver_worker.finish_measurements() - - def execute_model( - self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! 
# noqa:E501 - # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501 - # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # noqa:E501 - # VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL - will log cpu fallbacks per engine step, always, even if there were none # noqa:E501 - log_graph_compilation_all = os.environ.get( - 'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0' - log_graph_compilation = os.environ.get( - 'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION', - '0') != '0' or log_graph_compilation_all - log_cpu_fallbacks_all = os.environ.get( - 'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0' - log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS', - '0') != '0' or log_cpu_fallbacks_all - if log_graph_compilation or log_cpu_fallbacks: - from habana_frameworks.torch.hpu.metrics import metric_localcontext - seq_group_metadata_list = execute_model_req.seq_group_metadata_list - is_prompt = any([ - seq_group_metadata.is_prompt - for seq_group_metadata in seq_group_metadata_list - ]) - max_context_len = max([ - max([ - len(v.prompt_token_ids) + len(v.output_token_ids) - for v in seq_group_metadata.seq_data.values() - ]) for seq_group_metadata in seq_group_metadata_list - ]) # whoa, that's some spicy stuff right here - max_num_blocks = ( - (max_context_len - 1) // self.cache_config.block_size) + 1 - input_stats = (f'is_prompt: {is_prompt}, ' - f'num_seqs: {len(seq_group_metadata_list)}, ' - f'max_context_len: {max_context_len}, ' - f'max_num_blocks {max_num_blocks}') - gc_ctx = metric_localcontext( - "graph_compilation" - ) if log_graph_compilation else contextlib.nullcontext() - cpu_fallback_ctx = metric_localcontext( - "cpu_fallback" - ) if log_cpu_fallbacks else contextlib.nullcontext() - with gc_ctx as gc_local_metric, \ - cpu_fallback_ctx as cpu_fallback_local_metric: - output = self.driver_worker.execute_model(execute_model_req) - if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0 - ) or log_graph_compilation_all: - msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: " - f"{gc_local_metric.stats()}, {input_stats}") - logger.warning(msg) - if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] > - 0) or log_cpu_fallbacks_all: - msg = ("VLLM_HPU_STEP_CPU_FALLBACK: " - f"{cpu_fallback_local_metric.stats()}, {input_stats}") - logger.warning(msg) - - return output - - output = self.driver_worker.execute_model(execute_model_req) - return output - - def add_lora(self, lora_request: LoRARequest) -> bool: - assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." - return self.driver_worker.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - assert lora_id > 0, "lora_id must be greater than 0." - return self.driver_worker.remove_lora(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - assert lora_id > 0, "lora_id must be greater than 0." 
-        return self.driver_worker.pin_lora(lora_id)
-
-    def list_loras(self) -> Set[int]:
-        return self.driver_worker.list_loras()
-
-    def add_prompt_adapter(
-            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
-        raise NotImplementedError(
-            "Prompt Adapter is not implemented for HPU backend.")
-
-    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        raise NotImplementedError(
-            "Prompt Adapter is not implemented for HPU backend.")
-
-    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        raise NotImplementedError(
-            "Prompt Adapter is not implemented for HPU backend.")
-
-    def list_prompt_adapters(self) -> Set[int]:
-        raise NotImplementedError(
-            "Prompt Adapter is not implemented for HPU backend.")
-
-    def check_health(self) -> None:
-        # GPUExecutor will always be healthy as long as
-        # it's running.
-        return
-
-    def start_profile(self) -> None:
-        self.driver_worker.start_profile()
-
-    def stop_profile(self) -> None:
-        self.driver_worker.stop_profile()
-
-    def shutdown(self) -> None:
-        self.driver_worker.shutdown_inc()
-
-
-class HPUExecutorAsync(HPUExecutor, ExecutorAsyncBase):
-
-    async def execute_model_async(
-        self,
-        execute_model_req: ExecuteModelRequest,
-    ) -> List[SamplerOutput]:
-        output = await make_async(self.driver_worker.execute_model
-                                  )(execute_model_req=execute_model_req, )
-        return output
diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/mp_distributed_executor.py
similarity index 71%
rename from vllm/executor/multiproc_gpu_executor.py
rename to vllm/executor/mp_distributed_executor.py
index fc58163cade64..78c86321d861d 100644
--- a/vllm/executor/multiproc_gpu_executor.py
+++ b/vllm/executor/mp_distributed_executor.py
@@ -1,11 +1,10 @@
 import asyncio
 import os
-from functools import partial
-from typing import Any, List, Optional
+from typing import Any, Callable, List, Optional, Union
 
-from vllm.executor.distributed_gpu_executor import (  # yapf: disable
-    DistributedGPUExecutor, DistributedGPUExecutorAsync)
-from vllm.executor.gpu_executor import create_worker
+import cloudpickle
+
+from vllm.executor.executor_base import DistributedExecutorBase
 from vllm.executor.multiproc_worker_utils import (
     ProcessWorkerWrapper, ResultHandler, WorkerMonitor,
     set_multiprocessing_worker_envs)
@@ -13,19 +12,50 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.sequence import ExecuteModelRequest
 from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless,
-                        get_distributed_init_method, get_open_port, make_async,
-                        update_environment_variables)
+                        get_distributed_init_method, get_ip, get_open_port,
+                        make_async, run_method, update_environment_variables)
+from vllm.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
 
 
-class MultiprocessingGPUExecutor(DistributedGPUExecutor):
-    """Python multiprocessing-based multi-GPU executor"""
+class MultiprocessingDistributedExecutor(DistributedExecutorBase):
+    """Python multiprocessing-based distributed executor"""
 
     uses_ray: bool = False
 
+    def _check_cuda(self) -> None:
+        """Check that the number of GPUs is sufficient for the parallel
+        configuration. Separate from _init_executor to reduce the number of
+        indented blocks.
+        """
+        parallel_config = self.parallel_config
+        world_size = parallel_config.world_size
+        tensor_parallel_size = parallel_config.tensor_parallel_size
+
+        cuda_device_count = cuda_device_count_stateless()
+        # Report the more common TP-only misconfiguration first.
+        if tensor_parallel_size > cuda_device_count:
+            raise RuntimeError(
+                f"please set tensor_parallel_size ({tensor_parallel_size}) "
+                f"to no more than the local gpu count ({cuda_device_count})")
+
+        if world_size > cuda_device_count:
+            raise RuntimeError(
+                f"please ensure that world_size ({world_size}) "
+                f"is no more than the local gpu count ({cuda_device_count})")
+
+        # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
+        if "CUDA_VISIBLE_DEVICES" not in os.environ:
+            update_environment_variables({
+                "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
+            })
+
     def _init_executor(self) -> None:
-        self._check_executor_parameters()
+
+        from vllm.platforms import current_platform
+        if current_platform.is_cuda_alike():
+            self._check_cuda()
 
         # Create the parallel GPU workers.
         world_size = self.parallel_config.world_size
@@ -55,15 +85,9 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
         else:
             result_handler = ResultHandler()
             for rank in range(1, world_size):
-                worker = ProcessWorkerWrapper(
-                    result_handler,
-                    partial(
-                        create_worker,
-                        **self._get_worker_kwargs(
-                            rank=rank,
-                            local_rank=rank,
-                            distributed_init_method=distributed_init_method,
-                        )))
+                worker = ProcessWorkerWrapper(result_handler,
+                                              WorkerWrapperBase,
+                                              self.vllm_config, rank)
                 self.workers.append(worker)
                 if rank % tensor_parallel_size == 0:
                     self.tp_driver_workers.append(worker)
@@ -77,32 +101,30 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
 
         # Set up signal handlers to shutdown the executor cleanly
         # sometimes gc does not work well
 
-        self.driver_worker = self._create_worker(
-            distributed_init_method=distributed_init_method)
+        self.driver_worker = WorkerWrapperBase(self.vllm_config, 0)
+
+        all_kwargs = []
+        distributed_init_method = get_distributed_init_method(
+            get_ip(), get_open_port())
+        for i in range(world_size):
+            local_rank = i
+            rank = i
+            kwargs = dict(
+                vllm_config=self.vllm_config,
+                local_rank=local_rank,
+                rank=rank,
+                distributed_init_method=distributed_init_method,
+                is_driver_worker=(not self.parallel_config)
+                or (rank % self.parallel_config.tensor_parallel_size == 0),
+            )
+            all_kwargs.append(kwargs)
+        self._run_workers("init_worker", all_kwargs)
         self._run_workers("init_device")
         self._run_workers("load_model",
                           max_concurrent_workers=self.parallel_config.
                           max_parallel_loading_workers)
-
-    def _check_executor_parameters(self):
-        world_size = self.parallel_config.world_size
-        tensor_parallel_size = self.parallel_config.tensor_parallel_size
-
-        # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
-        if "CUDA_VISIBLE_DEVICES" not in os.environ:
-            update_environment_variables({
-                "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
-            })
-
-        cuda_device_count = cuda_device_count_stateless()
-        # Use confusing message for more common TP-only case.
- assert tensor_parallel_size <= cuda_device_count, ( - f"please set tensor_parallel_size ({tensor_parallel_size}) " - f"to less than max local gpu count ({cuda_device_count})") - - assert world_size <= cuda_device_count, ( - f"please ensure that world_size ({world_size}) " - f"is less than than max local gpu count ({cuda_device_count})") + self.driver_exec_model = make_async(self.driver_worker.execute_model) + self.pp_locks: Optional[List[asyncio.Lock]] = None def shutdown(self): if (worker_monitor := getattr(self, "worker_monitor", @@ -121,12 +143,12 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor): def _run_workers( self, - method: str, + method: Union[str, Callable], *args, async_run_tensor_parallel_workers_only: bool = False, max_concurrent_workers: Optional[int] = None, **kwargs, - ) -> Any: + ) -> List[Any]: """Runs the given method on all workers. Args: @@ -135,6 +157,11 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor): It will also be run asynchronously and return a list of futures rather than blocking on the results. """ + if isinstance(method, str): + sent_method = method + else: + sent_method = cloudpickle.dumps(method) + del method if max_concurrent_workers: raise NotImplementedError( @@ -143,18 +170,18 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor): if async_run_tensor_parallel_workers_only: # Run only non-driver workers and just return futures. return [ - worker.execute_method(method, *args, **kwargs) + worker.execute_method(sent_method, *args, **kwargs) for worker in self.non_driver_workers ] # Start all remote workers first. worker_outputs = [ - worker.execute_method(method, *args, **kwargs) + worker.execute_method(sent_method, *args, **kwargs) for worker in self.workers ] - driver_worker_method = getattr(self.driver_worker, method) - driver_worker_output = driver_worker_method(*args, **kwargs) + driver_worker_output = run_method(self.driver_worker, sent_method, + args, kwargs) # Get the results of the workers. 
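With `_run_workers` now accepting either a method name or a callable, callables are shipped to the worker processes as `cloudpickle` bytes and dispatched by `run_method` from `vllm.utils`. A minimal sketch of the dispatch this implies, under the assumption that pickled bytes decode to a callable taking the worker as its first argument (the real helper may differ in details):

import cloudpickle

def run_method_sketch(worker, method, args, kwargs):
    # A plain string is looked up as an attribute of the worker;
    # bytes are assumed to be a cloudpickled callable.
    if isinstance(method, (bytes, bytearray)):
        func = cloudpickle.loads(method)
        return func(worker, *args, **kwargs)
    return getattr(worker, method)(*args, **kwargs)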
return [driver_worker_output @@ -172,15 +199,6 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor): for result in parallel_worker_tasks: result.get() - -class MultiprocessingGPUExecutorAsync(MultiprocessingGPUExecutor, - DistributedGPUExecutorAsync): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.driver_exec_model = make_async(self.driver_worker.execute_model) - self.pp_locks: Optional[List[asyncio.Lock]] = None - async def _driver_execute_model_async( self, execute_model_req: Optional[ExecuteModelRequest] = None diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py index bc32826529eef..539b6ae2d3572 100644 --- a/vllm/executor/multiproc_worker_utils.py +++ b/vllm/executor/multiproc_worker_utils.py @@ -12,9 +12,10 @@ from typing import (Any, Callable, Dict, Generic, List, Optional, TextIO, import torch +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.triton_utils.importing import HAS_TRITON -from vllm.utils import _check_multiproc_method, get_mp_context +from vllm.utils import _check_multiproc_method, get_mp_context, run_method if HAS_TRITON: from vllm.triton_utils import maybe_set_triton_cache_manager @@ -147,7 +148,8 @@ class ProcessWorkerWrapper: for handling single-node multi-GPU tensor parallel.""" def __init__(self, result_handler: ResultHandler, - worker_factory: Callable[[], Any]) -> None: + worker_factory: Callable[[VllmConfig, int], Any], + vllm_config: VllmConfig, rank: int) -> None: self.mp = get_mp_context() self._task_queue = self.mp.Queue() self.result_queue = result_handler.result_queue @@ -159,13 +161,15 @@ class ProcessWorkerWrapper: worker_factory=worker_factory, task_queue=self._task_queue, result_queue=self.result_queue, + vllm_config=vllm_config, + rank=rank, ), daemon=True) self.process.start() def _enqueue_task(self, future: Union[ResultFuture, asyncio.Future], - method: str, args, kwargs): + method: Union[str, bytes], args, kwargs): task_id = uuid.uuid4() self.tasks[task_id] = future try: @@ -176,12 +180,13 @@ class ProcessWorkerWrapper: del self.tasks[task_id] raise ChildProcessError("worker died") from e - def execute_method(self, method: str, *args, **kwargs): + def execute_method(self, method: Union[str, bytes], *args, **kwargs): future: ResultFuture = ResultFuture() self._enqueue_task(future, method, args, kwargs) return future - async def execute_method_async(self, method: str, *args, **kwargs): + async def execute_method_async(self, method: Union[str, bytes], *args, + **kwargs): future = asyncio.get_running_loop().create_future() self._enqueue_task(future, method, args, kwargs) return await future @@ -199,9 +204,11 @@ class ProcessWorkerWrapper: def _run_worker_process( - worker_factory: Callable[[], Any], + worker_factory: Callable[[VllmConfig, int], Any], task_queue: Queue, result_queue: Queue, + vllm_config: VllmConfig, + rank: int, ) -> None: """Worker process event loop""" @@ -212,7 +219,7 @@ def _run_worker_process( _add_prefix(sys.stderr, process_name, pid) # Initialize worker - worker = worker_factory() + worker = worker_factory(vllm_config, rank) del worker_factory # Accept tasks from the engine in task_queue @@ -224,8 +231,7 @@ def _run_worker_process( exception = None task_id, method, args, kwargs = items try: - executor = getattr(worker, method) - output = executor(*args, **kwargs) + output = run_method(worker, method, args, kwargs) except SystemExit: raise except KeyboardInterrupt: diff --git 
a/vllm/executor/multiproc_xpu_executor.py b/vllm/executor/multiproc_xpu_executor.py deleted file mode 100644 index a66afbf939ef0..0000000000000 --- a/vllm/executor/multiproc_xpu_executor.py +++ /dev/null @@ -1,26 +0,0 @@ -import vllm.envs as envs -from vllm.executor.multiproc_gpu_executor import ( - MultiprocessingGPUExecutor, MultiprocessingGPUExecutorAsync) -from vllm.executor.xpu_executor import XPUExecutor -from vllm.logger import init_logger -from vllm.utils import make_async - -logger = init_logger(__name__) - - -class MultiprocessingXPUExecutor(MultiprocessingGPUExecutor, XPUExecutor): - """Python multiprocessing-based multi-XPU executor""" - - def _check_executor_parameters(self): - mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD - if mp_method != "spawn": - raise RuntimeError( - "XPU multiprocess executor only support spawn as mp method") - - -class MultiprocessingXPUExecutorAsync(MultiprocessingXPUExecutor, - MultiprocessingGPUExecutorAsync): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.driver_exec_model = make_async(self.driver_worker.execute_model) diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py deleted file mode 100644 index a9efc4f9a801c..0000000000000 --- a/vllm/executor/neuron_executor.py +++ /dev/null @@ -1,114 +0,0 @@ -from typing import List, Set, Tuple - -from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest -from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, - make_async) -from vllm.worker.worker_base import WorkerWrapperBase - -logger = init_logger(__name__) - - -class NeuronExecutor(ExecutorBase): - - uses_ray: bool = False - - def _init_executor(self) -> None: - assert (self.lora_config is - None), "LoRA is not supported for Neuron backend." - assert (not self.speculative_config - ), "Speculative decoding not yet supported for Neuron backend." - - # Instantiate the worker and load the model to the device. - self._init_worker() - - def _init_worker(self): - wrapper = WorkerWrapperBase(vllm_config=self.vllm_config) - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - wrapper.init_worker( - vllm_config=self.vllm_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - ) - self.driver_worker = wrapper.worker - self.driver_worker.init_device() - self.driver_worker.load_model() - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Determine the number of available KV blocks by invoking the - underlying worker. - """ - return self.driver_worker.determine_num_available_blocks() - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - """Initialize the KV cache by invoking the underlying worker. 
- """ - self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - - def execute_model( - self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - assert (not execute_model_req.blocks_to_swap_in - and not execute_model_req.blocks_to_swap_out - and not execute_model_req.blocks_to_copy), ( - "Cache operations are not supported for Neuron backend.") - assert execute_model_req.num_lookahead_slots == 0, ( - "lookahead not supported for Neuron backend.") - - output = self.driver_worker.execute_model(execute_model_req) - return output - - def add_lora(self, lora_request: LoRARequest) -> bool: - return self.driver_worker.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - return self.driver_worker.remove_lora(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - return self.driver_worker.pin_lora(lora_id) - - def list_loras(self) -> Set[int]: - return self.driver_worker.list_loras() - - def add_prompt_adapter(self, prompt_adapter_request) -> bool: - raise NotImplementedError( - "Soft prompt is currently not supported by the Neuron backend.") - - def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError( - "Soft prompt is currently not supported by the Neuron backend.") - - def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError( - "Soft prompt is currently not supported by the Neuron backend.") - - def list_prompt_adapters(self) -> Set[int]: - raise NotImplementedError( - "Soft prompt is currently not supported by the Neuron backend.") - - def check_health(self) -> None: - # NeuronExecutor will always be healthy as long as - # it's running. - return - - -class NeuronExecutorAsync(NeuronExecutor, ExecutorAsyncBase): - - async def execute_model_async( - self, - execute_model_req: ExecuteModelRequest, - ) -> List[SamplerOutput]: - output = await make_async(self.driver_worker.execute_model - )(execute_model_req=execute_model_req, ) - return output - - async def check_health_async(self) -> None: - # NeuronExecutor will always be healthy as long as - # it's running. - return diff --git a/vllm/executor/openvino_executor.py b/vllm/executor/openvino_executor.py deleted file mode 100644 index 057a32364e512..0000000000000 --- a/vllm/executor/openvino_executor.py +++ /dev/null @@ -1,125 +0,0 @@ -from typing import List, Set, Tuple - -import openvino as ov - -import vllm.envs as envs -from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.platforms import current_platform -from vllm.sequence import ExecuteModelRequest -from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, - make_async) -from vllm.worker.worker_base import WorkerWrapperBase - -logger = init_logger(__name__) - - -class OpenVINOExecutor(ExecutorBase): - - uses_ray: bool = False - - def _init_executor(self) -> None: - assert self.device_config.device_type == "openvino" - assert self.lora_config is None, "OpenVINO backend doesn't support LoRA" - assert current_platform.is_openvino_cpu() or \ - current_platform.is_openvino_gpu(), \ - "OpenVINO backend supports only CPU and GPU devices" - - # Instantiate the worker and load the model to CPU. 
- self._init_worker() - - def _init_worker(self): - - wrapper = WorkerWrapperBase(vllm_config=self.vllm_config) - - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - wrapper.init_worker( - ov_core=ov.Core(), - vllm_config=self.vllm_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=True, - ) - self.driver_worker = wrapper.worker - self.driver_worker.init_device() - self.driver_worker.load_model() - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Determine the number of available KV blocks by invoking the - underlying worker. - """ - return self.driver_worker.determine_num_available_blocks() - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - """Initialize the KV cache by invoking the underlying worker.""" - # NOTE: We log here to avoid multiple logs when number of workers is - # greater than one. We could log in the engine, but not all executors - # have GPUs. - # NOTE: In case of a CPU device, `cpu block` for OpenVINO backend - # is located on CPU memory but is referred as `gpu block`. - # Because we want to reuse the existing block management procedure. - device_blocks = num_gpu_blocks - swap_blocks = num_cpu_blocks - logger.info("OpenVINO %s: # device blocks: %d; # swap blocks: %d", - envs.VLLM_OPENVINO_DEVICE, device_blocks, swap_blocks) - self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - - def execute_model( - self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - output = self.driver_worker.execute_model(execute_model_req) - return output - - def add_lora(self, lora_request: LoRARequest) -> bool: - return self.driver_worker.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - return self.driver_worker.remove_lora(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - return self.driver_worker.pin_lora(lora_id) - - def list_loras(self) -> Set[int]: - return self.driver_worker.list_loras() - - def add_prompt_adapter(self, prompt_adapter_request) -> bool: - raise NotImplementedError( - "Soft prompt is currently not supported by the OPENVINO backend.") - - def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError( - "Soft prompt is currently not supported by the OPENVINO backend.") - - def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError( - "Soft prompt is currently not supported by the OPENVINO backend.") - - def list_prompt_adapters(self) -> Set[int]: - raise NotImplementedError( - "Soft prompt is currently not supported by the OPENVINO backend.") - - def check_health(self) -> None: - # OpenVINOExecutor will always be healthy as long as - # it's running. - return - - -class OpenVINOExecutorAsync(OpenVINOExecutor, ExecutorAsyncBase): - - async def execute_model_async( - self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - output = await make_async(self.driver_worker.execute_model - )(execute_model_req=execute_model_req, ) - return output - - async def check_health_async(self) -> None: - # OpenVINOExecutor will always be healthy as long as - # it's running. 
- return diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_distributed_executor.py similarity index 77% rename from vllm/executor/ray_gpu_executor.py rename to vllm/executor/ray_distributed_executor.py index e2c549cbd5331..2afd99f99b353 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -1,24 +1,30 @@ import asyncio import os from collections import defaultdict -from itertools import islice, repeat -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union +import cloudpickle import msgspec import vllm.envs as envs -from vllm.executor.distributed_gpu_executor import ( # yapf: disable - DistributedGPUExecutor, DistributedGPUExecutorAsync) +from vllm.executor.executor_base import ( + DistributedExecutorBase) # yapf: disable from vllm.executor.msgspec_utils import encode_hook -from vllm.executor.ray_utils import RayWorkerWrapper, ray +from vllm.executor.ray_utils import (RayWorkerWrapper, initialize_ray_cluster, + ray) from vllm.logger import init_logger from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.platforms import current_platform from vllm.sequence import ExecuteModelRequest from vllm.utils import (_run_task_with_lock, get_distributed_init_method, get_ip, get_open_port, make_async) if ray is not None: + from ray.actor import ActorHandle from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy +else: + ActorHandle = None if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -26,12 +32,29 @@ if TYPE_CHECKING: logger = init_logger(__name__) -class RayGPUExecutor(DistributedGPUExecutor): +@dataclass +class RayWorkerMetaData: + """ + Metadata for a Ray worker. + The order of ray worker creation can be random, + and we need to reset the rank after creating all workers. + """ + worker: ActorHandle + created_rank: int + adjusted_rank: int = -1 + ip: str = "" + + +class RayDistributedExecutor(DistributedExecutorBase): uses_ray: bool = True def _init_executor(self) -> None: self.forward_dag: Optional[ray.dag.CompiledDAG] = None + if envs.VLLM_USE_V1: + # v1 always uses the compiled DAG and SPMD worker. + os.environ["VLLM_USE_RAY_SPMD_WORKER"] = "1" + os.environ["VLLM_USE_RAY_COMPILED_DAG"] = "1" # If the env var is set, it uses the Ray's compiled DAG API # which optimizes the control plane overhead. # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. @@ -53,6 +76,7 @@ class RayGPUExecutor(DistributedGPUExecutor): "VLLM_USE_RAY_COMPILED_DAG=1") assert self.uses_ray + initialize_ray_cluster(self.parallel_config) placement_group = self.parallel_config.placement_group # Disable Ray usage stats collection. @@ -66,6 +90,13 @@ class RayGPUExecutor(DistributedGPUExecutor): self.input_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook) self.output_decoder = msgspec.msgpack.Decoder( Optional[List[SamplerOutput]]) + self.use_v1 = envs.VLLM_USE_V1 + + self.pp_locks: Optional[List[asyncio.Lock]] = None + self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER + if not self.use_ray_compiled_dag: + self.driver_exec_method = make_async( + self.driver_worker.execute_method) def shutdown(self) -> None: if hasattr(self, "forward_dag") and self.forward_dag is not None: @@ -123,9 +154,10 @@ class RayGPUExecutor(DistributedGPUExecutor): # Create the workers. 
driver_ip = get_ip() - workers = [] + rank = 0 + worker_metadata: List[RayWorkerMetaData] = [] for bundle_id, bundle in enumerate(placement_group.bundle_specs): - if not bundle.get("GPU", 0): + if not bundle.get(current_platform.ray_device_key, 0): continue scheduling_strategy = PlacementGroupSchedulingStrategy( placement_group=placement_group, @@ -133,38 +165,51 @@ class RayGPUExecutor(DistributedGPUExecutor): placement_group_bundle_index=bundle_id, ) - worker = ray.remote( - num_cpus=0, - num_gpus=num_gpus, - scheduling_strategy=scheduling_strategy, - **ray_remote_kwargs, - )(RayWorkerWrapper).remote(vllm_config=self.vllm_config) - workers.append(worker) + if current_platform.ray_device_key == "GPU": + # NV+AMD GPUs, and Intel XPUs + worker = ray.remote( + num_cpus=0, + num_gpus=num_gpus, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerWrapper).remote(vllm_config=self.vllm_config, + rpc_rank=rank) + else: + worker = ray.remote( + num_cpus=0, + num_gpus=0, + resources={current_platform.ray_device_key: num_gpus}, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerWrapper).remote(vllm_config=self.vllm_config, + rpc_rank=rank) + worker_metadata.append( + RayWorkerMetaData(worker=worker, created_rank=rank)) + rank += 1 - worker_ip_refs = [ - worker.get_node_ip.remote() # type: ignore[attr-defined] - for worker in workers - ] - worker_ips = ray.get(worker_ip_refs) + worker_ips = ray.get([ + each.worker.get_node_ip.remote() # type: ignore[attr-defined] + for each in worker_metadata + ]) + + for each, ip in zip(worker_metadata, worker_ips): + each.ip = ip if not self.use_ray_spmd_worker: - for i in range(len(workers)): - worker = workers[i] - worker_ip = worker_ips[i] + for i, each in enumerate(worker_metadata): + # find and remove the dummy worker from the list + worker = each.worker + worker_ip = each.ip if self.driver_dummy_worker is None and worker_ip == driver_ip: # If the worker is on the same node as the driver, we use it # as the resource holder for the driver process. self.driver_dummy_worker = worker self.driver_worker = RayWorkerWrapper( - vllm_config=self.vllm_config) - workers.pop(i) - worker_ips.pop(i) - self.workers = workers + vllm_config=self.vllm_config, rpc_rank=0) + worker_metadata.pop(i) break - else: - self.workers = workers - logger.debug("workers: %s", self.workers) + logger.debug("workers: %s", worker_metadata) logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker) if not self.use_ray_spmd_worker and self.driver_dummy_worker is None: raise ValueError( @@ -176,9 +221,7 @@ class RayGPUExecutor(DistributedGPUExecutor): for ip in worker_ips: ip_counts[ip] = ip_counts.get(ip, 0) + 1 - worker_to_ip = dict(zip(self.workers, worker_ips)) - - def sort_by_driver_then_worker_ip(worker): + def sort_by_driver_then_worker_ip(item: RayWorkerMetaData): """ Sort the workers based on 3 properties: 1. If the worker is on the same node as the driver (vllm engine), @@ -188,13 +231,23 @@ class RayGPUExecutor(DistributedGPUExecutor): 3. Finally, if the work is on a node with smaller IP address, it should be placed first. """ - ip = worker_to_ip[worker] - return (ip != driver_ip, ip_counts[ip], ip) + ip = item.ip + return (0 if ip == driver_ip else 1, ip_counts[ip], ip) # After sorting, the workers on the same node will be # close to each other, and the workers on the driver # node will be placed first. 
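The three-part sort key used by `sort_by_driver_then_worker_ip` above is easiest to see on concrete data. A toy example with made-up IPs:

driver_ip = "10.0.0.1"
ip_counts = {"10.0.0.1": 1, "10.0.0.2": 2, "10.0.0.3": 1}
worker_ips = ["10.0.0.3", "10.0.0.2", "10.0.0.1", "10.0.0.2"]

ordered = sorted(
    worker_ips,
    key=lambda ip: (0 if ip == driver_ip else 1, ip_counts[ip], ip))
# Driver node first, then the node with fewer workers, then the
# node with two workers (which stay adjacent):
assert ordered == ["10.0.0.1", "10.0.0.3", "10.0.0.2", "10.0.0.2"]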
- self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip) + sorted_worker_metadata = sorted(worker_metadata, + key=sort_by_driver_then_worker_ip) + start_rank = 0 if self.use_ray_spmd_worker else 1 + for i, item in enumerate(sorted_worker_metadata): + item.adjusted_rank = i + start_rank + self.workers = [item.worker for item in sorted_worker_metadata] + rerank_mapping = { + item.created_rank: item.adjusted_rank + for item in sorted_worker_metadata + } + self._run_workers("adjust_rank", rerank_mapping) # Get the set of GPU IDs used on each node. worker_node_and_gpu_ids = [] @@ -235,21 +288,29 @@ class RayGPUExecutor(DistributedGPUExecutor): " each node.") # Set environment variables for the driver and workers. - all_args_to_update_environment_variables = [({ - "CUDA_VISIBLE_DEVICES": + all_args_to_update_environment_variables = [{ + current_platform.device_control_env_var: ",".join(map(str, node_gpus[node_id])), - "VLLM_TRACE_FUNCTION": - str(envs.VLLM_TRACE_FUNCTION), - **({ - "VLLM_ATTENTION_BACKEND": envs.VLLM_ATTENTION_BACKEND - } if envs.VLLM_ATTENTION_BACKEND is not None else {}) - }, ) for (node_id, _) in worker_node_and_gpu_ids] + } for (node_id, _) in worker_node_and_gpu_ids] + + for args in all_args_to_update_environment_variables: + # some carry-over env vars from the driver + # TODO: refactor platform-specific env vars + for name in [ + "VLLM_ATTENTION_BACKEND", + "TPU_CHIPS_PER_HOST_BOUNDS", + "TPU_HOST_BOUNDS", + "VLLM_USE_V1", + "VLLM_TRACE_FUNCTION", + ]: + if name in os.environ: + args[name] = os.environ[name] self._env_vars_for_all_workers = ( all_args_to_update_environment_variables) self._run_workers("update_environment_variables", - all_args=self._get_env_vars_to_be_updated()) + self._get_env_vars_to_be_updated()) if len(node_gpus) == 1: # in single node case, we don't need to get the IP address. @@ -265,14 +326,19 @@ class RayGPUExecutor(DistributedGPUExecutor): driver_ip, get_open_port()) # Initialize the actual workers inside worker wrapper. 
- init_worker_all_kwargs = [ - self._get_worker_kwargs( - local_rank=node_workers[node_id].index(rank), + all_kwargs = [] + for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids): + local_rank = node_workers[node_id].index(rank) + kwargs = dict( + vllm_config=self.vllm_config, + local_rank=local_rank, rank=rank, distributed_init_method=distributed_init_method, - ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids) - ] - self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) + is_driver_worker=(not self.parallel_config) + or (rank % self.parallel_config.tensor_parallel_size == 0), + ) + all_kwargs.append(kwargs) + self._run_workers("init_worker", all_kwargs) self._run_workers("init_device") self._run_workers("load_model", @@ -332,18 +398,22 @@ class RayGPUExecutor(DistributedGPUExecutor): if self.forward_dag is None: self.forward_dag = self._compiled_ray_dag(enable_asyncio=False) - serialized_data = self.input_encoder.encode(execute_model_req) + if self.use_v1: + serialized_data = execute_model_req + else: + serialized_data = self.input_encoder.encode(execute_model_req) outputs = ray.get(self.forward_dag.execute(serialized_data)) - output = self.output_decoder.decode(outputs[0]) + if self.use_v1: + output = outputs[0] + else: + output = self.output_decoder.decode(outputs[0]) return output def _run_workers( self, - method: str, + method: Union[str, Callable], *args, async_run_tensor_parallel_workers_only: bool = False, - all_args: Optional[List[Tuple[Any, ...]]] = None, - all_kwargs: Optional[List[Dict[str, Any]]] = None, max_concurrent_workers: Optional[int] = None, **kwargs, ) -> Any: @@ -356,9 +426,12 @@ class RayGPUExecutor(DistributedGPUExecutor): It will also be run asynchronously and return a list of futures rather than blocking on the results. - args/kwargs: All workers share the same args/kwargs - - all_args/all_kwargs: args/kwargs for each worker are specified - individually """ + if isinstance(method, str): + sent_method = method + else: + sent_method = cloudpickle.dumps(method) + del method if self.use_ray_spmd_worker: assert not async_run_tensor_parallel_workers_only, ( "async_run_tensor_parallel_workers_only is not supported for " @@ -368,26 +441,13 @@ class RayGPUExecutor(DistributedGPUExecutor): raise NotImplementedError( "max_concurrent_workers is not supported yet.") - count = len(self.workers) if not \ - async_run_tensor_parallel_workers_only \ - else len(self.non_driver_workers) - # If using SPMD worker, all workers are the same, so we should execute - # the args on all workers. Otherwise, we skip the first worker's args - # because those args will go to the driver worker. - first_worker_args_index: int = 0 if self.use_ray_spmd_worker else 1 - all_worker_args = repeat(args, count) if all_args is None \ - else islice(all_args, first_worker_args_index, None) - all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ - else islice(all_kwargs, first_worker_args_index, None) - # Start the ray workers first. 
ray_workers = self.workers if async_run_tensor_parallel_workers_only: ray_workers = self.non_driver_workers ray_worker_outputs = [ - worker.execute_method.remote(method, *worker_args, **worker_kwargs) - for (worker, worker_args, worker_kwargs - ) in zip(ray_workers, all_worker_args, all_worker_kwargs) + worker.execute_method.remote(sent_method, *args, **kwargs) + for worker in ray_workers ] if async_run_tensor_parallel_workers_only: @@ -399,13 +459,9 @@ class RayGPUExecutor(DistributedGPUExecutor): # so we only explicitly execute on the driver worker if using a # non-SPMD worker class. if not self.use_ray_spmd_worker: - driver_args = args if all_args is None else all_args[0] - driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] - # Start the driver worker after all the ray workers. driver_worker_output = [ - self.driver_worker.execute_method(method, *driver_args, - **driver_kwargs) + self.driver_worker.execute_method(sent_method, *args, **kwargs) ] # Get the results of the ray workers. @@ -467,11 +523,18 @@ class RayGPUExecutor(DistributedGPUExecutor): for pp_rank, tp_group in enumerate(self.pp_tp_workers): # Each PP worker takes in the output of the previous PP worker, # and the TP group executes in SPMD fashion. - outputs = [ - worker.execute_model_spmd. - bind( # type: ignore[attr-defined] - outputs[i]) for i, worker in enumerate(tp_group) - ] + if self.use_v1: + outputs = [ + worker.execute_model. + bind( # type: ignore[attr-defined] + outputs[i]) for i, worker in enumerate(tp_group) + ] + else: + outputs = [ + worker.execute_model_spmd. + bind( # type: ignore[attr-defined] + outputs[i]) for i, worker in enumerate(tp_group) + ] last_pp_rank = len(self.pp_tp_workers) - 1 if pp_rank < last_pp_rank: @@ -497,17 +560,6 @@ class RayGPUExecutor(DistributedGPUExecutor): def __del__(self): self.shutdown() - -class RayGPUExecutorAsync(RayGPUExecutor, DistributedGPUExecutorAsync): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.pp_locks: Optional[List[asyncio.Lock]] = None - self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER - if not self.use_ray_compiled_dag: - self.driver_exec_method = make_async( - self.driver_worker.execute_method) - async def execute_model_async( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: @@ -568,5 +620,7 @@ class RayGPUExecutorAsync(RayGPUExecutor, DistributedGPUExecutorAsync): ] return await asyncio.gather(*coros) - def __del__(self): - self.shutdown() + def check_health(self) -> None: + # Assume that the Ray workers are healthy. 
+ # TODO: check the health of the Ray workers + return diff --git a/vllm/executor/ray_hpu_executor.py b/vllm/executor/ray_hpu_executor.py deleted file mode 100644 index f3025cb537ab8..0000000000000 --- a/vllm/executor/ray_hpu_executor.py +++ /dev/null @@ -1,515 +0,0 @@ -import asyncio -import os -from collections import defaultdict -from itertools import islice, repeat -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple - -import msgspec - -import vllm.envs as envs -from vllm.executor.distributed_gpu_executor import ( # yapf: disable - DistributedGPUExecutor, DistributedGPUExecutorAsync) -from vllm.executor.msgspec_utils import encode_hook -from vllm.executor.ray_utils import RayWorkerWrapper, ray -from vllm.logger import init_logger -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest -from vllm.utils import (_run_task_with_lock, get_distributed_init_method, - get_ip, get_open_port, make_async) - -if ray is not None: - from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy - -if TYPE_CHECKING: - from ray.util.placement_group import PlacementGroup - -logger = init_logger(__name__) - - -class RayHPUExecutor(DistributedGPUExecutor): - - uses_ray: bool = True - - def _init_executor(self) -> None: - self.forward_dag: Optional[ray.dag.CompiledDAG] = None - # If the env var is set, it uses the Ray's compiled DAG API - # which optimizes the control plane overhead. - # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. - # Currently, this requires USE_RAY_SPMD_WORKER=True. - self.use_ray_compiled_dag = envs.VLLM_USE_RAY_COMPILED_DAG - # If the env var is set, then we do not distinguish between the - # "driver worker" vs other workers. Also, the rank 0 worker will - # be executed in a remote Ray worker. Currently this requires - # USE_RAY_COMPILED_DAG=True. - self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER - if self.use_ray_compiled_dag: - assert self.use_ray_spmd_worker, ( - "VLLM_USE_RAY_COMPILED_DAG=1 requires " - "VLLM_USE_RAY_SPMD_WORKER=1") - if self.use_ray_spmd_worker: - # TODO: Support SPMD worker for non-DAG Ray executor. - assert self.use_ray_compiled_dag, ( - "VLLM_USE_RAY_SPMD_WORKER=1 requires " - "VLLM_USE_RAY_COMPILED_DAG=1") - - assert self.uses_ray - placement_group = self.parallel_config.placement_group - - # Disable Ray usage stats collection. - ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") - if ray_usage != "1": - os.environ["RAY_USAGE_STATS_ENABLED"] = "0" - - # Create the parallel GPU workers. - self._init_workers_ray(placement_group) - - self.input_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook) - self.output_decoder = msgspec.msgpack.Decoder( - Optional[List[SamplerOutput]]) - - def shutdown(self) -> None: - if hasattr(self, "forward_dag") and self.forward_dag is not None: - self.forward_dag.teardown() - import ray - for worker in self.workers: - ray.kill(worker) - self.forward_dag = None - - def finish_measurements(self): - self._run_workers("finish_measurements") - - def _init_workers_ray(self, placement_group: "PlacementGroup", - **ray_remote_kwargs): - # Otherwise, the ray workers are allocated with a full GPU. - num_gpus = 1 - - # The driver dummy worker does not actually use any resources. - # It holds the resource for the driver worker. - self.driver_dummy_worker: Optional[RayWorkerWrapper] = None - # The remaining workers are the actual ray actors. 
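On the v0 compiled-DAG path the request is still serialized with msgspec's msgpack encoder before entering the DAG. A round-trip sketch; `Request` stands in for `ExecuteModelRequest`, and the `enc_hook`/`dec_hook` that the real code registers for tensor types are omitted:

```python
# Round-trip sketch of the msgpack encode/decode on the v0 DAG path;
# Request is a stand-in for ExecuteModelRequest.
from typing import List, Optional

import msgspec

class Request(msgspec.Struct):
    request_id: str
    token_ids: List[int]

encoder = msgspec.msgpack.Encoder()
decoder = msgspec.msgpack.Decoder(Optional[Request])

wire = encoder.encode(Request(request_id="r0", token_ids=[1, 2, 3]))
print(decoder.decode(wire))  # Request(request_id='r0', token_ids=[1, 2, 3])
```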
- self.workers: List[RayWorkerWrapper] = [] - - # Used in ray compiled DAG: indexed first by PP rank, - # and then TP rank. In other words, the inner list is - # the TP group of workers for a PP rank. - self.pp_tp_workers: List[List[RayWorkerWrapper]] = [] - - logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker) - - # Create the workers. - driver_ip = get_ip() - for bundle_id, bundle in enumerate(placement_group.bundle_specs): - if not bundle.get("HPU", 0): - continue - scheduling_strategy = PlacementGroupSchedulingStrategy( - placement_group=placement_group, - placement_group_capture_child_tasks=True, - placement_group_bundle_index=bundle_id, - ) - - worker = ray.remote( - num_cpus=0, - num_gpus=0, - resources={'HPU': num_gpus}, - scheduling_strategy=scheduling_strategy, - **ray_remote_kwargs, - )(RayWorkerWrapper).remote(vllm_config=self.vllm_config) - - if self.use_ray_spmd_worker: - self.workers.append(worker) - else: - worker_ip = ray.get(worker.get_node_ip.remote()) - if worker_ip == driver_ip and self.driver_dummy_worker is None: - # If the worker is on the same node as the driver, we use it - # as the resource holder for the driver process. - self.driver_dummy_worker = worker - self.driver_worker = RayWorkerWrapper( - vllm_config=self.vllm_config) - else: - # Else, added to the list of workers. - self.workers.append(worker) - - logger.debug("workers: %s", self.workers) - logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker) - if not self.use_ray_spmd_worker and self.driver_dummy_worker is None: - raise ValueError( - "Ray does not allocate any GPUs on the driver node. Consider " - "adjusting the Ray placement group or running the driver on a " - "GPU node.") - - worker_ips = [ - ray.get(worker.get_node_ip.remote()) # type: ignore[attr-defined] - for worker in self.workers - ] - ip_counts: Dict[str, int] = {} - for ip in worker_ips: - ip_counts[ip] = ip_counts.get(ip, 0) + 1 - - def sort_by_driver_then_worker_ip(worker): - """ - Sort the workers based on 3 properties: - 1. If the worker is on the same node as the driver (vllm engine), - it should be placed first. - 2. Then, if the worker is on a node with fewer workers, it should - be placed first. - 3. Finally, if the work is on a node with smaller IP address, it - should be placed first. - """ - ip = ray.get(worker.get_node_ip.remote()) - return (ip != driver_ip, ip_counts[ip], ip) - - # After sorting, the workers on the same node will be - # close to each other, and the workers on the driver - # node will be placed first. - self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip) - - worker_node_and_gpu_ids = [] - for worker in [self.driver_dummy_worker] + self.workers: - if worker is None: - # driver_dummy_worker can be None when using ray spmd worker. - continue - worker_node_and_gpu_ids.append( - ray.get(worker.get_node_and_gpu_ids.remote()) \ - ) # type: ignore - - node_workers = defaultdict(list) # node id -> list of worker ranks - node_gpus = defaultdict(list) # node id -> list of gpu ids - - for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): - node_workers[node_id].append(i) - # `gpu_ids` can be a list of strings or integers. - # convert them to integers for consistency. - # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs), - # string sorting is not sufficient. 
- # see https://github.com/vllm-project/vllm/issues/5590 - gpu_ids = [int(x) for x in gpu_ids] - node_gpus[node_id].extend(gpu_ids) - for node_id, gpu_ids in node_gpus.items(): - node_gpus[node_id] = sorted(gpu_ids) - - all_ips = set(worker_ips + [driver_ip]) - n_ips = len(all_ips) - n_nodes = len(node_workers) - - if n_nodes != n_ips: - raise RuntimeError( - f"Every node should have a unique IP address. Got {n_nodes}" - f" nodes with node ids {list(node_workers.keys())} and " - f"{n_ips} unique IP addresses {all_ips}. Please check your" - " network configuration. If you set `VLLM_HOST_IP` " - "environment variable, make sure it is unique for" - " each node.") - - # Set environment variables for the driver and workers. - all_args_to_update_environment_variables = [({ - "VLLM_TRACE_FUNCTION": - str(envs.VLLM_TRACE_FUNCTION), - }, ) for (node_id, _) in worker_node_and_gpu_ids] - self._run_workers("update_environment_variables", - all_args=all_args_to_update_environment_variables) - - if len(node_gpus) == 1: - # in single node case, we don't need to get the IP address. - # the loopback address is sufficient - # NOTE: a node may have several IP addresses, one for each - # network interface. `get_ip()` might return any of them, - # while they might not work for communication inside the node - # if the network setup is complicated. Using the loopback address - # solves this issue, as it always works for communication inside - # the node. - driver_ip = "127.0.0.1" - distributed_init_method = get_distributed_init_method( - driver_ip, get_open_port()) - - # Initialize the actual workers inside worker wrapper. - init_worker_all_kwargs = [ - self._get_worker_kwargs( - local_rank=node_workers[node_id].index(rank), - rank=rank, - distributed_init_method=distributed_init_method, - ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids) - ] - self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) - - self._run_workers("init_device") - self._run_workers("load_model", - max_concurrent_workers=self.parallel_config. - max_parallel_loading_workers) - - if self.use_ray_spmd_worker: - for pp_rank in range(self.parallel_config.pipeline_parallel_size): - self.pp_tp_workers.append([]) - for tp_rank in range( - self.parallel_config.tensor_parallel_size): - # PP=2, TP=4 - # pp_tp_workers = [[0, 1, 2, 3], [4, 5, 6, 7]] - rank = (pp_rank * self.parallel_config.tensor_parallel_size - ) + tp_rank - assert len(self.pp_tp_workers[pp_rank]) == tp_rank - assert pp_rank < len(self.pp_tp_workers) - self.pp_tp_workers[pp_rank].append(self.workers[rank]) - - # This is the list of workers that are rank 0 of each TP group EXCEPT - # global rank 0. These are the workers that will broadcast to the - # rest of the workers. - self.tp_driver_workers: List[RayWorkerWrapper] = [] - # This is the list of workers that are not drivers and not the first - # worker in a TP group. These are the workers that will be - # broadcasted to. - self.non_driver_workers: List[RayWorkerWrapper] = [] - - # Enforce rank order for correct rank to return final output. - for index, worker in enumerate(self.workers): - # The driver worker is rank 0 and not in self.workers. - rank = index + 1 - if rank % self.parallel_config.tensor_parallel_size == 0: - self.tp_driver_workers.append(worker) - else: - self.non_driver_workers.append(worker) - - def _driver_execute_model( - self, execute_model_req: Optional[ExecuteModelRequest] - ) -> Optional[List[SamplerOutput]]: - """Run execute_model in the driver worker. 
- - Passing None will cause the driver to stop the model execution - loop running in each of the remote workers. - """ - assert not self.use_ray_spmd_worker, ( - "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1") - return self.driver_worker.execute_method("execute_model", - execute_model_req) - - def execute_model( - self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - if not self.use_ray_spmd_worker: - return super().execute_model(execute_model_req) - - if self.forward_dag is None: - self.forward_dag = self._compiled_ray_dag(enable_asyncio=False) - - serialized_data = self.input_encoder.encode(execute_model_req) - outputs = ray.get(self.forward_dag.execute(serialized_data)) - output = self.output_decoder.decode(outputs[0]) - return output - - def _run_workers( - self, - method: str, - *args, - async_run_tensor_parallel_workers_only: bool = False, - all_args: Optional[List[Tuple[Any, ...]]] = None, - all_kwargs: Optional[List[Dict[str, Any]]] = None, - max_concurrent_workers: Optional[int] = None, - **kwargs, - ) -> Any: - """Runs the given method on all workers. Can be used in the following - ways: - - Args: - - async_run_tensor_parallel_workers_only: If True the method will be - run only in the remote TP workers, not the driver worker. - It will also be run asynchronously and return a list of futures - rather than blocking on the results. - - args/kwargs: All workers share the same args/kwargs - - all_args/all_kwargs: args/kwargs for each worker are specified - individually - """ - if self.use_ray_spmd_worker: - assert not async_run_tensor_parallel_workers_only, ( - "async_run_tensor_parallel_workers_only is not supported for " - "spmd mode.") - - if max_concurrent_workers: - raise NotImplementedError( - "max_concurrent_workers is not supported yet.") - - count = len(self.workers) if not \ - async_run_tensor_parallel_workers_only \ - else len(self.non_driver_workers) - # If using SPMD worker, all workers are the same, so we should execute - # the args on all workers. Otherwise, we skip the first worker's args - # because those args will go to the driver worker. - first_worker_args_index: int = 0 if self.use_ray_spmd_worker else 1 - all_worker_args = repeat(args, count) if all_args is None \ - else islice(all_args, first_worker_args_index, None) - all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ - else islice(all_kwargs, first_worker_args_index, None) - - # Start the ray workers first. - ray_workers = self.workers - if async_run_tensor_parallel_workers_only: - ray_workers = self.non_driver_workers - ray_worker_outputs = [ - worker.execute_method.remote(method, *worker_args, **worker_kwargs) - for (worker, worker_args, worker_kwargs - ) in zip(ray_workers, all_worker_args, all_worker_kwargs) - ] - - if async_run_tensor_parallel_workers_only: - # Just return futures - return ray_worker_outputs - - driver_worker_output = [] - # In SPMD mode, the driver worker is the same as any other worker, - # so we only explicitly execute on the driver worker if using a - # non-SPMD worker class. - if not self.use_ray_spmd_worker: - driver_args = args if all_args is None else all_args[0] - driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] - - # Start the driver worker after all the ray workers. - driver_worker_output = [ - self.driver_worker.execute_method(method, *driver_args, - **driver_kwargs) - ] - - # Get the results of the ray workers. 
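For contrast, a sketch of the `all_args`/`all_kwargs` dispatch that this deletion removes: shared args were repeated once per worker, while per-rank argument lists skipped index 0 (the driver's share) in non-SPMD mode:

```python
# Sketch of the removed all_args dispatch; the worker names and
# argument tuples are made up.
from itertools import islice, repeat

workers = ["w1", "w2", "w3"]                      # remote workers only
args = ("shared",)
all_args = [("driver",), ("a",), ("b",), ("c",)]  # one tuple per rank

use_ray_spmd_worker = False
first_index = 0 if use_ray_spmd_worker else 1

shared = list(repeat(args, len(workers)))
per_worker = list(islice(all_args, first_index, None))
print(shared)      # [('shared',), ('shared',), ('shared',)]
print(per_worker)  # [('a',), ('b',), ('c',)]
```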
- if self.workers: - ray_worker_outputs = ray.get(ray_worker_outputs) - - return driver_worker_output + ray_worker_outputs - - def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: - """Wait for futures returned from _run_workers() with - async_run_remote_workers_only to complete.""" - ray.get(parallel_worker_tasks) - - def _check_ray_adag_installation(self): - import pkg_resources - from packaging import version - - required_version = version.parse("2.35") - current_version = version.parse( - pkg_resources.get_distribution("ray").version) - # TODO: update the constraint once we adapt to the backward - # incompatible API change from ray 2.36 - if current_version != required_version: - raise ValueError(f"Ray version {required_version} is " - f"required, but found {current_version}") - - import importlib.util - adag_spec = importlib.util.find_spec( - "ray.experimental.compiled_dag_ref") - if adag_spec is None: - raise ValueError("Ray accelerated DAG is not installed. " - "Run `pip install ray[adag]` to install it.") - - def _compiled_ray_dag(self, enable_asyncio: bool): - assert self.parallel_config.use_ray - self._check_ray_adag_installation() - from ray.dag import InputNode, MultiOutputNode - from ray.experimental.channel.torch_tensor_type import TorchTensorType - - with InputNode() as input_data: - # Example DAG: PP=2, TP=4 - # (ExecuteModelReq, None) -> 0 -> (ExecuteModelReq, IntermediateOutput) -> 4 -> SamplerOutput # noqa: E501 - # -> 1 -> (ExecuteModelReq, IntermediateOutput) -> 5 -> SamplerOutput # noqa: E501 - # -> 2 -> (ExecuteModelReq, IntermediateOutput) -> 6 -> SamplerOutput # noqa: E501 - # -> 3 -> (ExecuteModelReq, IntermediateOutput) -> 7 -> SamplerOutput # noqa: E501 - - # All workers in the first TP group will take in the - # ExecuteModelRequest as input. - outputs = [input_data for _ in self.pp_tp_workers[0]] - for pp_rank, tp_group in enumerate(self.pp_tp_workers): - # Each PP worker takes in the output of the previous PP worker, - # and the TP group executes in SPMD fashion. - outputs = [ - worker.execute_model_spmd. - bind( # type: ignore[attr-defined] - outputs[i]) for i, worker in enumerate(tp_group) - ] - - last_pp_rank = len(self.pp_tp_workers) - 1 - if pp_rank < last_pp_rank: - # Specify how intermediate tensors should be passed - # between pp stages, no need to specify for the last - # pp stage. 
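The deleted `_check_ray_adag_installation` pins Ray to an exact version before compiling the DAG. An equivalent sketch using `importlib.metadata` rather than the deprecated `pkg_resources`:

```python
# Version-pin sketch equivalent to _check_ray_adag_installation, using
# importlib.metadata instead of pkg_resources.
import importlib.metadata
import importlib.util

from packaging import version

required = version.parse("2.35")
current = version.parse(importlib.metadata.version("ray"))
if current != required:
    raise ValueError(f"Ray version {required} is required, "
                     f"but found {current}")

if importlib.util.find_spec("ray.experimental.compiled_dag_ref") is None:
    raise ValueError("Ray accelerated DAG is not installed. "
                     "Run `pip install ray[adag]` to install it.")
```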
- transport = "auto" - outputs = [ - output.with_type_hint( - TorchTensorType(transport=transport)) - for output in outputs - ] - - forward_dag = MultiOutputNode(outputs) - - return forward_dag.experimental_compile(enable_asyncio=enable_asyncio) - - def __del__(self): - self.shutdown() - - -class RayHPUExecutorAsync(RayHPUExecutor, DistributedGPUExecutorAsync): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.pp_locks: Optional[List[asyncio.Lock]] = None - self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER - if not self.use_ray_compiled_dag: - self.driver_exec_method = make_async( - self.driver_worker.execute_method) - - async def execute_model_async( - self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - if not self.use_ray_spmd_worker: - return await super().execute_model_async(execute_model_req) - - if self.forward_dag is None: - self.forward_dag = self._compiled_ray_dag(enable_asyncio=True) - - serialized_data = self.input_encoder.encode(execute_model_req) - dag_future = await self.forward_dag.execute_async(serialized_data) - outputs = await dag_future - return self.output_decoder.decode(outputs[0]) - - async def _driver_execute_model_async( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: - assert not self.use_ray_spmd_worker, ( - "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1") - if not self.tp_driver_workers: - return await self.driver_exec_method("execute_model", - execute_model_req) - if self.pp_locks is None: - # This locks each pipeline parallel stage so multiple virtual - # engines can't execute on the same stage at the same time - # We create the locks here to avoid creating them in the constructor - # which uses a different asyncio loop. - self.pp_locks = [ - asyncio.Lock() - for _ in range(self.parallel_config.pipeline_parallel_size) - ] - - tasks = [ - asyncio.create_task( - _run_task_with_lock(self.driver_exec_method, self.pp_locks[0], - "execute_model", execute_model_req)) - ] - for pp_rank, driver_worker in enumerate(self.tp_driver_workers, - start=1): - tasks.append( - asyncio.create_task( - _run_task_with_lock(driver_worker.execute_method.remote, - self.pp_locks[pp_rank], - "execute_model", execute_model_req))) - - results = await asyncio.gather(*tasks) - - # Only the last PP stage has the final results. 
- return results[-1] - - async def _start_worker_execution_loop(self): - assert not self.use_ray_spmd_worker, ( - "worker loop is disabled for VLLM_USE_RAY_SPMD_WORKER=1") - coros = [ - worker.execute_method.remote("start_worker_execution_loop") - for worker in self.non_driver_workers - ] - return await asyncio.gather(*coros) - - def __del__(self): - self.shutdown() diff --git a/vllm/executor/ray_tpu_executor.py b/vllm/executor/ray_tpu_executor.py deleted file mode 100644 index 5118c13934f0d..0000000000000 --- a/vllm/executor/ray_tpu_executor.py +++ /dev/null @@ -1,343 +0,0 @@ -import asyncio -import os -from collections import defaultdict -from itertools import islice, repeat -from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Tuple, - Union) - -import vllm.envs as envs -from vllm.executor.executor_base import ExecutorAsyncBase -from vllm.executor.ray_utils import RayWorkerWrapper, ray -from vllm.executor.tpu_executor import TPUExecutor -from vllm.logger import init_logger -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest -from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, - make_async) - -if ray is not None: - from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy - -if TYPE_CHECKING: - from ray.util.placement_group import PlacementGroup - -logger = init_logger(__name__) - - -class RayTPUExecutor(TPUExecutor): - - uses_ray: bool = True - - def __init__(self, *args, **kwargs): - # This is non-None when the execute model loop is running - # in the parallel workers. It's a coroutine in the AsyncLLMEngine case. - self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None - # Updated by implementations that require additional args to be passed - # to the _run_workers execute_model call - self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {} - - super().__init__(*args, **kwargs) - - def _init_executor(self) -> None: - assert self.parallel_config.distributed_executor_backend == "ray" - placement_group = self.parallel_config.placement_group - - # Disable Ray usage stats collection. - ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") - if ray_usage != "1": - os.environ["RAY_USAGE_STATS_ENABLED"] = "0" - - # Create the parallel TPU workers. - self._init_workers_ray(placement_group) - - def _init_workers_ray(self, placement_group: "PlacementGroup", - **ray_remote_kwargs): - # The driver dummy worker does not actually use any resources. - # It holds the resource for the driver worker. - self.driver_dummy_worker: Optional[RayWorkerWrapper] = None - # The remaining workers are the actual ray actors. - self.workers: List[RayWorkerWrapper] = [] - - # Create the workers. - driver_ip = get_ip() - for bundle_id, bundle in enumerate(placement_group.bundle_specs): - if not bundle.get("TPU", 0): - continue - scheduling_strategy = PlacementGroupSchedulingStrategy( - placement_group=placement_group, - placement_group_capture_child_tasks=True, - placement_group_bundle_index=bundle_id, - ) - - # GKE does not fetch environment information from metadata server - # and instead sets these from within the Ray process. Therefore we - # need to override the Ray environment variables manually. 
- override_env = {} - if "TPU_CHIPS_PER_HOST_BOUNDS" in os.environ: - override_env.update({ - "TPU_CHIPS_PER_HOST_BOUNDS": - os.environ["TPU_CHIPS_PER_HOST_BOUNDS"] - }) - if "TPU_HOST_BOUNDS" in os.environ: - override_env.update( - {"TPU_HOST_BOUNDS": os.environ["TPU_HOST_BOUNDS"]}) - - worker = ray.remote( - num_cpus=0, - resources={"TPU": 1}, - scheduling_strategy=scheduling_strategy, - **ray_remote_kwargs, - )(RayWorkerWrapper).remote(vllm_config=self.vllm_config) - if override_env: - worker.override_env_vars.remote(override_env) - - worker_ip = ray.get(worker.get_node_ip.remote()) - if worker_ip == driver_ip and self.driver_dummy_worker is None: - # If the worker is on the same node as the driver, we use it - # as the resource holder for the driver process. - self.driver_dummy_worker = worker - self.driver_worker = RayWorkerWrapper( - vllm_config=self.vllm_config) - else: - # Else, added to the list of workers. - self.workers.append(worker) - - logger.debug("workers: %s", self.workers) - logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker) - if self.driver_dummy_worker is None: - raise ValueError( - "Ray does not allocate any TPUs on the driver node. Consider " - "adjusting the Ray placement group or running the driver on a " - "TPU node.") - - worker_ips = [ - ray.get(worker.get_node_ip.remote()) # type: ignore[attr-defined] - for worker in self.workers - ] - ip_counts: Dict[str, int] = {} - for ip in worker_ips: - ip_counts[ip] = ip_counts.get(ip, 0) + 1 - - def sort_by_driver_then_worker_ip(worker): - """ - Sort the workers based on 3 properties: - 1. If the worker is on the same node as the driver (vllm engine), - it should be placed first. - 2. Then, if the worker is on a node with fewer workers, it should - be placed first. - 3. Finally, if the work is on a node with smaller IP address, it - should be placed first. - """ - ip = ray.get(worker.get_node_ip.remote()) - return (ip != driver_ip, ip_counts[ip], ip) - - # After sorting, the workers on the same node will be - # close to each other, and the workers on the driver - # node will be placed first. - self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip) - - # Get the set of TPU IDs used on each node. - worker_node_and_gpu_ids = [] - for worker in [self.driver_dummy_worker] + self.workers: - if worker is None: - # driver_dummy_worker can be None when using ray spmd worker. - continue - worker_node_and_gpu_ids.append( - ray.get(worker.get_node_and_gpu_ids.remote()) \ - ) # type: ignore - - node_workers = defaultdict(list) - for i, (node_id, _) in enumerate(worker_node_and_gpu_ids): - node_workers[node_id].append(i) - - # Set environment variables for the driver and workers. - all_args_to_update_environment_variables = [({ - "VLLM_TRACE_FUNCTION": - str(envs.VLLM_TRACE_FUNCTION), - }, ) for _ in worker_node_and_gpu_ids] - self._run_workers("update_environment_variables", - all_args=all_args_to_update_environment_variables) - - if len(node_workers) == 1: - # in single node case, we don't need to get the IP address. - # the loopback address is sufficient - # NOTE: a node may have several IP addresses, one for each - # network interface. `get_ip()` might return any of them, - # while they might not work for communication inside the node - # if the network setup is complicated. Using the loopback address - # solves this issue, as it always works for communication inside - # the node. 
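Since GKE exposes the TPU topology only through the launching process's environment, the executor forwards those variables to each actor explicitly. A sketch of building that override dict; the sample value is made up:

```python
# Sketch of collecting the TPU topology variables forwarded to Ray
# actors on GKE; the sample value is illustrative.
import os

TPU_TOPOLOGY_VARS = ("TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS")

def collect_override_env() -> dict:
    return {
        name: os.environ[name]
        for name in TPU_TOPOLOGY_VARS if name in os.environ
    }

os.environ["TPU_HOST_BOUNDS"] = "1,1,1"   # illustrative value
print(collect_override_env())             # {'TPU_HOST_BOUNDS': '1,1,1'}
# The executor then applies it remotely:
#   worker.override_env_vars.remote(override_env)
```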
- driver_ip = "127.0.0.1" - distributed_init_method = get_distributed_init_method( - driver_ip, get_open_port()) - - # Initialize the actual workers inside worker wrapper. - init_worker_all_kwargs = [ - self._get_worker_kwargs( - local_rank=node_workers[node_id].index(rank), - rank=rank, - distributed_init_method=distributed_init_method, - ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids) - ] - self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) - - self._run_workers("init_device") - self._run_workers("load_model", - max_concurrent_workers=self.parallel_config. - max_parallel_loading_workers) - - def _driver_execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: - """Run execute_model in the driver worker. - - Passing None will cause the driver to stop the model execution - loop running in each of the remote workers. - """ - return self.driver_worker.execute_method("execute_model", - execute_model_req) - - def _run_workers( - self, - method: str, - *args, - async_run_remote_workers_only: bool = False, - all_args: Optional[List[Tuple[Any, ...]]] = None, - all_kwargs: Optional[List[Dict[str, Any]]] = None, - max_concurrent_workers: Optional[int] = None, - use_ray_compiled_dag: bool = False, - **kwargs, - ) -> Any: - """Runs the given method on all workers. Can be used in the following - ways: - - - async_run_remote_workers_only: If True the method will be run only - in the remote workers, not the driver worker. It will also be - run asynchronously and return a list of futures rather than blocking - on the results. - - args/kwargs: All workers share the same args/kwargs - - all_args/all_kwargs: args/kwargs for each worker are specified - individually - """ - - if max_concurrent_workers: - raise NotImplementedError( - "max_concurrent_workers is not supported yet.") - - count = len(self.workers) - all_worker_args = repeat(args, count) if all_args is None \ - else islice(all_args, 1, None) - all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ - else islice(all_kwargs, 1, None) - - # Start the ray workers first. - ray_worker_outputs = [ - worker.execute_method.remote(method, *worker_args, **worker_kwargs) - for (worker, worker_args, worker_kwargs - ) in zip(self.workers, all_worker_args, all_worker_kwargs) - ] - - if async_run_remote_workers_only: - # Just return futures - return ray_worker_outputs - - driver_args = args if all_args is None else all_args[0] - driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] - - # Start the driver worker after all the ray workers. - driver_worker_output = self.driver_worker.execute_method( - method, *driver_args, **driver_kwargs) - # Get the results of the ray workers. 
- if self.workers: - ray_worker_outputs = ray.get(ray_worker_outputs) - - return [driver_worker_output] + ray_worker_outputs - - def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: - """Wait for futures returned from _run_workers() with - async_run_remote_workers_only to complete.""" - ray.get(parallel_worker_tasks) - - def determine_num_available_blocks(self) -> Tuple[int, int]: - num_blocks = self._run_workers("determine_num_available_blocks", ) - num_tpu_blocks = min(b[0] for b in num_blocks) - num_cpu_blocks = min(b[1] for b in num_blocks) - return num_tpu_blocks, num_cpu_blocks - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, - num_cpu_blocks) - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - self._run_workers("initialize_cache", - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=num_cpu_blocks) - - def execute_model( - self, - execute_model_req: ExecuteModelRequest, - ) -> List[SamplerOutput]: - if self.parallel_worker_tasks is None: - self.parallel_worker_tasks = self._run_workers( - "start_worker_execution_loop", - async_run_remote_workers_only=True, - **self.extra_execute_model_run_workers_kwargs) - - # Only the driver worker returns the sampling results. - return self._driver_execute_model(execute_model_req) - - def stop_remote_worker_execution_loop(self) -> None: - if self.parallel_worker_tasks is None: - return - - self._driver_execute_model() - parallel_worker_tasks = self.parallel_worker_tasks - self.parallel_worker_tasks = None - # Ensure that workers exit model loop cleanly - # (this will raise otherwise) - self._wait_for_tasks_completion(parallel_worker_tasks) - - -class RayTPUExecutorAsync(RayTPUExecutor, ExecutorAsyncBase): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.driver_exec_method = make_async(self.driver_worker.execute_method) - - async def execute_model_async( - self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - if self.parallel_worker_tasks is None: - # Start model execution loop running in the parallel workers - self.parallel_worker_tasks = asyncio.create_task( - self._start_worker_execution_loop()) - - # Only the driver worker returns the sampling results. 
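Each worker profiles its own free memory, and the executor takes the elementwise minimum so every rank agrees on a single KV-cache size. A sketch with made-up per-worker numbers:

```python
# Elementwise-minimum sketch matching determine_num_available_blocks;
# the per-worker block counts are made up.
per_worker_blocks = [(1024, 512), (1000, 512), (1008, 500)]

num_device_blocks = min(b[0] for b in per_worker_blocks)
num_cpu_blocks = min(b[1] for b in per_worker_blocks)
print(num_device_blocks, num_cpu_blocks)  # 1000 500
```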
-        return await self._driver_execute_model_async(execute_model_req)
-
-    async def stop_remote_worker_execution_loop_async(self) -> None:
-        if self.parallel_worker_tasks is None:
-            return
-
-        await self._driver_execute_model_async()
-        parallel_worker_tasks = self.parallel_worker_tasks
-        self.parallel_worker_tasks = None
-        # Ensure that workers exit model loop cleanly
-        # (this will raise otherwise)
-        await parallel_worker_tasks
-
-    async def _driver_execute_model_async(
-        self,
-        execute_model_req: Optional[ExecuteModelRequest] = None
-    ) -> List[SamplerOutput]:
-        return await self.driver_exec_method("execute_model",
-                                             execute_model_req)
-
-    async def _start_worker_execution_loop(self):
-        coros = [
-            worker.execute_method.remote("start_worker_execution_loop")
-            for worker in self.workers
-        ]
-        return await asyncio.gather(*coros)
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index 8d766bad1a072..e55155ea06225 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -1,17 +1,22 @@
 import os
 import time
 from collections import defaultdict
-from typing import Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union

 import msgspec

 from vllm.config import ParallelConfig
 from vllm.executor.msgspec_utils import decode_hook, encode_hook
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.sequence import ExecuteModelRequest, IntermediateTensors
 from vllm.utils import get_ip
 from vllm.worker.worker_base import WorkerWrapperBase

+if TYPE_CHECKING:
+    from vllm.v1.core.scheduler import SchedulerOutput
+    from vllm.v1.outputs import ModelRunnerOutput
+
 logger = init_logger(__name__)

 PG_WAIT_TIMEOUT = 1800
@@ -47,7 +52,12 @@ try:

         def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]:
             node_id = ray.get_runtime_context().get_node_id()
-            gpu_ids = ray.get_gpu_ids()
+            device_key = current_platform.ray_device_key
+            if not device_key:
+                raise RuntimeError("current platform %s does not support "
+                                   "ray." % current_platform.device_name)
+            gpu_ids = ray.get_runtime_context().get_accelerator_ids(
+            )[device_key]
             return node_id, gpu_ids

         def execute_model_spmd(
@@ -89,6 +99,26 @@ try:

             return output

+        def setup_device_if_necessary(self):
+            # TODO(swang): This is needed right now because Ray CG executes
+            # on a background thread, so we need to reset torch's current
+            # device.
+            # We can remove this API after it is fixed in compiled graph.
+            import torch
+            assert self.worker is not None, "Worker is not initialized"
+            if not self.compiled_dag_cuda_device_set:
+                torch.cuda.set_device(self.worker.device)
+                self.compiled_dag_cuda_device_set = True
+
+        def execute_model(
+            self,
+            scheduler_output: "SchedulerOutput",
+        ) -> "ModelRunnerOutput":
+            self.setup_device_if_necessary()
+            assert self.worker is not None, "Worker is not initialized"
+            output = self.worker.model_runner.execute_model(scheduler_output)
+            return output
+
         def override_env_vars(self, vars: Dict[str, str]):
             os.environ.update(vars)

@@ -249,11 +279,12 @@ def initialize_ray_cluster(
         # Placement group is already set.
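The platform-keyed lookup above replaces the GPU-only `ray.get_gpu_ids()`. A sketch of the new call, assuming a recent Ray (2.5+) where `get_accelerator_ids()` returns a dict keyed by device type; `"GPU"` stands in for `current_platform.ray_device_key`:

```python
# Sketch of the platform-keyed accelerator lookup, assuming Ray is
# installed; "GPU" stands in for current_platform.ray_device_key.
import ray

ray.init(ignore_reinit_error=True)

@ray.remote(num_gpus=1)
def report_ids():
    ctx = ray.get_runtime_context()
    # get_accelerator_ids() returns e.g. {"GPU": ["0"], "TPU": [], ...}
    return ctx.get_node_id(), ctx.get_accelerator_ids()["GPU"]

# print(ray.get(report_ids.remote()))  # needs a GPU node to schedule
```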
return - device_str = "GPU" - if current_platform.is_tpu(): - device_str = "TPU" - elif current_platform.is_hpu(): - device_str = 'HPU' + device_str = current_platform.ray_device_key + if not device_str: + raise ValueError( + f"current platform {current_platform.device_name} does not " + "support ray.") + # Create placement group for worker processes current_placement_group = ray.util.get_current_placement_group() if current_placement_group: diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py deleted file mode 100644 index d2086f5fef26c..0000000000000 --- a/vllm/executor/ray_xpu_executor.py +++ /dev/null @@ -1,40 +0,0 @@ -import asyncio -from typing import List, Optional - -import ray - -import vllm.envs as envs -from vllm.executor.ray_gpu_executor import RayGPUExecutor, RayGPUExecutorAsync -from vllm.executor.xpu_executor import XPUExecutor -from vllm.logger import init_logger -from vllm.utils import make_async - -logger = init_logger(__name__) - - -class RayXPUExecutor(RayGPUExecutor, XPUExecutor): - - def _get_env_vars_to_be_updated(self): - # Get the set of GPU IDs used on each node. - worker_node_and_gpu_ids = [] - for worker in [self.driver_dummy_worker] + self.workers: - if worker is None: - # driver_dummy_worker can be None when using ray spmd worker. - continue - worker_node_and_gpu_ids.append( - ray.get(worker.get_node_and_gpu_ids.remote())) # type: ignore - - # Set environment variables for the driver and workers. - all_args_to_update_environment_variables = [({ - "VLLM_TRACE_FUNCTION": - str(envs.VLLM_TRACE_FUNCTION), - }, ) for (_, _) in worker_node_and_gpu_ids] - return all_args_to_update_environment_variables - - -class RayXPUExecutorAsync(RayXPUExecutor, RayGPUExecutorAsync): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.driver_exec_method = make_async(self.driver_worker.execute_method) - self.pp_locks: Optional[List[asyncio.Lock]] = None diff --git a/vllm/executor/tpu_executor.py b/vllm/executor/tpu_executor.py deleted file mode 100644 index e37e8973790db..0000000000000 --- a/vllm/executor/tpu_executor.py +++ /dev/null @@ -1,142 +0,0 @@ -from typing import Any, Dict, List, Optional, Set, Tuple - -import torch - -from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest -from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, - make_async) - -logger = init_logger(__name__) - - -class TPUExecutor(ExecutorBase): - - uses_ray: bool = False - - def _init_executor(self) -> None: - assert not self.scheduler_config.chunked_prefill_enabled, ( - "Chunked prefill is not yet supported for TPU backend") - assert not self.speculative_config, ( - "Speculative decoding is not yet supported for TPU backend") - if self.model_config.dtype in (torch.float16, torch.float32): - logger.warning( - "The TPU backend currently does not support %s. " - "Using bfloat16 instead.", self.model_config.dtype) - self.model_config.dtype = torch.bfloat16 - - # Instantiate the worker and load the model to the device. 
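The deleted TPU executor coerced unsupported dtypes to bfloat16 at init time; a sketch of that fallback:

```python
# Sketch of the TPU dtype coercion the deleted executor applied.
import torch

def coerce_tpu_dtype(dtype: torch.dtype) -> torch.dtype:
    if dtype in (torch.float16, torch.float32):
        # The TPU backend does not support these; fall back to bfloat16.
        return torch.bfloat16
    return dtype

print(coerce_tpu_dtype(torch.float16))  # torch.bfloat16
```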
- self.driver_worker = self._create_worker() - self.driver_worker.init_device() - self.driver_worker.load_model() - - def _get_worker_kwargs( - self, - local_rank: int = 0, - rank: int = 0, - distributed_init_method: Optional[str] = None, - ) -> Dict[str, Any]: - """Return worker init args for a given rank.""" - if distributed_init_method is None: - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - return dict( - vllm_config=self.vllm_config, - local_rank=local_rank, - rank=rank, - distributed_init_method=distributed_init_method, - is_driver_worker=rank == 0, - ) - - def _create_worker( - self, - local_rank: int = 0, - rank: int = 0, - distributed_init_method: Optional[str] = None, - ): - if self.scheduler_config.is_multi_step: - from vllm.worker.multi_step_tpu_worker import MultiStepTPUWorker - worker = MultiStepTPUWorker(**self._get_worker_kwargs( - local_rank, rank, distributed_init_method)) - return worker - else: - from vllm.worker.tpu_worker import TPUWorker - - worker = TPUWorker(**self._get_worker_kwargs( - local_rank, rank, distributed_init_method)) - return worker - - def initialize_cache( - self, - num_gpu_blocks: int, - num_cpu_blocks: int, - ) -> None: - """Initialize the KV cache by invoking the underlying worker.""" - # NOTE: This is logged in the executor because there can be >1 worker - # with other executors. We could log in the engine level, but work - # remains to abstract away the device for non-GPU configurations. - logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, - num_cpu_blocks) - self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Determine the number of available KV blocks by invoking the - underlying worker.""" - return self.driver_worker.determine_num_available_blocks() - - def execute_model( - self, - execute_model_req: ExecuteModelRequest, - ) -> List[SamplerOutput]: - output = self.driver_worker.execute_model(execute_model_req) - return output - - def add_lora(self, lora_request: LoRARequest) -> bool: - raise NotImplementedError( - "LoRA is currently not supported by the TPU backend.") - - def remove_lora(self, lora_id: int) -> bool: - raise NotImplementedError( - "LoRA is currently not supported by the TPU backend.") - - def pin_lora(self, lora_id: int) -> bool: - raise NotImplementedError( - "LoRA is currently not supported by the TPU backend.") - - def list_loras(self) -> Set[int]: - raise NotImplementedError( - "LoRA is currently not supported by the TPU backend.") - - def add_prompt_adapter(self, prompt_adapter_request) -> bool: - raise NotImplementedError( - "Soft prompt is currently not supported by the TPU backend.") - - def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError( - "Soft prompt is currently not supported by the TPU backend.") - - def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError( - "Soft prompt is currently not supported by the TPU backend.") - - def list_prompt_adapters(self) -> Set[int]: - raise NotImplementedError( - "Soft prompt is currently not supported by the TPU backend.") - - def check_health(self) -> None: - # TPUExecutor will always be healthy as long as it's running. 
- return - - -class TPUExecutorAsync(TPUExecutor, ExecutorAsyncBase): - - async def execute_model_async( - self, - sexecute_model_req: ExecuteModelRequest, - ) -> SamplerOutput: - output = await make_async(self.driver_worker.execute_model - )(sexecute_model_req) - return output diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py new file mode 100644 index 0000000000000..a5c4dcf0ec7f9 --- /dev/null +++ b/vllm/executor/uniproc_executor.py @@ -0,0 +1,132 @@ +import os +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import torch +import torch.distributed as dist + +import vllm.envs as envs +from vllm.executor.executor_base import ExecutorBase +from vllm.logger import init_logger +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + run_method) +from vllm.worker.worker_base import WorkerWrapperBase + +logger = init_logger(__name__) + + +class UniProcExecutor(ExecutorBase): + + uses_ray: bool = False + + def _init_executor(self) -> None: + """Initialize the worker and load the model. + """ + self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config, + rpc_rank=0) + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + local_rank = 0 + rank = 0 + kwargs = dict( + vllm_config=self.vllm_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, + is_driver_worker=(not self.parallel_config) + or (rank % self.parallel_config.tensor_parallel_size == 0), + ) + self.collective_rpc("init_worker", args=([kwargs], )) + self.collective_rpc("init_device") + self.collective_rpc("load_model") + + def collective_rpc(self, + method: Union[str, Callable], + timeout: Optional[float] = None, + args: Tuple = (), + kwargs: Optional[Dict] = None) -> List[Any]: + if kwargs is None: + kwargs = {} + answer = run_method(self.driver_worker, method, args, kwargs) + return [answer] + + def check_health(self) -> None: + # UniProcExecutor will always be healthy as long as + # it's running. + return + + +UniProcExecutorAsync = UniProcExecutor + + +class ExecutorWithExternalLauncher(UniProcExecutor): + """An executor that uses external launchers to launch engines, + specially designed for torchrun-compatible launchers, for + offline inference with tensor parallelism. + + see https://github.com/vllm-project/vllm/issues/11400 for + the motivation, and examples/offline_inference/torchrun_example.py + for the usage example. + + The key idea: although it is tensor-parallel inference, we only + create one worker per executor, users will launch multiple + engines with torchrun-compatible launchers, and all these engines + work together to process the same prompts. When scheduling is + deterministic, all the engines will generate the same outputs, + and they don't need to synchronize the states with each other. + """ + uses_ray: bool = False + + def _init_executor(self) -> None: + """Initialize the worker and load the model. 
+ """ + assert self.vllm_config.parallel_config.pipeline_parallel_size == 1, \ + ("ExecutorWithExternalLauncher does not " + "support pipeline parallelism.") + assert self.vllm_config.scheduler_config.delay_factor == 0.0, \ + ("ExecutorWithExternalLauncher needs deterministic " + "execution, so it" + "does not support delay_factor in scheduling") + assert not envs.VLLM_USE_V1, \ + ("V1 architecture cannot guarantee deterministic execution, " + "so it is not supported in ExecutorWithExternalLauncher.") + self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config, + rpc_rank=0) + # engines are launched in torchrun-compatible launchers + # so we can use the env:// method. + # required env vars: + # - RANK + # - MASTER_ADDR + # - MASTER_PORT + distributed_init_method = "env://" + rank = int(os.environ["RANK"]) + local_rank = rank + is_driver_worker = True + kwargs = dict( + vllm_config=self.vllm_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, + is_driver_worker=is_driver_worker, + ) + self.collective_rpc("init_worker", args=([kwargs], )) + self.collective_rpc("init_device") + self.collective_rpc("load_model") + + def determine_num_available_blocks(self) -> Tuple[int, int]: + """ + Determine the number of available KV blocks. + Add an additional all_reduce to get the min across all ranks. + Note that even if we have the same `gpu_memory_utilization` and + `swap_space`, the available memory in every rank might still + differ because NCCL can take different amounts of memory in + different ranks. Therefore, it is necessary to test if all ranks + agree on the same KV cache configuration. + """ + a, b = super().determine_num_available_blocks() + from vllm.distributed.parallel_state import get_world_group + cpu_group = get_world_group().cpu_group + a_tensor = torch.tensor([a], device="cpu", dtype=torch.int64) + b_tensor = torch.tensor([b], device="cpu", dtype=torch.int64) + dist.all_reduce(a_tensor, group=cpu_group, op=dist.ReduceOp.MIN) + dist.all_reduce(b_tensor, group=cpu_group, op=dist.ReduceOp.MIN) + return a_tensor.item(), b_tensor.item() diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py deleted file mode 100644 index 722b86a95ff8a..0000000000000 --- a/vllm/executor/xpu_executor.py +++ /dev/null @@ -1,39 +0,0 @@ -from typing import List, Optional, Union - -from vllm.executor.executor_base import ExecutorAsyncBase -from vllm.executor.gpu_executor import GPUExecutor -from vllm.logger import init_logger -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest, PoolerOutput -from vllm.utils import make_async - -logger = init_logger(__name__) - - -class XPUExecutor(GPUExecutor): - - uses_ray: bool = False - - def _init_executor(self) -> None: - assert self.device_config.device_type == "xpu" - assert self.speculative_config is None, ( - "Speculative decoding not yet supported for XPU backend") - - GPUExecutor._init_executor(self) - - def execute_model( - self, execute_model_req: ExecuteModelRequest - ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]: - output = self.driver_worker.execute_model(execute_model_req) - return output - - -class XPUExecutorAsync(XPUExecutor, ExecutorAsyncBase): - - async def execute_model_async( - self, - execute_model_req: ExecuteModelRequest, - ) -> List[SamplerOutput]: - output = await make_async(self.driver_worker.execute_model - )(execute_model_req=execute_model_req) - return output diff --git a/vllm/forward_context.py 
b/vllm/forward_context.py index 7f56575279e9b..828b394ec5d21 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -2,7 +2,7 @@ import time from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass -from typing import Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional import torch @@ -10,6 +10,9 @@ import vllm.envs as envs from vllm.config import VllmConfig from vllm.logger import init_logger +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionMetadata + logger = init_logger(__name__) track_batchsize: bool = envs.VLLM_LOG_BATCHSIZE_INTERVAL >= 0 @@ -21,9 +24,12 @@ batchsize_forward_time: defaultdict = defaultdict(list) @dataclass class ForwardContext: - static_forward_context: Dict[str, Any] + # copy from vllm_config.compilation_config.static_forward_context + attn_layers: Dict[str, Any] # TODO: extend to support per-layer dynamic forward context - dynamic_forward_context: Any + attn_metadata: "AttentionMetadata" # set dynamically for each forward pass + # TODO: remove after making all virtual_engines share the same kv cache + virtual_engine: int # set dynamically for each forward pass _forward_context: Optional[ForwardContext] = None @@ -38,34 +44,35 @@ def get_forward_context() -> ForwardContext: @contextmanager -def set_forward_context(context: Any, vllm_config: VllmConfig): +def set_forward_context(attn_metadata: Any, + vllm_config: VllmConfig, + virtual_engine: int = 0): """A context manager that stores the current forward context, can be attention metadata, etc. Here we can inject common logic for every model forward pass. """ global forward_start_time - need_to_track_batchsize = track_batchsize and context is not None + need_to_track_batchsize = track_batchsize and attn_metadata is not None if need_to_track_batchsize: forward_start_time = time.perf_counter() global _forward_context prev_context = _forward_context _forward_context = ForwardContext( - static_forward_context=vllm_config.compilation_config. - static_forward_context, - dynamic_forward_context=context) + attn_layers=vllm_config.compilation_config.static_forward_context, + virtual_engine=virtual_engine, + attn_metadata=attn_metadata) try: yield finally: - global batchsize_counter global last_logging_time, batchsize_logging_interval if need_to_track_batchsize: - if hasattr(context, "num_prefill_tokens"): + if hasattr(attn_metadata, "num_prefill_tokens"): # for v0 attention backends - batchsize = context.num_prefill_tokens + \ - context.num_decode_tokens + batchsize = attn_metadata.num_prefill_tokens + \ + attn_metadata.num_decode_tokens else: # for v1 attention backends - batchsize = context.num_input_tokens + batchsize = attn_metadata.num_input_tokens # we use synchronous scheduling right now, # adding a sync point here should not affect # scheduling of the next batch diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index aaeecab7ffde1..a0dd89f69bacd 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -11,9 +11,6 @@ INPUT_REGISTRY = InputRegistry() """ The global :class:`~InputRegistry` which is used by :class:`~vllm.LLMEngine` to dispatch data processing according to the target model. 
- -See also: - :ref:`input-processing-pipeline` """ __all__ = [ diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index cdaf6dd76eaa1..b8163a7acde1d 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -44,13 +44,13 @@ class TokensPrompt(TypedDict): multi_modal_data: NotRequired["MultiModalDataDict"] """ - DEPRECATED: Optional multi-modal data to pass to the model, + Optional multi-modal data to pass to the model, if the model supports it. """ mm_processor_kwargs: NotRequired[Dict[str, Any]] """ - DEPRECATED: Optional multi-modal processor kwargs to be forwarded to the + Optional multi-modal processor kwargs to be forwarded to the multimodal input mapper & processor. Note that if multiple modalities have registered mappers etc for the model being considered, we attempt to pass the mm_processor_kwargs to each of them. diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index a738ffe18e3ae..0890883cc984f 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -279,10 +279,6 @@ class InputPreprocessor: mm_processor = self.mm_registry.create_processor( self.model_config, tokenizer) - if isinstance(prompt, list): - logger.warning("Passing `multi_modal_data` in TokensPrompt is" - "deprecated and will be removed in a future update") - prompt = tokenizer.decode(prompt) if mm_processor_kwargs is None: mm_processor_kwargs = {} diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index aad0dfab94a01..4b73ade7af5f0 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -313,9 +313,6 @@ class InputRegistry: The model is identified by ``model_config``. - See also: - :ref:`enabling-multimodal-inputs` - Note: This should be called after :meth:`~MultiModalRegistry.init_mm_limits_per_prompt`. @@ -384,10 +381,8 @@ class InputRegistry: Register an input processor to a model class. The provided function is invoked on each input to the model. This - happens before :meth:`~vllm.multimodal.MultiModalRegistry.map_input`. - - See also: - :ref:`input-processing-pipeline` + happens before + :meth:`~vllm.multimodal.registry.MultiModalRegistry.map_input`. """ def wrapper(model_cls: N) -> N: @@ -429,9 +424,6 @@ class InputRegistry: Apply an input processor to an instance of model inputs. The model is identified by ``model_config``. 
- - See also: - :ref:`input-processing-pipeline` """ # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index a933ccaecf15e..e6f26d2b74b2f 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -51,6 +51,9 @@ def _get_lora_device(base_layer: nn.Module) -> torch.device: # marlin elif hasattr(base_layer, "B"): return base_layer.B.device + # HQQ marlin + elif hasattr(base_layer, "W_q"): + return base_layer.W_q.device else: raise ValueError(f"Unsupported base layer: {base_layer}") @@ -937,8 +940,8 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA): return self.base_layer.soft_cap @property - def use_gather(self): - return self.base_layer.use_gather + def use_all_gather(self): + return self.base_layer.use_all_gather @property def org_vocab_size(self): diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 5b7225bdc8f37..9809405ca9a61 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -1,5 +1,4 @@ import copy -import json import math import os import re @@ -180,8 +179,8 @@ class LoRAModel(AdapterModel): cls, lora_dir: str, expected_lora_modules: List[str], + peft_helper: PEFTHelper, *, - max_position_embeddings: Optional[int] = None, lora_model_id: Optional[int] = None, device: str = "cuda", dtype: Optional[torch.dtype] = None, @@ -196,9 +195,7 @@ class LoRAModel(AdapterModel): lora_dir: The local path that has lora data. expected_lora_modules: Name of modules that are expected to be replaced by lora. - max_position_embeddings: Max position embedding length. Used to - scaling the largest context length. If None, the lora model's - context length is not scaled. + peft_helper: Loaded lora configuration information. lora_model_id: Lora model id. If not given, automatically set by a global counter. device: Device where the lora model is loaded. @@ -207,18 +204,13 @@ class LoRAModel(AdapterModel): Returns: Loaded LoRA Model. 
""" - lora_config_path = os.path.join(lora_dir, "adapter_config.json") lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors") lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin") new_embeddings_tensor_path = os.path.join( lora_dir, "new_embeddings.safetensors") new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin") - with open(lora_config_path) as f: - config = json.load(f) - config["vllm_max_position_embeddings"] = max_position_embeddings - peft_helper = PEFTHelper.from_dict(config) unexpected_modules: List[Union[list[str], str]] if os.path.isfile(lora_tensor_path): tensors: Dict[str, torch.Tensor] = {} diff --git a/vllm/lora/ops/torch_ops/__init__.py b/vllm/lora/ops/torch_ops/__init__.py new file mode 100644 index 0000000000000..9c9159b95f308 --- /dev/null +++ b/vllm/lora/ops/torch_ops/__init__.py @@ -0,0 +1,13 @@ +from vllm.lora.ops.torch_ops.lora_ops import bgmv_expand # noqa: F401 +from vllm.lora.ops.torch_ops.lora_ops import (bgmv_expand_slice, bgmv_shrink, + sgmv_expand, sgmv_expand_slice, + sgmv_shrink) + +__all__ = [ + "bgmv_expand", + "bgmv_expand_slice", + "bgmv_shrink", + "sgmv_expand", + "sgmv_expand_slice", + "sgmv_shrink", +] diff --git a/vllm/lora/ops/torch_ops/lora_ops.py b/vllm/lora/ops/torch_ops/lora_ops.py new file mode 100644 index 0000000000000..5f5aafd516159 --- /dev/null +++ b/vllm/lora/ops/torch_ops/lora_ops.py @@ -0,0 +1,113 @@ +import torch + + +def sgmv_expand(inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, + seq_len_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batches: int, + max_seq_length: int, + token_nums: int, + add_inputs: bool = False): + exploded_indices = torch.repeat_interleave(lora_indices_tensor, + seq_len_tensor) + + bgmv_expand(inputs, lora_b_weights, output_tensor, exploded_indices, + add_inputs) + + +def bgmv_expand(inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + add_inputs: bool = True): + selected_loras = lora_b_weights[lora_indices_tensor].to( + dtype=output_tensor.dtype) + if len(selected_loras.shape) == 4: + selected_loras = selected_loras.squeeze(dim=1) + inputs = inputs.to(dtype=output_tensor.dtype) + outputs = torch.einsum("bi, boi -> bo", inputs, selected_loras) + + limit = output_tensor.shape[0] + if outputs.shape[0] == 1 and output_tensor.shape[0] != 1: + limit = 1 + + if add_inputs: + output_tensor[:, :outputs.shape[1]] += outputs[:limit, :] + else: + output_tensor[:, :outputs.shape[1]] = outputs[:limit, :] + + +def sgmv_shrink( + inputs: torch.Tensor, + lora_a_weights: torch.Tensor, + output_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, + seq_len_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batches: int, + max_seq_length: int, + token_nums: int, + scaling: float, +): + exploded_indices = torch.repeat_interleave(lora_indices_tensor, + seq_len_tensor) + + bgmv_shrink(inputs, lora_a_weights, output_tensor, exploded_indices, + scaling) + + +def bgmv_shrink(inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + scaling: float = 1.0): + selected_loras = lora_b_weights[lora_indices_tensor].to( + dtype=output_tensor.dtype) + if len(selected_loras.shape) == 4: + selected_loras = selected_loras.squeeze(dim=1) + inputs = inputs.to(dtype=output_tensor.dtype) + outputs = torch.einsum("bi, boi -> bo", inputs, selected_loras) + + output_tensor[:, 
:outputs.shape[1]] = scaling * outputs[:] + + +def sgmv_expand_slice(inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, + seq_len_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batches: int, + max_seq_length: int, + token_nums: int, + slice_offset: int, + slice_size: int, + add_inputs: bool = False): + exploded_indices = torch.repeat_interleave(lora_indices_tensor, + seq_len_tensor) + + bgmv_expand_slice(inputs, lora_b_weights, output_tensor, exploded_indices, + slice_offset, slice_size, add_inputs) + + +def bgmv_expand_slice(inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + slice_offset: int, + slice_size: int, + add_inputs: bool = True): + selected_loras = lora_b_weights[lora_indices_tensor].to( + dtype=output_tensor.dtype) + inputs = inputs.to(dtype=output_tensor.dtype) + if len(selected_loras.shape) == 4: + selected_loras = selected_loras.squeeze(dim=1) + outputs = torch.einsum("bi, boi -> bo", inputs, selected_loras) + + if add_inputs: + output_tensor[:, slice_offset:slice_offset + slice_size] += outputs[:] + else: + output_tensor[:, slice_offset:slice_offset + slice_size] = outputs[:] diff --git a/vllm/lora/ops/triton_ops/__init__.py b/vllm/lora/ops/triton_ops/__init__.py new file mode 100644 index 0000000000000..9805b6dd5038e --- /dev/null +++ b/vllm/lora/ops/triton_ops/__init__.py @@ -0,0 +1,13 @@ +from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand +from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice +from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink +from vllm.lora.ops.triton_ops.sgmv_expand import sgmv_expand +from vllm.lora.ops.triton_ops.sgmv_shrink import sgmv_shrink # noqa: F401 + +__all__ = [ + "bgmv_expand", + "bgmv_expand_slice", + "bgmv_shrink", + "sgmv_expand", + "sgmv_shrink", +] diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/triton_ops/bgmv_expand.py similarity index 100% rename from vllm/lora/ops/bgmv_expand.py rename to vllm/lora/ops/triton_ops/bgmv_expand.py diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/triton_ops/bgmv_expand_slice.py similarity index 100% rename from vllm/lora/ops/bgmv_expand_slice.py rename to vllm/lora/ops/triton_ops/bgmv_expand_slice.py diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/triton_ops/bgmv_shrink.py similarity index 100% rename from vllm/lora/ops/bgmv_shrink.py rename to vllm/lora/ops/triton_ops/bgmv_shrink.py diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/triton_ops/sgmv_expand.py similarity index 100% rename from vllm/lora/ops/sgmv_expand.py rename to vllm/lora/ops/triton_ops/sgmv_expand.py diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/triton_ops/sgmv_shrink.py similarity index 100% rename from vllm/lora/ops/sgmv_shrink.py rename to vllm/lora/ops/triton_ops/sgmv_shrink.py diff --git a/vllm/lora/ops/utils.py b/vllm/lora/ops/triton_ops/utils.py similarity index 100% rename from vllm/lora/ops/utils.py rename to vllm/lora/ops/triton_ops/utils.py diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index dacfb9ebd1480..b9c506f6e0bfd 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -1,9 +1,12 @@ # Adapted from: https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/config.py +import json import math +import os from dataclasses import MISSING, dataclass, field, fields -from typing import Literal, Optional, Union +from typing import List, 
Literal, Optional, Union +from vllm.config import LoRAConfig from vllm.logger import init_logger logger = init_logger(__name__) @@ -11,6 +14,12 @@ logger = init_logger(__name__) @dataclass class PEFTHelper: + """ + A helper class for PEFT configurations, specifically designed for LoRA. + This class handles configuration validation and compatibility checks for + various LoRA implementations. + """ + # Required fields r: int lora_alpha: int @@ -29,20 +38,18 @@ class PEFTHelper: vllm_max_position_embeddings: Optional[int] = field(default=False) vllm_long_context_scaling_factor: Optional[float] = field(default=None) - def _validate_features(self): + def _validate_features(self) -> List[str]: + """ + Check if there are any unsupported LoRA features. + """ error_msg = [] - if self.modules_to_save: error_msg.append("vLLM only supports modules_to_save being None.") - if self.use_dora: error_msg.append("vLLM does not yet support DoRA.") - - if error_msg: - raise ValueError(f"{', '.join(error_msg)}") + return error_msg def __post_init__(self): - self._validate_features() if self.use_rslora: logger.info_once("Loading LoRA weights trained with rsLoRA.") self.vllm_lora_scaling_factor = self.lora_alpha / math.sqrt(self.r) @@ -78,3 +85,29 @@ class PEFTHelper: for k, v in config_dict.items() if k in class_fields } return cls(**filtered_dict) + + @classmethod + def from_local_dir(cls, lora_path: str, + max_position_embeddings: Optional[int]) -> "PEFTHelper": + lora_config_path = os.path.join(lora_path, "adapter_config.json") + + with open(lora_config_path) as f: + config = json.load(f) + config["vllm_max_position_embeddings"] = max_position_embeddings + return cls.from_dict(config) + + def validate_legal(self, lora_config: LoRAConfig) -> None: + """ + Validates the LoRA configuration settings against application + constraints and requirements. + """ + error_msg = self._validate_features() + if self.r > lora_config.max_lora_rank: + error_msg.append( + f"LoRA rank {self.r} is greater than max_lora_rank" + f" {lora_config.max_lora_rank}.") + if self.bias != "none" and not lora_config.bias_enabled: + error_msg.append( + "Adapter bias cannot be used without bias_enabled.") + if error_msg: + raise ValueError(f"{' '.join(error_msg)}") diff --git a/vllm/lora/punica_wrapper/punica_cpu.py b/vllm/lora/punica_wrapper/punica_cpu.py new file mode 100644 index 0000000000000..b9ae3e07492c0 --- /dev/null +++ b/vllm/lora/punica_wrapper/punica_cpu.py @@ -0,0 +1,346 @@ +from typing import Callable, Optional, Tuple, Union + +import torch + +from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice, + bgmv_shrink, sgmv_expand, + sgmv_expand_slice, sgmv_shrink) + +from .punica_base import PunicaWrapperBase + + +# The platforms that are compatible with the PyTorch-native implementation can +# inherit this class +class PunicaWrapperCPU(PunicaWrapperBase): + """ + PunicaWrapperCPU is designed to manage and provide metadata for the punica + kernel. The main function is to maintain the state information for + Multi-LoRA, and to provide the interface for the pytorch punica ops.
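+ The wrapper dispatches to the sgmv_* torch ops during prefill and to + the bgmv_* ops during decode, mirroring the dispatch logic of the + triton-based GPU wrapper.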
+ """ + + def __init__(self, max_num_batched_tokens: int, max_batches: int, + device: Union[torch.device, str], **kwargs): + PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches, + device) + + def _shrink_prefill( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + scale: float, + ): + #No LoRA request, so return directly + if self.no_lora: + return + sgmv_shrink( + x, + w_t_all, + y, + *self.prefill_metadata, + scale, + ) + + def _shrink_decode( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + scale: float, + ): + bgmv_shrink(x, w_t_all, y, self.token_lora_indices, scale) + + def _expand_prefill( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + add_inputs: bool, + ): + #No LoRA request, so return directly + if self.no_lora: + return + sgmv_expand( + x, + w_t_all, + y, + *self.prefill_metadata, + add_inputs, + ) + + def _expand_decode( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + add_inputs: bool, + ): + bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_inputs) + + def _expand_slice_prefill( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + y_offset: int, + y_slice_size: int, + add_inputs: bool, + ): + #No LoRA request, so return directly + if self.no_lora: + return + sgmv_expand_slice( + x, + w_t_all, + y, + *self.prefill_metadata, + y_offset, + y_slice_size, + add_inputs, + ) + + def _expand_slice_decode( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + y_offset: int, + y_slice_size: int, + add_inputs: bool, + ): + bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, + y_slice_size, add_inputs) + + def _apply_expand( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + y_offset: int, + y_slice_size: int, + add_inputs: bool = True, + ): + """ + Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all` + computation, which is suitable for the + GEMM of lora'b. + """ + + expand_slice_fun: Callable = (self._expand_slice_prefill + if self.is_prefill else + self._expand_slice_decode) + expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_inputs) + + def _apply_shrink(self, y: torch.Tensor, x: torch.Tensor, + w_t_all: torch.Tensor, scale: float): + """ + Perform the ` y+=x@w_t_all` computation, which is suitable for the + GEMM of lora'a. + When `is_prefill is` true, it indicates that it is currently the + prefill stage, and the `_shrink_prefill` function should be called. + Otherwise, it is the decode stage, and the _shrink_decode function + should be called. + """ + y_org = y + y = y.view(-1, y.shape[-1]) + shrink_fun: Callable = (self._shrink_prefill + if self.is_prefill else self._shrink_decode) + shrink_fun(y, x, w_t_all, scale) + y = y.view_as(y_org) + + def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor], + x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, ...], + scale: float, **kwargs): + """ + Performs GEMM for multiple slices of lora_a. + When `is_prefill is` true, it indicates that it is currently the + prefill stage, and the `_shrink_prefill` function should be called. + Otherwise, it is the decode stage, and the _shrink_decode function + should be called. 
+ + Semantics: + for i in range(len(lora_a_stacked)): + y[i] += (x @ lora_a_stacked[i]) * scale + + Args: + y (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Output tensors + x (torch.Tensor): Input tensor + lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weights + scale (float): Scaling factor for the operation + """ + + x = x.view(-1, x.shape[-1]) + # TODO fuse these kernels + for slice_idx in range(len(lora_a_stacked)): + self._apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx], + scale) + + def add_expand(self, + y: torch.Tensor, + x: Union[Tuple[torch.Tensor, ...], torch.Tensor], + lora_b_stacked: Tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], + output_slices: Tuple[int, ...], + offset_start: int = 0, + add_inputs=True, + **kwargs) -> None: + """ + Performs GEMM and bias addition for multiple slices of lora_b. + + Semantics: + for i in range(len(lora_b_stacked)): + slice = output_slices[i] + y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + + lora_bias_stacked[i] + offset += slice + + Args: + y (torch.Tensor): Output tensor. + x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors + lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight + lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): + bias's weight + output_slices (Tuple[int, ...]): Every slice's size + add_inputs (bool): Defaults to True. + """ + y_org = y + y = y.view(-1, y.shape[-1]) + offset_left = offset_start + if lora_bias_stacked is not None: + self._apply_bias(self.token_lora_indices, y, output_slices, + lora_bias_stacked) + for slice_idx in range(len(lora_b_stacked)): + self._apply_expand( + y, + x[slice_idx], + lora_b_stacked[slice_idx], + offset_left, + output_slices[slice_idx], + add_inputs=add_inputs, + ) + offset_left += output_slices[slice_idx] + y = y.view_as(y_org) + + def add_lora_embedding(self, + y: torch.Tensor, + x: torch.Tensor, + lora_b_stacked: torch.Tensor, + add_inputs: bool = True, + **kwargs) -> None: + """ + Applies LoRA specifically for VocabParallelEmbeddingWithLoRA. + + Semantics: + y += x @ lora_b_stacked + + Args: + y (torch.Tensor): Output tensor. + x (torch.Tensor): Input tensor. + lora_b_stacked (torch.Tensor): lora_b's weights. + add_inputs (bool): Defaults to True. + """ + + # The embedding layer only needs the expand op + expand_fun: Callable = (self._expand_prefill + if self.is_prefill else self._expand_decode) + expand_fun(y, x, lora_b_stacked, add_inputs) + + def add_lora_linear(self, + y: torch.Tensor, + x: torch.Tensor, + lora_a_stacked: Tuple[torch.Tensor, ...], + lora_b_stacked: Tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], + scale: float, + output_slices: Tuple[int, ...], + *, + buffer: Optional[Tuple[torch.Tensor, ...]] = None, + **kwargs) -> None: + """ + Applies LoRA to linear-related layers. + + Semantics: + for i in range(len(lora_a_stacked)): + y[i] += ( + x[i].unsqueeze(0) + @ lora_a_stacked[indices[i], layer_idx, :, :] + @ lora_b_stacked[indices[i], layer_idx, :, :] + * scale + ).squeeze(0)+lora_bias_stacked[i] + + Args: + y (torch.Tensor): Output tensor. Will be changed in-place. + x (torch.Tensor): Input tensor + lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight. + lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight. + lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias. + scale (float): Scaling factor. + output_slices (Tuple[int, ...]): Every slice's size. + buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None.
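+ + Note: when `buffer` is None, a float32 buffer of shape + (num_tokens, lora_rank) is allocated for each output slice below, + matching the dtype used by the triton ops.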
+ """ + + assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) + if lora_bias_stacked is not None: + assert len(lora_bias_stacked) == len(output_slices) + y = self._apply_bias(self.token_lora_indices, y, output_slices, + lora_bias_stacked) + + if buffer is None: + r = lora_b_stacked[0].size(-1) + # We set the buffer to be float32 by default, consistent with the + # triton op + buffer = tuple( + torch.zeros( + (x.size(0), r), dtype=torch.float32, device=x.device) + for _ in range(len(output_slices))) + self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs) + self.add_expand(y, + buffer, + lora_b_stacked, + None, + output_slices, + add_inputs=True, + **kwargs) + + def add_lora_logits(self, + y: torch.Tensor, + x: torch.Tensor, + lora_a_stacked: torch.Tensor, + lora_b_stacked: torch.Tensor, + scale, + *, + buffer: Optional[torch.Tensor] = None, + **kwargs) -> None: + """ + Applies lora specifically for LogitsProcessorWithLoRA. + + Semantics: + buffer = (x @ lora_a_stacked) * scale + y += buffer @ lora_b_stacked + + Args: + y (torch.Tensor): Output tensor. + x (torch.Tensor): Input tensor. + lora_a_stacked (torch.Tensor): lora_a's weights. + lora_b_stacked (torch.Tensor):lora_b's weights. + scale (float): Scaling factor. + buffer (Optional[torch.Tensor]):Default to None. + """ + y_org = y + y = y.view(-1, y.shape[-1]) + x = x.view(-1, x.shape[-1]) + r = lora_b_stacked.size(-1) + if buffer is None: + # We set the buffer to be float32 by default, consistent with the + # triton op + buffer = torch.zeros((x.size(0), r), + dtype=torch.float32, + device=x.device) + # LogitsProcessorWithLoRA always using bgmv. + bgmv_shrink(x, lora_a_stacked, buffer, self.sampler_indices, scale) + bgmv_expand(buffer, + lora_b_stacked, + y, + self.sampler_indices, + add_inputs=True) + y = y.view_as(y_org) diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py index 278f7b5a8e9f4..451f23e49f27c 100644 --- a/vllm/lora/punica_wrapper/punica_gpu.py +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -12,11 +12,11 @@ import torch from vllm.triton_utils import HAS_TRITON if HAS_TRITON: - from vllm.lora.ops.bgmv_expand import bgmv_expand - from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice - from vllm.lora.ops.bgmv_shrink import bgmv_shrink - from vllm.lora.ops.sgmv_expand import sgmv_expand - from vllm.lora.ops.sgmv_shrink import sgmv_shrink + from vllm.lora.ops.triton_ops import bgmv_expand + from vllm.lora.ops.triton_ops import bgmv_expand_slice + from vllm.lora.ops.triton_ops import bgmv_shrink + from vllm.lora.ops.triton_ops import sgmv_expand + from vllm.lora.ops.triton_ops import sgmv_shrink from .punica_base import PunicaWrapperBase diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py index 9791d492d8e48..a293224651992 100644 --- a/vllm/lora/punica_wrapper/punica_selector.py +++ b/vllm/lora/punica_wrapper/punica_selector.py @@ -1,5 +1,6 @@ from vllm.logger import init_logger from vllm.platforms import current_platform +from vllm.utils import resolve_obj_by_qualname from .punica_base import PunicaWrapperBase @@ -7,15 +8,11 @@ logger = init_logger(__name__) def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase: - if current_platform.is_cuda_alike(): - # Lazy import to avoid ImportError - from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU - logger.info_once("Using PunicaWrapperGPU.") - return PunicaWrapperGPU(*args, **kwargs) - elif current_platform.is_hpu(): - # Lazy import to 
avoid ImportError - from vllm.lora.punica_wrapper.punica_hpu import PunicaWrapperHPU - logger.info_once("Using PunicaWrapperHPU.") - return PunicaWrapperHPU(*args, **kwargs) - else: - raise NotImplementedError + punica_wrapper_qualname = current_platform.get_punica_wrapper() + punica_wrapper_cls = resolve_obj_by_qualname(punica_wrapper_qualname) + punica_wrapper = punica_wrapper_cls(*args, **kwargs) + assert punica_wrapper is not None, \ + "the punica_wrapper_qualname (" + punica_wrapper_qualname + ") is invalid." + logger.info_once("Using " + punica_wrapper_qualname.rsplit(".", 1)[1] + + ".") + return punica_wrapper diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 10976fac23028..a64296f7fd902 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -12,6 +12,7 @@ from vllm.config import LoRAConfig from vllm.logger import init_logger from vllm.lora.models import (LoRAModel, LoRAModelManager, LRUCacheLoRAModelManager, create_lora_manager) +from vllm.lora.peft_helper import PEFTHelper from vllm.lora.request import LoRARequest from vllm.lora.utils import get_adapter_absolute_path @@ -95,6 +96,13 @@ class WorkerLoRAManager(AbstractWorkerManager): expected_lora_modules = list(set(expected_lora_modules)) lora_path = get_adapter_absolute_path(lora_request.lora_path) + peft_helper = PEFTHelper.from_local_dir( + lora_path, self.max_position_embeddings) + + # Validates the LoRA configuration against requirements before + # loading weights, throwing an exception if validation fails. + peft_helper.validate_legal(self.lora_config) + # For some models like Qwen2VL, we need to use hf_to_vllm_mapper # to ensure correct loading of lora weights. hf_to_vllm_mapper = None @@ -105,7 +113,7 @@ class WorkerLoRAManager(AbstractWorkerManager): lora = self._lora_model_cls.from_local_checkpoint( lora_path, expected_lora_modules, - max_position_embeddings=self.max_position_embeddings, + peft_helper=peft_helper, lora_model_id=lora_request.lora_int_id, device="cpu", dtype=self.lora_config.lora_dtype, @@ -115,12 +123,19 @@ class WorkerLoRAManager(AbstractWorkerManager): embedding_padding_modules=self.embedding_padding_modules, weights_mapper=hf_to_vllm_mapper) - except Exception as e: - raise RuntimeError(f"Loading lora {lora_path} failed") from e - if lora.rank > self.lora_config.max_lora_rank: + except FileNotFoundError as e: + # FileNotFoundError should be raised if both + # - No adapter found to download from huggingface (or in + # offline mode) + # - No local adapter files found at `lora_request.lora_path` + # For NotFoundError raise ValueError( - f"LoRA rank {lora.rank} is greater than max_lora_rank " - f"{self.lora_config.max_lora_rank}.") + f"Loading lora {lora_request.lora_name} failed: No adapter " + f"found for {lora_path}") from e + except Exception as e: + # For BadRequestError + raise e + if lora.extra_vocab_size > self.lora_config.lora_extra_vocab_size: raise ValueError(f"LoRA added vocab size {lora.extra_vocab_size} " f"is greater than lora_extra_vocab_size " f"{self.lora_config.lora_extra_vocab_size}.") @@ -209,12 +224,19 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager): def add_adapter(self, lora_request: LoRARequest) -> bool: if lora_request.lora_int_id not in self.list_adapters(): - # Remove before we load the new lora to save memory + # Load the new adapter first to ensure it is actually valid, before + # evicting any existing adapters. + # This may cause the # of loaded lora adapters to temporarily + # exceed `--max-cpu-loras`.
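+ # For example (hypothetical numbers): with --max-cpu-loras=2 and + # adapters [A, B] already resident, adding adapter C loads C first, + # so three adapters briefly coexist before the LRU one is evicted.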
+ lora = self._load_adapter(lora_request) + + # Loading succeeded, now check if we will exceed cache capacity and + # evict the oldest adapter if so + if len(self._adapter_manager) + 1 > self._adapter_manager.capacity: assert isinstance(self._adapter_manager, LRUCacheLoRAModelManager) self._adapter_manager.remove_oldest_adapter() - lora = self._load_adapter(lora_request) + # Then add the new adapter to the cache loaded = self._adapter_manager.add_adapter(lora) else: # If the lora is already loaded, just touch it to diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 401606e8c76f0..96995c56bf504 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -57,6 +57,11 @@ class CustomOp(nn.Module): # PyTorch-native implementation. return self.forward_native(*args, **kwargs) + def forward_oot(self, *args, **kwargs): + # By default, we assume that OOT ops are compatible with the + # PyTorch-native implementation. + return self.forward_native(*args, **kwargs) + def dispatch_forward(self): # NOTE(woosuk): Here we assume that vLLM was built for only one # specific backend. Currently, we do not support dynamic dispatching. @@ -81,6 +86,8 @@ class CustomOp(nn.Module): return self.forward_tpu elif current_platform.is_xpu(): return self.forward_xpu + elif current_platform.is_out_of_tree(): + return self.forward_oot else: return self.forward_cuda diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py index f10a8fb8e03cf..2d8594cb8aafa 100644 --- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -298,8 +298,11 @@ class XGrammarLogitsProcessor: # token_bitmask is a CPU tensor for use with accept_token and # fill_next_token_bitmask so we move it to the device of scores device_type = scores.device.type + dtype = scores.dtype if device_type != "cuda": - scores = scores.to("cpu").unsqueeze(0) + # xgrammar on cpu only supports float32 scores + # see: https://github.com/mlc-ai/xgrammar/blob/c1b64920cad24f44f235778c1c00bb52d57da01a/python/xgrammar/kernels/apply_token_bitmask_inplace_cpu.py#L22 + scores = scores.to("cpu").float().unsqueeze(0) # Note: In this method, if the tensors have different dimensions # on CPU device fails, but on GPU it runs without error.
Hence the @@ -307,7 +310,7 @@ class XGrammarLogitsProcessor: xgr.apply_token_bitmask_inplace(scores, self.token_bitmask.to(scores.device)) if device_type != "cuda": - scores = scores.to(device_type).squeeze() + scores = scores.to(dtype).to(device_type).squeeze() return scores diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 32456fee06a28..fb9684ac1c184 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -30,6 +30,10 @@ class FatreluAndMul(CustomOp): def __init__(self, threshold: float = 0.): super().__init__() self.threshold = threshold + if current_platform.is_cuda_alike(): + self.op = torch.ops._C.fatrelu_and_mul + elif current_platform.is_cpu(): + self._forward_method = self.forward_native def forward_native(self, x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 @@ -39,12 +43,10 @@ class FatreluAndMul(CustomOp): return x1 * x2 def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - from vllm import _custom_ops as ops - d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.fatrelu_and_mul(out, x, self.threshold) + self.op(out, x, self.threshold) return out @@ -87,6 +89,43 @@ class SiluAndMul(CustomOp): return out +@CustomOp.register("mul_and_silu") +class MulAndSilu(CustomOp): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + def __init__(self): + super().__init__() + if current_platform.is_cuda_alike(): + self.op = torch.ops._C.mul_and_silu + elif current_platform.is_xpu(): + from vllm._ipex_ops import ipex_ops + self.op = ipex_ops.silu_and_mul + elif current_platform.is_cpu(): + self._forward_method = self.forward_native + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + d = x.shape[-1] // 2 + return x[..., :d] * F.silu(x[..., d:]) + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + self.op(out, x) + return out + + # TODO implement forward_xpu for MulAndSilu + # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + + @CustomOp.register("gelu_and_mul") class GeluAndMul(CustomOp): """An activation function for GeGLU. 
@@ -103,6 +142,17 @@ class GeluAndMul(CustomOp): self.approximate = approximate if approximate not in ("none", "tanh"): raise ValueError(f"Unknown approximate mode: {approximate}") + if current_platform.is_cuda_alike() or current_platform.is_cpu(): + if approximate == "none": + self.op = torch.ops._C.gelu_and_mul + elif approximate == "tanh": + self.op = torch.ops._C.gelu_tanh_and_mul + elif current_platform.is_xpu(): + from vllm._ipex_ops import ipex_ops + if approximate == "none": + self.op = ipex_ops.gelu_and_mul + else: + self.op = ipex_ops.gelu_tanh_and_mul def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" @@ -110,27 +160,17 @@ class GeluAndMul(CustomOp): return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:] def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - from vllm import _custom_ops as ops - d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - if self.approximate == "none": - ops.gelu_and_mul(out, x) - elif self.approximate == "tanh": - ops.gelu_tanh_and_mul(out, x) + self.op(out, x) return out def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: - from vllm._ipex_ops import ipex_ops as ops - d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - if self.approximate == "none": - ops.gelu_and_mul(out, x) - elif self.approximate == "tanh": - ops.gelu_tanh_and_mul(out, x) + self.op(out, x) return out def extra_repr(self) -> str: @@ -140,6 +180,14 @@ class GeluAndMul(CustomOp): @CustomOp.register("gelu_new") class NewGELU(CustomOp): + def __init__(self): + super().__init__() + if current_platform.is_cuda_alike() or current_platform.is_cpu(): + self.op = torch.ops._C.gelu_new + elif current_platform.is_xpu(): + from vllm._ipex_ops import ipex_ops + self.op = ipex_ops.gelu_new + def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" c = math.sqrt(2.0 / math.pi) @@ -147,58 +195,62 @@ class NewGELU(CustomOp): (x + 0.044715 * torch.pow(x, 3.0)))) def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - from vllm import _custom_ops as ops - out = torch.empty_like(x) - ops.gelu_new(out, x) + self.op(out, x) return out def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: - from vllm._ipex_ops import ipex_ops as ops - - return ops.gelu_new(x) + return self.op(x) @CustomOp.register("gelu_fast") class FastGELU(CustomOp): + def __init__(self): + super().__init__() + if current_platform.is_cuda_alike() or current_platform.is_cpu(): + self.op = torch.ops._C.gelu_fast + elif current_platform.is_xpu(): + from vllm._ipex_ops import ipex_ops + self.op = ipex_ops.gelu_fast + def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - from vllm import _custom_ops as ops - out = torch.empty_like(x) - ops.gelu_fast(out, x) + self.op(out, x) return out def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: - from vllm._ipex_ops import ipex_ops as ops - - return ops.gelu_fast(x) + return self.op(x) @CustomOp.register("quick_gelu") class QuickGELU(CustomOp): # https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py#L90 + def __init__(self): + super().__init__() + if 
current_platform.is_cuda_alike() or current_platform.is_cpu(): + self.op = torch.ops._C.gelu_quick + elif current_platform.is_xpu(): + from vllm._ipex_ops import ipex_ops + self.op = ipex_ops.gelu_quick + def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" return x * torch.sigmoid(1.702 * x) def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - from vllm import _custom_ops as ops - out = torch.empty_like(x) - ops.gelu_quick(out, x) + self.op(out, x) return out def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: - from vllm._ipex_ops import ipex_ops as ops - out = torch.empty_like(x) - ops.gelu_quick(out, x) + self.op(out, x) return out # TODO implement forward_xpu for QuickGELU diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 1bb6bc753d37c..308c1d6ac6db1 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -497,7 +497,10 @@ def grouped_topk(hidden_states: torch.Tensor, raise ValueError(f"Unsupported scoring function: {scoring_func}") if e_score_correction_bias is not None: - scores.add_(e_score_correction_bias.unsqueeze(0)) + # Store original scores before applying correction bias. We use biased + # scores for expert selection but original scores for routing weights + original_scores = scores + scores = scores + e_score_correction_bias.unsqueeze(0) num_token = scores.shape[0] group_scores = scores.view(num_token, num_expert_group, @@ -510,10 +513,16 @@ def grouped_topk(hidden_states: torch.Tensor, num_token, num_expert_group, scores.shape[-1] // num_expert_group).reshape(num_token, -1) # [n, e] tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e] - topk_weights, topk_ids = torch.topk(tmp_scores, - k=topk, - dim=-1, - sorted=False) + + if e_score_correction_bias is not None: + topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)[1] + # Use original unbiased scores for the routing weights + topk_weights = original_scores.gather(1, topk_ids) + else: + topk_weights, topk_ids = torch.topk(tmp_scores, + k=topk, + dim=-1, + sorted=False) if renormalize: topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) @@ -701,8 +710,14 @@ def fused_experts_impl(hidden_states: torch.Tensor, device=hidden_states.device, dtype=hidden_states.dtype) - compute_type = (tl.bfloat16 - if hidden_states.dtype == torch.bfloat16 else tl.float16) + if hidden_states.dtype == torch.bfloat16: + compute_type = tl.bfloat16 + elif hidden_states.dtype == torch.float16: + compute_type = tl.float16 + elif hidden_states.dtype == torch.float32: + compute_type = tl.float32 + else: + raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}") if inplace: out_hidden_states = hidden_states diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index b108cbd52c218..3d822fc0c7f99 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -13,13 +13,15 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform +from vllm.platforms.interface import CpuArchEnum if current_platform.is_cuda_alike(): from .fused_moe import fused_experts else: fused_experts = None # type: ignore if current_platform.is_tpu(): - from .moe_pallas import 
fused_moe as fused_moe_pallas + # the iterative moe implementation is used until the moe_pallas is fixed + from .moe_torch_iterative import fused_moe as fused_moe_pallas else: fused_moe_pallas = None # type: ignore logger = init_logger(__name__) @@ -83,6 +85,20 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + super().process_weights_after_loading(layer) + + if current_platform.is_cpu(): + if current_platform.get_cpu_architecture() == CpuArchEnum.X86: + import intel_extension_for_pytorch as ipex + layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE( + layer.w13_weight, + layer.w2_weight, + use_prepack=True, + ) + else: + raise NotImplementedError("CPU MOE only supports x86 arch.") + def apply( self, layer: torch.nn.Module, @@ -142,9 +158,29 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): topk_ids=topk_ids, inplace=True) - def forward_cpu(self, *args, **kwargs): - raise NotImplementedError( - "The CPU backend currently does not support MoE.") + def forward_cpu( + self, + layer: torch.nn.Module, + x: torch.Tensor, + use_grouped_topk: bool, + top_k: int, + router_logits: torch.Tensor, + renormalize: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + **kwargs, + ): + assert custom_routing_function is None + return layer.ipex_fusion( + x, + use_grouped_topk, + top_k, + router_logits, + renormalize, + topk_group, + num_expert_group, + ) def forward_tpu( self, diff --git a/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py new file mode 100644 index 0000000000000..bcff55f4fdf16 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py @@ -0,0 +1,51 @@ +import torch +import torch.nn.functional as F + + +def fused_moe( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, +) -> torch.Tensor: + """ + Args: + hidden_states: [*, hidden_size] + w1: [num_experts, intermediate_size * 2, hidden_size] + w2: [num_experts, hidden_size, intermediate_size] + gating_output: [*, num_experts] + """ + orig_shape = hidden_states.shape + hidden_size = hidden_states.shape[-1] + num_tokens = hidden_states.shape[:-1].numel() + num_experts = w1.shape[0] + intermediate_size = w2.shape[-1] + dtype = hidden_states.dtype + + hidden_states = hidden_states.view(num_tokens, hidden_size) + gating_output = gating_output.view(num_tokens, num_experts) + topk_weights = gating_output.softmax(dim=-1, dtype=torch.float) + topk_weights, selected_experts = topk_weights.topk(topk, dim=-1) + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + topk_weights = topk_weights.to(dtype) + + final_hidden_states = None + for expert_idx in range(num_experts): + expert_w1 = w1[expert_idx] + expert_w2 = w2[expert_idx] + expert_mask = (selected_experts == expert_idx) + expert_weights = (topk_weights * expert_mask).sum(dim=-1, keepdim=True) + x = F.linear(hidden_states, expert_w1) + gate = F.silu(x[:, :intermediate_size]) + x = x[:, intermediate_size:] * gate + x = F.linear(x, expert_w2) + current_hidden_states = x * expert_weights + if final_hidden_states is None: + final_hidden_states = current_hidden_states + else: + final_hidden_states = 
final_hidden_states + current_hidden_states + + return final_hidden_states.view(orig_shape) # type: ignore diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 48cfb1b221720..52263e96fb9f9 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -32,7 +32,7 @@ WEIGHT_LOADER_V2_SUPPORTED = [ "MarlinLinearMethod", "QQQLinearMethod", "GPTQMarlin24LinearMethod", "TPUInt8LinearMethod", "GPTQLinearMethod", "FBGEMMFp8LinearMethod", "ModelOptFp8LinearMethod", "IPEXAWQLinearMethod", "IPEXGPTQLinearMethod", - "HQQMarlinMethod" + "HQQMarlinMethod", "QuarkLinearMethod" ] @@ -344,11 +344,13 @@ class ColumnParallelLinear(LinearBase): param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype) use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit param_data = param.data - # bitsandbytes loads the weights of the specific portion - # no need to narrow here - if output_dim is not None and not use_bitsandbytes_4bit: + if output_dim is not None and not is_sharded_weight: shard_size = param_data.shape[output_dim] start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, @@ -447,8 +449,14 @@ class MergedColumnParallelLinear(ColumnParallelLinear): is_gguf_weight = getattr(param, "is_gguf_weight", False) is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) if is_gguf_weight_type: - param.data[loaded_shard_id].copy_(loaded_weight) - param.shard_weight_type[loaded_shard_id] = loaded_weight.item() + if loaded_shard_id is not None: + param.data[loaded_shard_id].copy_(loaded_weight) + param.shard_weight_type[loaded_shard_id] = loaded_weight.item() + else: + param.shard_weight_type = { + i: loaded_weight.item() + for i, _ in enumerate(self.output_sizes) + } return if is_gguf_weight: @@ -459,15 +467,15 @@ class MergedColumnParallelLinear(ColumnParallelLinear): shard_size = loaded_weight.size(output_dim) // tp_size start_idx = tp_rank * shard_size - loaded_weight = loaded_weight.narrow(output_dim, start_idx, - shard_size) - - param.shard_id.append(loaded_shard_id) - param.shard_id_map[loaded_shard_id] = len(param.data_container) - param.data_container.append(loaded_weight) - if len(param.data_container) == 2: - self.qweight = param.materialize_nested() - return + if loaded_shard_id is not None: + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size) + param.shard_id.append(loaded_shard_id) + param.shard_id_map[loaded_shard_id] = len(param.data_container) + param.data_container.append(loaded_weight) + if len(param.data_container) == 2: + self.qweight = param.materialize_nested() + return param_data = param.data output_dim = getattr(param, "output_dim", None) @@ -540,6 +548,11 @@ class MergedColumnParallelLinear(ColumnParallelLinear): use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit + if use_bitsandbytes_4bit: shard_size = loaded_weight.shape[output_dim] shard_offset = loaded_weight.shape[output_dim] * \ @@ -548,9 +561,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear): param_data = 
param_data.narrow(output_dim, shard_offset, shard_size) start_idx = tp_rank * shard_size - # bitsandbytes loads the weights of the specific portion - # no need to narrow here - if not use_bitsandbytes_4bit: + if not is_sharded_weight: loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) # Special case for AQLM codebooks. @@ -811,10 +822,16 @@ class QKVParallelLinear(ColumnParallelLinear): # initialize GGUF param after we know the quantize type is_gguf_weight = getattr(param, "is_gguf_weight", False) is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) - if is_gguf_weight_type and loaded_shard_id is not None: + if is_gguf_weight_type: idx_map = {"q": 0, "k": 1, "v": 2} - param.data[idx_map[loaded_shard_id]].copy_(loaded_weight) - param.shard_weight_type[loaded_shard_id] = loaded_weight.item() + if loaded_shard_id is not None: + param.data[idx_map[loaded_shard_id]].copy_(loaded_weight) + param.shard_weight_type[loaded_shard_id] = loaded_weight.item() + else: + param.shard_weight_type = { + k: loaded_weight.item() + for k in idx_map + } return if is_gguf_weight: @@ -825,15 +842,15 @@ class QKVParallelLinear(ColumnParallelLinear): shard_size = loaded_weight.size(output_dim) // tp_size start_idx = tp_rank * shard_size - loaded_weight = loaded_weight.narrow(output_dim, start_idx, - shard_size) - - param.shard_id.append(loaded_shard_id) - param.shard_id_map[loaded_shard_id] = len(param.data_container) - param.data_container.append(loaded_weight) - if len(param.data_container) == 3: - self.qweight = param.materialize_nested() - return + if loaded_shard_id is not None: + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size) + param.shard_id.append(loaded_shard_id) + param.shard_id_map[loaded_shard_id] = len(param.data_container) + param.data_container.append(loaded_weight) + if len(param.data_container) == 3: + self.qweight = param.materialize_nested() + return param_data = param.data output_dim = getattr(param, "output_dim", None) @@ -929,6 +946,11 @@ class QKVParallelLinear(ColumnParallelLinear): use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit + if use_bitsandbytes_4bit: orig_qkv_offsets = { "q": (0, self.num_heads * self.head_size), @@ -952,9 +974,7 @@ class QKVParallelLinear(ColumnParallelLinear): shard_id = tp_rank // self.num_kv_head_replicas start_idx = shard_id * shard_size - # bitsandbytes loads the weights of the specific portion - # no need to narrow here - if not use_bitsandbytes_4bit: + if not is_sharded_weight: loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) @@ -1058,6 +1078,10 @@ class RowParallelLinear(LinearBase): tp_size = get_tensor_model_parallel_world_size() input_dim = getattr(param, "input_dim", None) use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit # Special case for GGUF is_gguf_weight = getattr(param, "is_gguf_weight", False) @@ -1073,9 +1097,7 @@ class RowParallelLinear(LinearBase): param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype) param_data = param.data - # bitsandbytes loads the weights of the specific portion - # no 
need to narrow here - if input_dim is not None and not use_bitsandbytes_4bit: + if input_dim is not None and not is_sharded_weight: shard_size = param_data.shape[input_dim] start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(input_dim, start_idx, diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 2bc7e458494f7..42decde1d0f79 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -6,6 +6,7 @@ import torch import torch.nn as nn import vllm.envs as envs +from vllm.config import get_current_vllm_config from vllm.distributed import (tensor_model_parallel_all_gather, tensor_model_parallel_gather) from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -44,8 +45,10 @@ class LogitsProcessor(nn.Module): self.soft_cap = soft_cap # Whether to use gather or all-gather to gather the logits. - self.use_gather = not current_platform.is_tpu( - ) and not envs.VLLM_USE_V1 + parallel_config = get_current_vllm_config().parallel_config + self.use_all_gather = current_platform.is_tpu() \ + or envs.VLLM_USE_V1 \ + or parallel_config.distributed_executor_backend == "external_launcher" # noqa def forward( self, @@ -88,16 +91,17 @@ class LogitsProcessor(nn.Module): logits = lm_head.linear_method.apply(lm_head, hidden_states, bias=embedding_bias) - if self.use_gather: - # None may be returned for rank > 0 - logits = tensor_model_parallel_gather(logits) - else: + + if self.use_all_gather: # Gather is not supported for some devices such as TPUs. # Use all-gather instead. # NOTE(woosuk): Here, the outputs of every device should not be None # because XLA requires strict SPMD among all devices. Every device # should execute the same operations after gathering the logits. logits = tensor_model_parallel_all_gather(logits) + else: + # None may be returned for rank > 0 + logits = tensor_model_parallel_gather(logits) # Remove paddings in vocab (if any). if logits is not None: logits = logits[..., :self.org_vocab_size] diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index dd10c434f0752..d2bde13fcf546 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -26,14 +26,56 @@ QUANTIZATION_METHODS: List[str] = [ "experts_int8", "neuron_quant", "ipex", + "quark" ] +# The customized quantization methods which will be added to this dict. +_CUSTOMIZED_METHOD_TO_QUANT_CONFIG = {} + + +def register_quantization_config(quantization: str): + """Register a customized vllm quantization config. + + When a quantization method is not supported by vllm, you can register a customized + quantization config to support it. + + Args: + quantization (str): The quantization method name. + + Examples: + >>> from vllm.model_executor.layers.quantization import register_quantization_config + >>> from vllm.model_executor.layers.quantization import get_quantization_config + >>> from vllm.model_executor.layers.quantization.base_config import QuantizationConfig + >>> + >>> @register_quantization_config("my_quant") + ... class MyQuantConfig(QuantizationConfig): + ... 
pass + >>> + >>> get_quantization_config("my_quant") + + """ # noqa: E501 + + def _wrapper(quant_config_cls): + if quantization in QUANTIZATION_METHODS: + raise ValueError( + f"The quantization method `{quantization}` already exists.") + if not issubclass(quant_config_cls, QuantizationConfig): + raise ValueError("The quantization config must be a subclass of " + "`QuantizationConfig`.") + _CUSTOMIZED_METHOD_TO_QUANT_CONFIG[quantization] = quant_config_cls + QUANTIZATION_METHODS.append(quantization) + return quant_config_cls + + return _wrapper + def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: if quantization not in QUANTIZATION_METHODS: raise ValueError(f"Invalid quantization method: {quantization}") # lazy import to avoid triggering `torch.compile` too early + from vllm.model_executor.layers.quantization.quark.quark import QuarkConfig + from .aqlm import AQLMConfig from .awq import AWQConfig from .awq_marlin import AWQMarlinConfig @@ -79,7 +121,10 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: "experts_int8": ExpertsInt8Config, "neuron_quant": NeuronQuantConfig, "ipex": IPEXConfig, + "quark": QuarkConfig } + # Update the `method_to_config` with customized quantization methods. + method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG) return method_to_config[quantization] diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 6dfac8aad5358..2fb2642dd5156 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -133,3 +133,6 @@ class QuantizationConfig(ABC): method. """ raise NotImplementedError + + def get_cache_scale(self, name: str) -> Optional[str]: + return None diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 0c1fc18228f5c..b2fc2360f47f1 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -113,7 +113,7 @@ class CompressedTensorsConfig(QuantizationConfig): :return: A dictionary mapping target layer names to their corresponding sparsity compression configurations """ - if (sparsity_config := config.get(SPARSITY_CONFIG_NAME)) is None: + if not (sparsity_config := config.get(SPARSITY_CONFIG_NAME)): return dict() sparsity_config = SparsityCompressionConfig.model_validate( @@ -412,6 +412,22 @@ class CompressedTensorsConfig(QuantizationConfig): self._check_scheme_supported(scheme.get_min_capability()) return scheme + def get_cache_scale(self, name: str) -> Optional[str]: + """ + Check whether the param name matches the format for k/v cache scales + in compressed-tensors.
If this is the case, return its equivalent + param name expected by vLLM + + :param name: param name + :return: matching param name for KV cache scale in vLLM + """ + if name.endswith(".output_scale") and ".k_proj" in name: + return name.replace(".k_proj.output_scale", ".attn.k_scale") + if name.endswith(".output_scale") and ".v_proj" in name: + return name.replace(".v_proj.output_scale", ".attn.v_scale") + # If no matches, return None + return None + @staticmethod def supports_cutlass_24( weight_quant: Optional[QuantizationArgs], diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py index bc697ef93b34b..21e6fe7a22616 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py @@ -42,7 +42,7 @@ class CompressedTensors24(CompressedTensorsScheme): if not sparse_cutlass_supported(): raise ValueError( - "Sparse CUTLASS not supported. vLLM must be built with" + "Sparse CUTLASS not supported. vLLM must be built with " "CUDA 12.2 or later to use this feature") self.output_dtype = params_dtype diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py index 2659afcdc74a9..f4c1dbc0361c6 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py @@ -136,6 +136,10 @@ def triton_scaled_mm(input: torch.Tensor, assert N > 0 and K > 0 and M > 0 assert weight.shape[0] == K assert input.dtype == weight.dtype + + scale_a = scale_a.reshape(-1, 1) if scale_a.dim() <= 1 else scale_a + scale_b = scale_b.reshape(-1, 1) if scale_b.dim() <= 1 else scale_b + assert scale_a.dtype == scale_b.dtype and scale_a.is_floating_point() assert scale_a.shape == torch.Size([1, 1]) or scale_a.shape == torch.Size( [M, 1]) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index dfae4db71e546..8fcbda377428e 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -133,23 +133,6 @@ def _find_first_match(value: str, return None -def get_compressed_tensors_cache_scale(name: str) -> Optional[str]: - """ - Check whether the param name matches the format for k/v cache scales - in compressed-tensors. 
If this is the case, return its equivalent - param name expected by vLLM - - :param name: param name - :return: matching param name for KV cache scale in vLLM - """ - if name.endswith(".output_scale") and ".k_proj" in name: - return name.replace(".k_proj.output_scale", ".attn.k_scale") - if name.endswith(".output_scale") and ".v_proj" in name: - return name.replace(".v_proj.output_scale", ".attn.v_scale") - # If no matches, return None - return None - - def _is_equal_or_regex_match(value: str, target: str, check_contains: bool = False) -> bool: diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index a1be45a49e94a..26dd5df4e55b2 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -247,6 +247,15 @@ class Fp8LinearMethod(LinearMethodBase): def process_weights_after_loading(self, layer: Module) -> None: # Block quant doesn't need to process weights after loading if self.block_quant: + if current_platform.is_rocm(): + weight, weight_scale, _ = \ + normalize_e4m3fn_to_e4m3fnuz( + weight=layer.weight, + weight_scale=layer.weight_scale_inv, + input_scale=layer.input_scale) + layer.weight = Parameter(weight, requires_grad=False) + layer.weight_scale_inv = Parameter(weight_scale, + requires_grad=False) return layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False) @@ -355,7 +364,8 @@ class Fp8LinearMethod(LinearMethodBase): input_scale=layer.input_scale, bias=bias, cutlass_fp8_supported=self.cutlass_fp8_supported, - use_per_token_if_dynamic=False) + # Default to using per_token quantization if cutlass is supported + use_per_token_if_dynamic=self.cutlass_fp8_supported) class Fp8MoEMethod(FusedMoEMethodBase): @@ -494,6 +504,30 @@ class Fp8MoEMethod(FusedMoEMethodBase): def process_weights_after_loading(self, layer: Module) -> None: # Block quant doesn't need to process weights after loading if self.block_quant: + if current_platform.is_rocm(): + w13_weight, w13_weight_scale_inv, w13_input_scale = \ + normalize_e4m3fn_to_e4m3fnuz( + layer.w13_weight, layer.w13_weight_scale_inv, + layer.w13_input_scale) + w2_weight, w2_weight_scale_inv, w2_input_scale = \ + normalize_e4m3fn_to_e4m3fnuz( + layer.w2_weight, layer.w2_weight_scale_inv, + layer.w2_input_scale) + # Reset the parameter + layer.w13_weight = torch.nn.Parameter(w13_weight, + requires_grad=False) + layer.w13_weight_scale_inv = torch.nn.Parameter( + w13_weight_scale_inv, requires_grad=False) + if w13_input_scale is not None: + layer.w13_input_scale = torch.nn.Parameter( + w13_input_scale, requires_grad=False) + layer.w2_weight = torch.nn.Parameter(w2_weight, + requires_grad=False) + layer.w2_weight_scale_inv = torch.nn.Parameter( + w2_weight_scale_inv, requires_grad=False) + if w2_input_scale is not None: + layer.w2_input_scale = torch.nn.Parameter( + w2_input_scale, requires_grad=False) return # If checkpoint is fp16, quantize in place. 
if not self.quant_config.is_checkpoint_fp8_serialized: diff --git a/vllm/model_executor/layers/quantization/quark/__init__.py b/vllm/model_executor/layers/quantization/quark/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py new file mode 100644 index 0000000000000..fc214255eca71 --- /dev/null +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -0,0 +1,387 @@ +import fnmatch +import re +from typing import Any, Dict, List, Optional, cast + +import torch + +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, + UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 + QuantizationConfig, QuantizeMethodBase) +from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod +from vllm.model_executor.layers.quantization.quark.quark_moe import ( # noqa: E501 + QuarkMoEMethod) +from vllm.model_executor.layers.quantization.quark.schemes import ( + QuarkScheme, QuarkW8A8Fp8, QuarkW8A8Int8) +from vllm.model_executor.layers.quantization.quark.utils import ( + deep_compare, should_ignore_layer) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + FUSED_LAYER_NAME_MAPPING) +from vllm.platforms import current_platform + +__all__ = ["QuarkLinearMethod"] + + +class QuarkConfig(QuantizationConfig): + + def __init__(self, + quant_config: Dict[str, Any], + kv_cache_group: Optional[List[str]] = None, + kv_cache_config: Optional[Dict[str, Any]] = None, + pack_method: str = "reorder"): + if kv_cache_group is None: + kv_cache_group = [] + self.quant_config = quant_config + self.kv_cache_group = kv_cache_group + self.kv_cache_config = kv_cache_config + self.pack_method = pack_method + + def get_linear_method(self) -> "QuarkLinearMethod": + return QuarkLinearMethod(self) + + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.float16, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + return 70 + + def get_name(self) -> str: + return "quark" + + def get_quant_method(self, layer: torch.nn.Module, + prefix: str) -> Optional["QuantizeMethodBase"]: + from vllm.attention.layer import Attention # Avoid circular import + + # Check if the layer is skipped for quantization. + exclude_layers = cast(List[str], self.quant_config.get("exclude")) + if should_ignore_layer(prefix, ignore=exclude_layers): + return UnquantizedLinearMethod() + if isinstance(layer, LinearBase): + scheme = self.get_scheme(layer=layer, layer_name=prefix) + layer.scheme = scheme + return QuarkLinearMethod(self) + if isinstance(layer, Attention): + return QuarkKVCacheMethod(self) + if isinstance(layer, FusedMoE): + return QuarkMoEMethod.get_moe_method(self, + module=layer, + layer_name=prefix) + return None + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "QuarkConfig": + export_config = config.get("export") + if export_config is None: + raise ValueError("The export key should be included in " + "the configurations of Quark quantized model") + kv_cache_group = cast(List[str], export_config.get("kv_cache_group")) + pack_method = cast(str, export_config.get("pack_method")) + + # In the export model of quark, the quantization configuration + # of kv_cache is stored in layer_quant_config. 
First, it is + # judged whether kv_cache_group exists, and then it is judged + # whether layer_quant_config has a quantization configuration + # that matches kv_cache. + if len(kv_cache_group) == 0: + kv_cache_config = None + else: + kv_cache_set = set(kv_cache_group) + layer_quant_config = cast(Dict[str, Any], + config.get("layer_quant_config")) + layer_quant_names = list(layer_quant_config.keys()) + layer_quant_set = set(layer_quant_names) + + if not kv_cache_set.issubset(layer_quant_set): + raise ValueError("The Quark quantized model has the " + "kv_cache_group parameter setting, " + "but no kv_cache quantization settings " + "were found in the quantization " + "configuration.") + + q_configs = [ + cast(Dict[str, Any], layer_quant_config.get(name)) + for name in kv_cache_group + ] + if not all( + deep_compare(q_config, q_configs[0]) + for q_config in q_configs): + raise ValueError( + "The quantization method used for kv_cache should " + "be the same, but the quantization method for the " + "kv_cache layer in the config is different.") + kv_cache_config = q_configs[0].get("output_tensors") + if kv_cache_config is None: + raise ValueError( + "The kv_cache quantization configuration is empty.") + + # Since we have already set kv_cache quantization configurations, + # we will remove the quantization configuration for the + # output_tensors corresponding to the kv_cache layer. + for q_config in q_configs: + q_config["output_tensors"] = None + + return cls(quant_config=config, + kv_cache_group=kv_cache_group, + kv_cache_config=kv_cache_config, + pack_method=pack_method) + + @classmethod + def get_config_filenames(cls) -> List[str]: + return [] + + def _check_scheme_supported(self, + min_capability: int, + error: bool = True) -> bool: + capability_tuple = current_platform.get_device_capability() + + if capability_tuple is not None: + capability = capability_tuple.to_int() + supported = capability >= min_capability + if error and not supported: + raise RuntimeError( + "Quantization scheme is not supported for ", + f"the current GPU. Min capability: {min_capability}. ", + f"Current capability: {capability}.") + return supported + else: + return False + + def _is_fp8_w8a8(self, weight_quant: Optional[Dict[str, Any]], + input_quant: Optional[Dict[str, Any]]) -> bool: + # Confirm weights and input quantized. + if weight_quant is None or input_quant is None: + return False + + # Confirm weight scheme is supported + is_fp8_dtype = (weight_quant.get("dtype") == "fp8_e4m3" + and input_quant.get("dtype") == "fp8_e4m3") + is_static_weight = not weight_quant.get("is_dynamic") + is_per_tensor_or_channel_weight = (weight_quant.get("qscheme") + in ["per_tensor", "per_channel"]) + + if not (is_fp8_dtype and is_static_weight + and is_per_tensor_or_channel_weight): + return False + + # Dynamic quantization is always supported if weights supported. + if input_quant.get("is_dynamic"): + return True + + # Confirm activation scheme is supported. + is_per_tensor_activation = (input_quant.get("qscheme") == "per_tensor") + return is_per_tensor_activation + + def _is_static_tensor_w8a8(self, weight_quant: Optional[Dict[str, Any]], + input_quant: Optional[Dict[str, Any]]) -> bool: + # Confirm weights and input quantized. 
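+        # e.g. a (hypothetical) config pair that this predicate accepts:
+        #
+        #   weight_quant = {"dtype": "int8", "qscheme": "per_channel",
+        #                   "is_dynamic": False, "symmetric": True}
+        #   input_quant = {"dtype": "int8", "qscheme": "per_tensor",
+        #                  "is_dynamic": False, "symmetric": False}
+        #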
+ if weight_quant is None or input_quant is None: + return False + + is_int8_dtype = (weight_quant.get("dtype") == "int8" + and input_quant.get("dtype") == "int8") + + is_tensor = (weight_quant.get("qscheme") + in ["per_tensor", "per_channel"] + and input_quant.get("qscheme") == "per_tensor") + + is_static = (not weight_quant.get("is_dynamic") + and not input_quant.get("is_dynamic")) + + is_weight_symmetric = (weight_quant.get("symmetric") is True) + + # Both symmetric and asymmetric input quantization supported. + # Only symmetric weight quantization supported. + return is_int8_dtype and is_tensor and is_weight_symmetric and is_static + + def _find_matched_config(self, layer_name: str, + module: torch.nn.Module) -> Dict[str, Any]: + + proj_name = layer_name.split(".")[-1] + if proj_name in FUSED_LAYER_NAME_MAPPING: + shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name] + + # Convert fused_name --> [shard_names] + shard_names = [ + layer_name.replace(proj_name, shard_proj_name) + for shard_proj_name in shard_proj_names + ] + shard_configs = [ + self._find_matched_config(shard_name, module) + for shard_name in shard_names + ] + if not all( + deep_compare(q_config, shard_configs[0]) + for q_config in shard_configs): + raise ValueError( + f"Found a different quantization configuration for " + f"{shard_proj_names} in {layer_name}. vLLM " + "requires all to use the same scheme.") + return shard_configs[0] + else: + layer_quant_config = cast( + Dict[str, Any], self.quant_config.get("layer_quant_config")) + for name_pattern in layer_quant_config: + if fnmatch.fnmatch(layer_name, name_pattern): + return layer_quant_config[name_pattern] + + layer_type = cast(str, type(module)) + layer_type_quant_config = cast( + Dict[str, Any], + self.quant_config.get("layer_type_quant_config")) + if layer_type in layer_type_quant_config: + return layer_type_quant_config[layer_type] + + global_quant_config = cast( + Dict[str, Any], self.quant_config.get("global_quant_config")) + return global_quant_config + + def _get_scheme_from_config(self, config: Dict[str, Any]) -> "QuarkScheme": + if config.get("output_tensors") or config.get("bias"): + raise NotImplementedError( + "Currently, Quark models with output_tensors " + "and bias quantized are not supported") + weight_config = cast(Dict[str, Any], config.get("weight")) + input_config = cast(Dict[str, Any], config.get("input_tensors")) + + if self._is_fp8_w8a8(weight_config, input_config): + is_fp8_w8a8_supported = self._check_scheme_supported( + QuarkW8A8Fp8.get_min_capability(), error=False) + if is_fp8_w8a8_supported: + weight_qscheme = cast(str, weight_config.get("qscheme")) + input_static = (input_config is not None and + not cast(bool, input_config.get("is_dynamic"))) + return QuarkW8A8Fp8(qscheme=weight_qscheme, + is_static_input_scheme=input_static) + elif self._is_static_tensor_w8a8(weight_config, input_config): + weight_qscheme = cast(str, weight_config.get("qscheme")) + return QuarkW8A8Int8(qscheme=weight_qscheme, + is_static_input_scheme=True, + input_symmetric=input_config.get("symmetric")) + + raise NotImplementedError("No quark compatible scheme was found. " + f"Weight config: {weight_config}, " + f"Input config: {input_config}") + + def get_scheme(self, layer: torch.nn.Module, + layer_name: str) -> "QuarkScheme": + + layer_quant_config = self._find_matched_config(layer_name, layer) + + # Find the quant_scheme + scheme = self._get_scheme_from_config(layer_quant_config) + # Raise error if device does not support the scheme + # (e.g. 
fp8 needs ada lovelace) + self._check_scheme_supported(scheme.get_min_capability()) + + return scheme + + def get_cache_scale(self, name: str) -> Optional[str]: + """ + Check whether the param name matches the format for k/v cache scales + in quark. If this is the case, return its equivalent param name + expected by vLLM + + :param name: param name + :return: matching param name for KV cache scale in vLLM + """ + if self.kv_cache_group is None or len(self.kv_cache_group) == 0: + return None + + kv_proj_names = [ + re.split(r"[*.]", kv_cache)[-1] for kv_cache in self.kv_cache_group + ] + if name.endswith(".output_scale"): + if len(kv_proj_names) == 1 and kv_proj_names[0] in name: + kv_output_scale_name = "." + kv_proj_names[0] + ".output_scale" + return name.replace(kv_output_scale_name, ".attn.k_scale") + + elif len(kv_proj_names) == 2: + for kv_proj_name in kv_proj_names: + if kv_proj_name in name and kv_proj_name == "k_proj": + return name.replace(".k_proj.output_scale", + ".attn.k_scale") + elif kv_proj_name in name and kv_proj_name == "v_proj": + return name.replace(".v_proj.output_scale", + ".attn.v_scale") + + # If no matches, return None + return None + + +class QuarkLinearMethod(LinearMethodBase): + + def __init__(self, quantization_config: QuarkConfig): + self.quantization_config = quantization_config + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + layer.scheme.process_weights_after_loading(layer) + + def create_weights(self, layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], input_size: int, + output_size: int, params_dtype: torch.dtype, + **extra_weight_attrs): + """ + Use the CompressedTensorsScheme associated with each layer to create + the necessary parameters for the layer. See LinearMethodBase for param + details + """ + weight_loader = extra_weight_attrs.get("weight_loader") + layer.scheme.create_weights( + layer=layer, + input_size=input_size, + input_size_per_partition=input_size_per_partition, + output_partition_sizes=output_partition_sizes, + output_size=output_size, + params_dtype=params_dtype, + weight_loader=weight_loader) + + def apply(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None): + """ + Use the output of create_weights and the CompressedTensorsScheme + associated with the layer to apply the forward pass with the + layer input. See LinearMethodBase for param details + + """ + scheme = layer.scheme + if scheme is None: + raise ValueError("A scheme must be defined for each layer") + return scheme.apply_weights(layer, x, bias=bias) + + +class QuarkKVCacheMethod(BaseKVCacheMethod): + """ + Supports loading kv-cache scaling factors from quark checkpoints. + """ + + def __init__(self, quant_config: QuarkConfig): + self.validate_kv_cache_config(quant_config.kv_cache_config) + super().__init__(quant_config) + + @staticmethod + def validate_kv_cache_config(kv_cache_config: Optional[Dict[str, Any]]): + """ + Validator for the kv cache configuration. 
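+        For example, a scheme this validator accepts (hypothetical values):
+        {"dtype": "fp8_e4m3", "qscheme": "per_tensor"}.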
Useful for controlling the + kv cache quantization schemes, that are being supported in vLLM + :param kv_cache_config: the quark kv cache scheme + """ + if kv_cache_config is None: + return + + dtype = kv_cache_config.get("dtype") + if dtype != "fp8_e4m3": + raise NotImplementedError( + "Currently supported kv cache quantization is " + f"dtype=fp8_e4m3, however received {dtype}") + + qscheme = kv_cache_config.get("qscheme") + if qscheme != "per_tensor": + raise NotImplementedError( + "Only support per-tensor scaling factor " + "for quark KV cache. " + f"Expected qscheme: per_tensor, found qscheme: {qscheme}") diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py new file mode 100644 index 0000000000000..3e19247300808 --- /dev/null +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -0,0 +1,225 @@ +from typing import Any, Callable, Dict, Optional + +import torch + +import vllm.model_executor.layers.fused_moe # noqa +from vllm import _custom_ops as ops +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase, + FusedMoeWeightScaleSupported) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize) +from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform + +logger = init_logger(__name__) + +__all__ = ["QuarkMoEMethod", "QuarkW8A8Fp8MoEMethod"] + + +class QuarkMoEMethod(FusedMoEMethodBase): + + @staticmethod + def get_moe_method( + quant_config: "QuarkConfig", # type: ignore # noqa E501 # noqa F821 + module: torch.nn.Module, + layer_name: str) -> "QuarkMoEMethod": + layer_quant_config = quant_config._find_matched_config( + layer_name, module) + + if (layer_quant_config.get("output_tensors") + or layer_quant_config.get("bias")): + raise NotImplementedError("Currently, Quark models with " + "output_tensors and bias " + "quantized are not supported") + weight_config = layer_quant_config.get("weight") + input_config = layer_quant_config.get("input_tensors") + + if quant_config._is_fp8_w8a8(weight_config, input_config): + return QuarkW8A8Fp8MoEMethod(weight_config, input_config) + else: + raise RuntimeError("Unsupported FusedMoe scheme") + + +class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): + + def __init__(self, weight_config: Dict[str, Any], input_config: Dict[str, + Any]): + self.weight_quant = weight_config + self.input_quant = input_config + + weight_qscheme = self.weight_quant.get("qscheme") + input_qscheme = self.input_quant.get("qscheme") + if not (weight_qscheme == "per_tensor" + and input_qscheme == "per_tensor"): + raise ValueError( + "For FP8 Fused MoE layers, only per-tensor scales" + "for weights and activations are supported. 
Found " + f"{weight_qscheme}, {input_qscheme}") # noqa E501 + + self.static_input_scales = not self.input_quant.get("is_dynamic") + + def create_weights(self, layer: torch.nn.Module, num_experts: int, + hidden_size: int, intermediate_size: int, + params_dtype: torch.dtype, **extra_weight_attrs): + + params_dtype = torch.float8_e4m3fn + + # WEIGHTS + w13_weight = torch.nn.Parameter(torch.empty(num_experts, + 2 * intermediate_size, + hidden_size, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + w2_weight = torch.nn.Parameter(torch.empty(num_experts, + hidden_size, + intermediate_size, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + # WEIGHT_SCALES + # Allocate 2 scales for w1 and w3 respectively. + # They will be combined to a single scale after weight loading. + w13_weight_scale = torch.nn.Parameter(torch.ones(num_experts, + 2, + dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + + w2_weight_scale = torch.nn.Parameter(torch.ones(num_experts, + dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + # Add the quantization method used (per tensor/grouped/channel) + # to ensure the weight scales are loaded in properly + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + # INPUT_SCALES + if self.static_input_scales: + w13_input_scale = torch.nn.Parameter(torch.ones( + num_experts, dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w13_input_scale", w13_input_scale) + set_weight_attrs(w13_input_scale, extra_weight_attrs) + + w2_input_scale = torch.nn.Parameter(torch.ones( + num_experts, dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w2_input_scale", w2_input_scale) + set_weight_attrs(w2_input_scale, extra_weight_attrs) + else: + layer.w13_input_scale = None + layer.w2_input_scale = None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + # Fp8 moe kernels require a single activation scale. + # We take the max of all the scales in case they differ. + if self.static_input_scales: + if (layer.w13_input_scale is None or layer.w2_input_scale is None): + raise ValueError( + "QuantConfig has static quantization, but found " + "activation scales are None.") + if (not all_close_1d(layer.w13_input_scale) + or not all_close_1d(layer.w2_input_scale)): + logger.warning_once( + "Found input_scales that are not equal for " + "fp8 MoE layer. Using the maximum across experts " + "for each layer. 
") + layer.w13_input_scale = torch.nn.Parameter( + layer.w13_input_scale.max(), requires_grad=False) + layer.w2_input_scale = torch.nn.Parameter( + layer.w2_input_scale.max(), requires_grad=False) + + # If rocm, normalize the weights and scales to e4m3fnuz + if current_platform.is_rocm(): + # Normalize the weights and scales + w13_weight, w13_weight_scale, w13_input_scale = \ + normalize_e4m3fn_to_e4m3fnuz( + layer.w13_weight, layer.w13_weight_scale, + layer.w13_input_scale) + w2_weight, w2_weight_scale, w2_input_scale = \ + normalize_e4m3fn_to_e4m3fnuz( + layer.w2_weight, layer.w2_weight_scale, + layer.w2_input_scale) + # Reset the parameter + layer.w13_weight = torch.nn.Parameter(w13_weight, + requires_grad=False) + layer.w13_weight_scale = torch.nn.Parameter(w13_weight_scale, + requires_grad=False) + if w13_input_scale is not None: + layer.w13_input_scale = torch.nn.Parameter(w13_input_scale, + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(w2_weight, + requires_grad=False) + layer.w2_weight_scale = torch.nn.Parameter(w2_weight_scale, + requires_grad=False) + if w2_input_scale is not None: + layer.w2_input_scale = torch.nn.Parameter(w2_input_scale, + requires_grad=False) + + # Fp8 moe kernel needs single weight scale for w13 per expert. + # We take the max then dequant and requant each expert. + assert layer.w13_weight_scale is not None + shard_size = layer.intermediate_size_per_partition + max_w13_scales = layer.w13_weight_scale.max(dim=1).values + for expert_id in range(layer.num_experts): + start = 0 + for shard_id in range(2): + dq_weight = per_tensor_dequantize( + layer.w13_weight[expert_id][start:start + shard_size, :], + layer.w13_weight_scale[expert_id][shard_id]) + layer.w13_weight[expert_id][ + start:start + shard_size, :], _ = ops.scaled_fp8_quant( + dq_weight, max_w13_scales[expert_id]) + start += shard_size + + layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales, + requires_grad=False) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + from vllm.model_executor.layers.fused_moe import fused_experts + + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias) + + return fused_experts(x, + layer.w13_weight, + layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + use_fp8_w8a8=True, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale) diff --git a/vllm/model_executor/layers/quantization/quark/schemes/__init__.py b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py new file mode 100644 index 0000000000000..fb0ba9bd5220c --- /dev/null +++ b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py @@ -0,0 +1,5 @@ +from .quark_scheme import QuarkScheme +from .quark_w8a8_fp8 import QuarkW8A8Fp8 +from .quark_w8a8_int8 import QuarkW8A8Int8 + +__all__ = 
["QuarkScheme", "QuarkW8A8Fp8", "QuarkW8A8Int8"] diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py new file mode 100644 index 0000000000000..239597fa4be0e --- /dev/null +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py @@ -0,0 +1,52 @@ +from abc import ABC, abstractmethod +from typing import Optional + +import torch + +__all__ = ["QuarkScheme"] + + +class QuarkScheme(ABC): + """ + Abstract class used to describe the weight creation and forward pass + of different quantization schemes supported by Quark. + """ + + @classmethod + @abstractmethod + def get_min_capability(cls) -> int: + """ + Get minimum device capability. + """ + raise NotImplementedError + + @abstractmethod + def create_weights(self, *args, **kwargs): + """ + Weight creation for the particular scheme. Inputs to this function + + """ + raise NotImplementedError + + @abstractmethod + def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, + bias: Optional[torch.Tensor]): + """ + Run the forward pass for the particular scheme. This is where + scheme-specific dequant/quant steps/kernels should be applied. + + :param layer: torch.nn.Module with the registered weights and + other parameters relevant to the particular scheme. + :param x: input to the layer + :param bias: bias parameter + + """ + raise NotImplementedError + + @abstractmethod + def process_weights_after_loading(self, layer: torch.nn.Module): + """ + Called after weight loading is complete for any cleanup that + needs to occur. + """ + raise NotImplementedError diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py new file mode 100644 index 0000000000000..206931ea2ffc0 --- /dev/null +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py @@ -0,0 +1,140 @@ +from typing import Callable, List, Optional + +import torch +from torch.nn import Parameter + +from vllm.model_executor.layers.quantization.quark.schemes import QuarkScheme +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + apply_fp8_linear, cutlass_fp8_supported, normalize_e4m3fn_to_e4m3fnuz, + requantize_with_max_scale) +from vllm.model_executor.parameter import (ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter) +from vllm.platforms import current_platform + +__all__ = ["QuarkW8A8Fp8"] + + +class QuarkW8A8Fp8(QuarkScheme): + + def __init__(self, qscheme: str, is_static_input_scheme: Optional[bool]): + self.qscheme = qscheme + self.is_static_input_scheme = is_static_input_scheme + self.cutlass_fp8_supported = cutlass_fp8_supported() + + @classmethod + def get_min_capability(cls) -> int: + # lovelace and up + return 89 + + def process_weights_after_loading(self, layer) -> None: + # If per tensor, when we have a fused module (e.g. 
QKV) with per + # tensor scales (thus N scales being passed to the kernel), + # requantize so we can always run per tensor + if self.qscheme == "per_tensor": + max_w_scale, weight = requantize_with_max_scale( + weight=layer.weight, + weight_scale=layer.weight_scale, + logical_widths=layer.logical_widths, + ) + + if current_platform.is_rocm(): + weight, max_w_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( + weight=weight, + weight_scale=max_w_scale, + input_scale=layer.input_scale) + if input_scale is not None: + layer.input_scale = Parameter(input_scale, + requires_grad=False) + + layer.weight = Parameter(weight.t(), requires_grad=False) + layer.weight_scale = Parameter(max_w_scale, requires_grad=False) + + # If channelwise, scales are already lined up, so just transpose. + elif self.qscheme == "per_channel": + weight = layer.weight + + if current_platform.is_rocm(): + weight, weight_scale, input_scale = \ + normalize_e4m3fn_to_e4m3fnuz( + weight=weight, + weight_scale=layer.weight_scale, + input_scale=layer.input_scale) + if input_scale is not None: + layer.input_scale = Parameter(input_scale, + requires_grad=False) + else: + weight_scale = layer.weight_scale.data + + layer.weight = Parameter(weight.t(), requires_grad=False) + # required by torch.compile to be torch.nn.Parameter + layer.weight_scale = Parameter(weight_scale, requires_grad=False) + + else: + raise ValueError(f"Unknown quantization scheme {self.qscheme}") + + # INPUT SCALE + if self.is_static_input_scheme: + layer.input_scale = Parameter(layer.input_scale.max(), + requires_grad=False) + else: + layer.input_scale = None + + def create_weights(self, layer: torch.nn.Module, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, weight_loader: Callable, + **kwargs): + output_size_per_partition = sum(output_partition_sizes) + layer.logical_widths = output_partition_sizes + + # WEIGHT + weight = ModelWeightParameter(data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) + layer.register_parameter("weight", weight) + + # WEIGHT SCALE + # TODO: update create_xxx_parameter functions to return + # the newly added parameters + if self.qscheme == "per_channel": + weight_scale = ChannelQuantScaleParameter( + data=torch.empty((sum(output_partition_sizes), 1), + dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader) + else: + assert self.qscheme == "per_tensor" + weight_scale = PerTensorScaleParameter(data=torch.empty( + len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader) + + # min requirement for fp8 kernels + weight_scale[:] = torch.finfo(torch.float32).min + layer.register_parameter("weight_scale", weight_scale) + + # INPUT SCALE + if self.is_static_input_scheme: + input_scale = PerTensorScaleParameter(data=torch.empty( + len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader) + input_scale[:] = torch.finfo(torch.float32).min + layer.register_parameter("input_scale", input_scale) + + def apply_weights(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + + return apply_fp8_linear( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=layer.input_scale, + bias=bias, + cutlass_fp8_supported=self.cutlass_fp8_supported, + use_per_token_if_dynamic=True) diff --git 
a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py new file mode 100644 index 0000000000000..8cb47e9c37e56 --- /dev/null +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py @@ -0,0 +1,105 @@ +from typing import Callable, List, Optional, Set + +import torch + +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.kernels.scaled_mm import ( + ScaledMMLinearLayerConfig, choose_scaled_mm_linear_kernel) +from vllm.model_executor.layers.quantization.quark.schemes import QuarkScheme +from vllm.model_executor.parameter import (BasevLLMParameter, + ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter) + +logger = init_logger(__name__) + + +class QuarkW8A8Int8(QuarkScheme): + _kernel_backends_being_used: Set[str] = set() + + def __init__(self, qscheme: str, is_static_input_scheme: Optional[bool], + input_symmetric: Optional[bool]): + self.qscheme = qscheme + self.is_static_input_scheme = is_static_input_scheme + self.input_symmetric = input_symmetric + + @classmethod + def get_min_capability(cls) -> int: + # turing and up + return 75 + + def create_weights(self, layer: torch.nn.Module, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, weight_loader: Callable, + **kwargs): + self.logical_widths = output_partition_sizes + + scaled_mm_linear_kernel_config = ScaledMMLinearLayerConfig( + is_channelwise=(self.qscheme == "per_channel"), + is_static_input_scheme=(self.is_static_input_scheme is True), + input_symmetric=(self.input_symmetric is True)) + + kernel_type = choose_scaled_mm_linear_kernel( + scaled_mm_linear_kernel_config) + + if kernel_type.__name__ not in self._kernel_backends_being_used: + logger.info("Using %s for QuarkW8A8Int8", kernel_type.__name__) + self._kernel_backends_being_used.add(kernel_type.__name__) + + # WEIGHT + weight = ModelWeightParameter(data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition, + dtype=torch.int8), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) + + layer.register_parameter("weight", weight) + + # WEIGHT SCALE + if self.qscheme == "per_channel": + weight_scale = ChannelQuantScaleParameter( + data=torch.empty((sum(output_partition_sizes), 1), + dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader) + else: + assert self.qscheme == "per_tensor" + weight_scale = PerTensorScaleParameter(data=torch.empty( + len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader) + layer.register_parameter("weight_scale", weight_scale) + + # INPUT SCALE + if self.is_static_input_scheme: + input_scale = BasevLLMParameter(data=torch.empty( + 1, dtype=torch.float32), + weight_loader=weight_loader) + layer.register_parameter("input_scale", input_scale) + + if not self.input_symmetric: + # Note: quark stores the zp using the same dtype + # as the weights + # AZP loaded as int8 but used as int32 + input_zero_point = BasevLLMParameter( + data=torch.empty(1, dtype=torch.int8), + weight_loader=weight_loader) + layer.register_parameter("input_zero_point", input_zero_point) + + self.kernel = kernel_type(c=scaled_mm_linear_kernel_config, + w_q_param_name="weight", + w_s_param_name="weight_scale", + i_s_param_name="input_scale", + i_zp_param_name="input_zero_point", + azp_adj_param_name="azp_adj") + + # Checkpoints are serialized in quark format, which is + # different from the format the kernel may 
want. Handle repacking here. + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + self.kernel.process_weights_after_loading(layer) + + def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, + bias: Optional[torch.Tensor]) -> torch.Tensor: + return self.kernel.apply_weights(layer, x, bias) diff --git a/vllm/model_executor/layers/quantization/quark/utils.py b/vllm/model_executor/layers/quantization/quark/utils.py new file mode 100644 index 0000000000000..742a629bdb1c5 --- /dev/null +++ b/vllm/model_executor/layers/quantization/quark/utils.py @@ -0,0 +1,99 @@ +import re +from typing import Any, Iterable, Optional + +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + FUSED_LAYER_NAME_MAPPING) + + +def deep_compare(dict1: Any, dict2: Any) -> bool: + if type(dict1) is not type(dict2): + return False + if isinstance(dict1, dict): + if dict1.keys() != dict2.keys(): + return False + return all(deep_compare(dict1[k], dict2[k]) for k in dict1) + elif isinstance(dict1, list): + return set(dict1) == set(dict2) + else: + return dict1 == dict2 + + +def should_ignore_layer(layer_name: Optional[str], + ignore: Iterable[str]) -> bool: + if layer_name is None: + return False + + # layer_name = model.layers.0.self_attn.qkv_proj + # proj_name = qkv_proj + proj_name = layer_name.split(".")[-1] + + # Fused layers like gate_up_proj or qkv_proj will not be fused + # in the safetensors checkpoint. So, we convert the name + # from the fused version to unfused + check to make sure that + # each shard of the fused layer has the same scheme. + if proj_name in FUSED_LAYER_NAME_MAPPING: + shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name] + + # Convert fused_name --> [shard_names] + shard_names = [ + layer_name.replace(proj_name, shard_proj_name) + for shard_proj_name in shard_proj_names + ] + + # Layer should be ignored if shards are ignored. + should_ignore_layer = None + for shard_name in shard_names: + should_ignore_shard = check_equal_or_regex_match( + layer_name=shard_name, targets=ignore) + + # If shard_idx=0, set layer ignore to match shard. + if should_ignore_layer is None: + should_ignore_layer = should_ignore_shard + + # If shard_idx=1+ confirm scheme matches prior shards. + elif should_ignore_shard != should_ignore_layer: + raise ValueError(f"Found a different quantization schemes for " + f"{shard_proj_names} in {layer_name}. vLLM " + "requires all to use the same scheme.") + + # Unfused layers like down_proj and o_proj will match + # the safetensors checkpoint already. + else: + should_ignore_layer = check_equal_or_regex_match(layer_name=layer_name, + targets=ignore) + + assert should_ignore_layer is not None + return should_ignore_layer + + +def check_equal_or_regex_match(layer_name: str, + targets: Iterable[str]) -> bool: + """ + Checks whether a layer_name is exactly equal or a regex match for + if target starts with 're:' to any target in list. + """ + for target in targets: + if _is_equal_or_regex_match(layer_name, target): + return True + return False + + +def _is_equal_or_regex_match(value: str, + target: str, + check_contains: bool = False) -> bool: + """ + Checks whether a value is exactly equal or a regex match for target + if target starts with 're:'. If check_contains is set to True, + additionally checks if the target string is contained within the value. 
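+
+    For illustration (hypothetical names):
+
+        _is_equal_or_regex_match("model.lm_head", "re:.*lm_head")   # True
+        _is_equal_or_regex_match("model.lm_head", "model.lm_head")  # True
+        _is_equal_or_regex_match("model.lm_head", "lm_head")        # False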
+ """ + + if target.startswith("re:"): + pattern = target[3:] + if re.match(pattern, value): + return True + elif check_contains: + if target.lower() in value.lower(): + return True + elif target == value: + return True + return False diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index f3c3e130e4161..b6882cc7c837c 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -5,6 +5,8 @@ import torch import triton import triton.language as tl +from vllm.platforms import current_platform + def apply_w8a8_block_fp8_linear( input: torch.Tensor, @@ -33,11 +35,14 @@ def apply_w8a8_block_fp8_linear( def input_to_float8( - x: torch.Tensor, - dtype: torch.dtype = torch.float8_e4m3fn + x: torch.Tensor, + dtype: Optional[torch.dtype] = None ) -> Tuple[torch.Tensor, torch.Tensor]: """This function quantizes input values to float8 values " "with tensor-wise quantization.""" + if dtype is None: + dtype = (torch.float8_e4m3fnuz + if current_platform.is_rocm() else torch.float8_e4m3fn) finfo = torch.finfo(dtype) min_val, max_val = x.aminmax() amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12) @@ -125,7 +130,7 @@ def per_token_group_quant_fp8( x: torch.Tensor, group_size: int, eps: float = 1e-10, - dtype: torch.dtype = torch.float8_e4m3fn, + dtype: Optional[torch.dtype] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: """Function to perform per-token-group quantization on an input tensor `x`. It converts the tensor values into signed float8 values and returns the @@ -140,6 +145,9 @@ def per_token_group_quant_fp8( Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor for quantization. """ + if dtype is None: + dtype = (torch.float8_e4m3fnuz + if current_platform.is_rocm() else torch.float8_e4m3fn) assert (x.shape[-1] % group_size == 0), ( f"the last dimension of `x` {x.shape[-1]} must be divisible " f"by `group_size` {group_size}") diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index f173cbde03f44..9d6c3797c62fc 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -9,6 +9,7 @@ import vllm.envs as envs from vllm.logger import init_logger from vllm.model_executor.layers.spec_decode_base_sampler import ( SpecDecodeStochasticBaseSampler) +from vllm.platforms import current_platform logger = init_logger(__name__) @@ -368,7 +369,7 @@ class RejectionSampler(SpecDecodeStochasticBaseSampler): # Note that we always sample with replacement. # probs will be modified in place, but this is fine, as we pass # in a copy already. 
-@torch.compile(dynamic=True) +@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) def _multinomial( probs: torch.Tensor, num_samples: int, diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 3fcd81a3c4213..d071cfe888f05 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -841,6 +841,37 @@ class MRotaryEmbedding(RotaryEmbedding): ) -> Tuple[List[List[int]], int]: """Get mrope input positions and delta value.""" + llm_positions, mrope_position_delta = \ + MRotaryEmbedding.get_input_positions_tensor( + input_tokens, + image_grid_thw, + video_grid_thw, + image_token_id, + video_token_id, + vision_start_token_id, + vision_end_token_id, + spatial_merge_size, + context_len, + seq_len, + ) + + return llm_positions.tolist(), mrope_position_delta + + @staticmethod + def get_input_positions_tensor( + input_tokens: List[int], + image_grid_thw: Union[List[List[int]], torch.Tensor], + video_grid_thw: Union[List[List[int]], torch.Tensor], + image_token_id: int, + video_token_id: int, + vision_start_token_id: int, + vision_end_token_id: int, + spatial_merge_size: int, + context_len: int = 0, + seq_len: Optional[int] = None, + ) -> Tuple[torch.Tensor, int]: + """Get mrope input positions and delta value.""" + if isinstance(image_grid_thw, torch.Tensor): image_grid_thw = image_grid_thw.tolist() if isinstance(video_grid_thw, torch.Tensor): @@ -916,7 +947,7 @@ class MRotaryEmbedding(RotaryEmbedding): len(input_tokens)).item() llm_positions = llm_positions[:, context_len:seq_len] - return llm_positions.tolist(), mrope_position_delta + return llm_positions, mrope_position_delta @staticmethod def get_next_input_positions( @@ -930,6 +961,17 @@ class MRotaryEmbedding(RotaryEmbedding): seq_len + mrope_position_delta)) for _ in range(3) ] + @staticmethod + def get_next_input_positions_tensor( + mrope_position_delta: int, + context_len: int, + seq_len: int, + ) -> torch.Tensor: + return torch.arange( + mrope_position_delta + context_len, + mrope_position_delta + seq_len, + ).expand(3, -1) + _ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {} diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 30548e656c557..65920aa61ba15 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -133,7 +133,7 @@ class VocabParallelEmbeddingShardIndices: assert self.num_added_elements <= self.num_added_elements_padded -@torch.compile(dynamic=True) +@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) def get_masked_input_and_mask( input_: torch.Tensor, org_vocab_start_index: int, org_vocab_end_index: int, num_org_vocab_padding: int, diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 0033fbff0e9ac..f697c3245f098 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -39,7 +39,8 @@ from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.model_loader.tensorizer import ( TensorizerConfig, is_vllm_tensorized, load_with_tensorizer, serialize_vllm_model, tensorizer_weights_iterator) -from vllm.model_executor.model_loader.utils import (get_model_architecture, +from vllm.model_executor.model_loader.utils import (ParamMapping, + get_model_architecture, set_default_torch_dtype) from 
vllm.model_executor.model_loader.weight_utils import ( download_safetensors_index_file_from_hf, download_weights_from_hf, @@ -181,6 +182,9 @@ class DefaultModelLoader(BaseModelLoader): fall_back_to_pt: bool = True """Whether .pt weights can be used.""" + allow_patterns_overrides: Optional[list[str]] = None + """If defined, weights will load exclusively using these patterns.""" + def __init__(self, load_config: LoadConfig): super().__init__(load_config) if load_config.model_loader_extra_config: @@ -217,6 +221,7 @@ class DefaultModelLoader(BaseModelLoader): model_name_or_path: str, revision: Optional[str], fall_back_to_pt: bool, + allow_patterns_overrides: Optional[list[str]], ) -> Tuple[str, List[str], bool]: """Prepare weights for the model. @@ -248,6 +253,9 @@ class DefaultModelLoader(BaseModelLoader): if fall_back_to_pt: allow_patterns += ["*.pt"] + if allow_patterns_overrides is not None: + allow_patterns = allow_patterns_overrides + if not is_local: hf_folder = download_weights_from_hf( model_name_or_path, @@ -297,7 +305,8 @@ class DefaultModelLoader(BaseModelLoader): ) -> Generator[Tuple[str, torch.Tensor], None, None]: """Get an iterator for the model weights based on the load format.""" hf_folder, hf_weights_files, use_safetensors = self._prepare_weights( - source.model_or_path, source.revision, source.fall_back_to_pt) + source.model_or_path, source.revision, source.fall_back_to_pt, + source.allow_patterns_overrides) if self.load_config.load_format == LoadFormat.NPCACHE: # Currently np_cache only support *.bin checkpoints assert use_safetensors is False @@ -339,6 +348,8 @@ class DefaultModelLoader(BaseModelLoader): prefix="", fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load", True), + allow_patterns_overrides=getattr(model, "allow_patterns_overrides", + None), ) yield from self._get_weights_iterator(primary_weights) @@ -352,7 +363,8 @@ class DefaultModelLoader(BaseModelLoader): def download_model(self, model_config: ModelConfig) -> None: self._prepare_weights(model_config.model, model_config.revision, - fall_back_to_pt=True) + fall_back_to_pt=True, + allow_patterns_overrides=None) def load_model(self, vllm_config: VllmConfig) -> nn.Module: device_config = vllm_config.device_config @@ -983,21 +995,11 @@ class BitsAndBytesModelLoader(BaseModelLoader): def _get_bnb_target_modules(self, model: nn.Module) -> None: - # TODO: Maybe we can replace bitsandbytes_stacked_params_mapping with - # packed_modules_mapping. - inverse_stacked_mapping: Dict[str, List[str]] = {} - for orig, ( - packed, - idx, - ) in model.bitsandbytes_stacked_params_mapping.items(): - if packed not in inverse_stacked_mapping: - inverse_stacked_mapping[packed] = [] - inverse_stacked_mapping[packed].insert(idx, orig) - for name, module in model.named_modules(): if isinstance(module, (LinearBase, )): last_name = name.split(".")[-1] - if sub_modules := inverse_stacked_mapping.get(last_name, []): + if sub_modules := self.modules_mapping.packed_mapping.get( + last_name, []): # Map vllm's names to transformers's names. for sub_name in sub_modules: self.target_modules.append( @@ -1018,15 +1020,19 @@ class BitsAndBytesModelLoader(BaseModelLoader): "The required method 'load_weights' is not defined in class" f" {type(model).__name__}.") - if not hasattr(model, "bitsandbytes_stacked_params_mapping"): + if not hasattr(model, "packed_modules_mapping"): raise AttributeError( f"Model {type(model).__name__} does not support BitsAndBytes " - "quantization yet.") + "quantization yet. 
No 'packed_modules_mapping' found.") + + self.modules_mapping = ParamMapping( + copy.deepcopy(model.packed_modules_mapping)) # For some models like Molmo, we need to use hf_to_vllm_mapper # to ensure correct loading of weights. if hf_to_vllm_mapper := getattr(model, "hf_to_vllm_mapper", None): self.weight_mapper = lambda name: hf_to_vllm_mapper._map_name(name) + # Modules whose weights might have fused on disk # we need their output_sizes to make shard in flight correctly with TP self.maybe_fused_weights_modules: Dict[str, List[int]] = {} @@ -1109,16 +1115,23 @@ class BitsAndBytesModelLoader(BaseModelLoader): for shard_name, ( weight_name, index, - ) in model.bitsandbytes_stacked_params_mapping.items(): - shard_pos = quant_param_name.find(shard_name) + ) in self.modules_mapping.inverse_packed_mapping.items(): # Some models, such as MiniCPM V2.5/2.6, contain both # module names 'kv_proj' and 'qkv_proj'. To prevent 'kv_proj' # from being incorrectly identified as being present in # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight - if shard_pos > 0 and quant_param_name[shard_pos - 1] == ".": + shard_pos = quant_param_name.find(shard_name) + can_correct_rename = (shard_pos > 0) and ( + quant_param_name[shard_pos - 1] == ".") + # If the quant_param_name is packed, it won't occur in the + # param_dict before renaming. + new_quant_param_name = quant_param_name.replace( + shard_name, weight_name) + need_rename = (quant_param_name not in param_dict) \ + and (new_quant_param_name in param_dict) + if can_correct_rename and need_rename: shard_index = index - quant_param_name = quant_param_name.replace( - shard_name, weight_name) + quant_param_name = new_quant_param_name break # Models like Clip/Siglip may skip some layers in initialization, diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index fbd4937112e11..5b4757072353f 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -459,16 +459,7 @@ def tensorize_vllm_model(engine_args: EngineArgs, stream.write(encryption_params.key) engine = LLMEngine.from_engine_args(engine_args) - if tensorizer_config._is_sharded: - # if the engine is a distributed engine (for tensor parallel) then each - # worker shard needs to serialize its part of the model. - engine.model_executor._run_workers( - "save_tensorized_model", - tensorizer_config=tensorizer_config, - ) - else: - # with a single worker, we can get to the underlying model directly - serialize_vllm_model( - engine.model_executor.driver_worker.model_runner.model, - tensorizer_config, - ) + engine.model_executor.collective_rpc( + "save_tensorized_model", + kwargs=dict(tensorizer_config=tensorizer_config), + ) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 44978a55e072d..3f923d2f6632a 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -1,6 +1,7 @@ """Utilities for selecting and loading models.""" import contextlib -from typing import Tuple, Type +from dataclasses import dataclass, field +from typing import Dict, List, Tuple, Type import torch from torch import nn @@ -49,3 +50,26 @@ def get_model_architecture( def get_architecture_class_name(model_config: ModelConfig) -> str: return get_model_architecture(model_config)[1] + + +@dataclass +class ParamMapping: + """ + A class to handle parameter mapping for model weight loading. 
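+    A hypothetical packed_mapping of
+    {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}, for instance, is inverted
+    so that "k_proj" resolves to ("qkv_proj", 1).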
+ It creates a bidirectional mapping between packed parameters and their + constituent parts. + """ + packed_mapping: Dict[str, List[str]] + inverse_packed_mapping: Dict[str, Tuple[str, + int]] = field(default_factory=dict) + + def __post_init__(self): + for packed_name, sub_params in self.packed_mapping.items(): + # Skip self-contained cases (e.g., {"W_pack": ["W_pack"]}) + if len(sub_params) == 1 and sub_params[0] == packed_name: + continue + for index, param_name in enumerate(sub_params): + self.inverse_packed_mapping[param_name] = ( + packed_name, + index, + ) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 11d5fd7135d9e..9cfcdbf620d2b 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -29,7 +29,9 @@ from vllm.utils import PlaceholderModule try: from runai_model_streamer import SafetensorsStreamer -except ImportError: +except (ImportError, OSError): + # see https://github.com/run-ai/runai-model-streamer/issues/26 + # OSError will be raised on arm64 platform runai_model_streamer = PlaceholderModule( "runai_model_streamer") # type: ignore[assignment] SafetensorsStreamer = runai_model_streamer.placeholder_attr( diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 089062ab53fc3..5b97eced62df0 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -13,8 +13,6 @@ from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - get_compressed_tensors_cache_scale) from vllm.model_executor.layers.sampler import (SamplerOutput, SamplingMetadata, get_sampler) from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead @@ -390,12 +388,14 @@ class AriaMoELMModel(LlamaModel): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. 
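            # Note: get_cache_scale below remaps checkpoint names such as
            # "model.layers.0.self_attn.k_proj.output_scale" (illustrative)
            # to the vLLM parameter "model.layers.0.self_attn.attn.k_scale".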
continue - if scale_name := get_compressed_tensors_cache_scale(name): - # Loading kv cache scales for compressed-tensors quantization + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales param = params_dict[scale_name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = loaded_weight[0] + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 5e68b7f165bf4..a923ed36a9db2 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -350,13 +350,6 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP): embedding_modules = {} embedding_padding_modules = [] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } - def __init__( self, *, diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 7dfc0b687c6e3..917b88e802071 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -441,6 +441,24 @@ class Blip2DummyInputsBuilder(BaseDummyInputsBuilder[Blip2ProcessingInfo]): class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]): + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + if not mm_data: + # HF processor always adds placeholders even when there's no image + tokenizer = self.info.get_tokenizer() + prompt_ids = tokenizer.encode(prompt) + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") + + return super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + def _get_mm_fields_config( self, hf_inputs: BatchFeature, @@ -469,11 +487,11 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]): def apply( self, - prompt_text: str, + prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], ) -> MultiModalInputsV2: - result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) # Only tokens should be considered as placeholders, # so we ignore the trailing bos_token diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 452fe727875fe..a6634204699c9 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -99,6 +99,34 @@ class ChameleonDummyInputsBuilder( class ChameleonMultiModalProcessor( BaseMultiModalProcessor[ChameleonProcessingInfo]): + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + if not mm_data: + prompt_ids = self.info.get_tokenizer().encode(prompt) + prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") + + return super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + def _apply_hf_processor_tokens_only( + self, + prompt_tokens: list[int], + ) -> list[int]: + # HF processor adds sep token for chat mode + tokenizer = 
self.info.get_tokenizer() + sep_token_id: int = \ + tokenizer.vocab[tokenizer.sep_token] # type: ignore + + return prompt_tokens + [sep_token_id] + def _get_mm_fields_config( self, hf_inputs: BatchFeature, @@ -128,11 +156,11 @@ class ChameleonMultiModalProcessor( def apply( self, - prompt_text: str, + prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], ) -> MultiModalInputsV2: - result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) # Only tokens should be considered as placeholders, # so we ignore the image_start_token and image_end_token diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index ffd6891b25965..d5f9b4d19e5ca 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -1,6 +1,6 @@ # Adapted from -# https://github.com/THUDM/GLM-4 -"""Inference-only ChatGLM model compatible with THUDM weights.""" +# https://github.com/THUDM/CogAgent +"""Inference-only CogAgent model compatible with THUDM weights.""" from argparse import Namespace from array import array from typing import (Dict, Iterable, List, Mapping, Optional, Set, Tuple, @@ -41,7 +41,7 @@ from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, from vllm.transformers_utils.configs import ChatGLMConfig from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP -from .utils import (is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -201,7 +201,6 @@ def input_processor_for_glmv(ctx: InputContext, inputs: DecoderOnlyInputs): new_input_ids = [] final_processed_position = 0 - final_processed_position = 0 for boi_position, eoi_position in zip(boi_positions, eoi_positions): assert boi_position < eoi_position @@ -275,12 +274,15 @@ class GLMAttention(nn.Module): # https://huggingface.co/THUDM/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141 rope_ratio = getattr(config, "rope_ratio", 1.0) max_positions = getattr(config, "seq_length", 8192) + # NOTE: THUDM/cogagent-9b-20241220 uses original_rope=False, + # which is equivalent to is_neox_style=True + is_neox_style = not config.original_rope self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim // 2, max_position=max_positions, base=10000 * rope_ratio, - is_neox_style=False, + is_neox_style=is_neox_style, ) self.attn = Attention(self.num_heads, self.head_dim, @@ -603,9 +605,50 @@ class ChatGLMModel(nn.Module): return IntermediateTensors({"hidden_states": hidden_states}) return hidden_states + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("linear_proj.merged_proj", "linear_proj.gate_proj", 0), + ("linear_proj.merged_proj", "linear_proj.dense_h_to_4h", 1), + ] + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. 
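+                # With the stacked mapping above, a checkpoint weight named
+                # "...linear_proj.gate_proj.weight" fills shard 0 of
+                # "...linear_proj.merged_proj.weight" and
+                # "...linear_proj.dense_h_to_4h.weight" fills shard 1
+                # (paths abbreviated for illustration).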
+ if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + if "rotary_pos_emb.inv_freq" in name: + continue + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={".word_embeddings": ""}, ) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config @@ -658,52 +701,9 @@ class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP): next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: - # Merge two ColumnParallelLinear into one MergedColumnParallelLinear - merged_weights_dict: Dict[str, Dict[str, Optional[torch.Tensor]]] = { - "transformer.vision.linear_proj.merged_proj.weight": { - "transformer.vision.linear_proj.gate_proj.weight": None, - "transformer.vision.linear_proj.dense_h_to_4h.weight": None, - } - } - - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() - for name, loaded_weight in weights: - is_weight_to_be_merge = False - for _, merged_weight_dict in merged_weights_dict.items(): - if name in merged_weight_dict: - assert merged_weight_dict[name] is None - merged_weight_dict[name] = loaded_weight - is_weight_to_be_merge = True - if is_weight_to_be_merge: - continue - if "rotary_pos_emb.inv_freq" in name: - continue - if "word_embeddings" in name: - name = name.replace(".word_embeddings", "") - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - - for combined_name, merged_weight_dict in merged_weights_dict.items(): - if combined_name in params_dict: - param = params_dict[combined_name] - combined_weight = torch.cat(list(merged_weight_dict.values()), - dim=0) - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, combined_weight) - loaded_params.add(combined_name) - return loaded_params + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) class ChatGLM(ChatGLMBaseModel): @@ -724,6 +724,7 @@ class ChatGLM(ChatGLMBaseModel): class ChatGLMV(ChatGLMBaseModel, SupportsMultiModal): + packed_modules_mapping = { "query_key_value": ["query_key_value"], "dense_h_to_4h": ["dense_h_to_4h"], @@ -775,8 +776,8 @@ class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, ) -> None: config = vllm_config.model_config.hf_config # Initialize VL - if hasattr(config, "visual"): + if hasattr(config, "vision_config"): return ChatGLMV(vllm_config=vllm_config, prefix=prefix) # Initialize LLM else: - return ChatGLM(vllm_config=vllm_config, prefix=prefix) + return ChatGLM(vllm_config=vllm_config, prefix=prefix) \ No newline at end of file diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index d22d1f3171463..989056bf5c155 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -45,6 +45,7 @@ from vllm.model_executor.model_loader.weight_utils import ( row_parallel_weight_loader) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -53,7 +54,7 @@ from .utils import (extract_layer_index, is_pp_missing_parameter, maybe_prefix) -@torch.compile +@torch.compile(backend=current_platform.simple_compile_backend) def layer_norm_func(hidden_states, weight, variance_epsilon): input_dtype = hidden_states.dtype hidden_states = hidden_states.to(torch.float32) @@ -436,6 +437,19 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP): params_dict = dict(self.named_parameters()) loaded_params: Set[str] = set() for name, loaded_weight in weights: + + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + for param_name, shard_name, shard_id in stacked_params_mapping: if shard_name not in name: continue diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 3932d8b52a9d1..b2aa3c0709bd4 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -83,7 +83,7 @@ class DbrxExperts(FusedMoE): # Define custom weight loader for dbrx model def weight_loader(self, param: nn.Parameter, loaded_weight: 
torch.Tensor, - weight_name: str): + weight_name: str, param_name: str): tp_rank = get_tensor_model_parallel_rank() param_data = param.data shard_size = self.intermediate_size @@ -91,25 +91,37 @@ class DbrxExperts(FusedMoE): # DBRX uses GLU for each experts. # GLU has 3 linear layers: w1, v1 and w2. if weight_name.endswith("w1"): - loaded_weight = torch.reshape( - loaded_weight, - [-1, self.intermediate_size * self.tp_size, self.d_model], - ) - param_data[:, 0:shard_size, :] = loaded_weight[:, shard, :] + if param_name.endswith("weight"): + loaded_weight = torch.reshape( + loaded_weight, + [-1, self.intermediate_size * self.tp_size, self.d_model], + ) + param_data[:, 0:shard_size, :] = loaded_weight[:, shard, :] + elif param_name.endswith("weight_scale"): + param_data[:, 0] = loaded_weight + else: + param_data = loaded_weight if weight_name.endswith("v1"): - loaded_weight = torch.reshape( - loaded_weight, - [-1, self.intermediate_size * self.tp_size, self.d_model], - ) - param_data[:, - shard_size:2 * shard_size, :] = loaded_weight[:, - shard, :] + if param_name.endswith("weight"): + loaded_weight = torch.reshape( + loaded_weight, + [-1, self.intermediate_size * self.tp_size, self.d_model], + ) + param_data[:, shard_size:2 * + shard_size, :] = loaded_weight[:, shard, :] + elif param_name.endswith("weight_scale"): + param_data[:, 1] = loaded_weight + else: + param_data[:] = loaded_weight if weight_name.endswith("w2"): - loaded_weight = torch.reshape( - loaded_weight, - [-1, self.intermediate_size * self.tp_size, self.d_model], - ).transpose(1, 2) - param_data[:] = loaded_weight[:, :, shard] + if param_name.endswith("weight"): + loaded_weight = torch.reshape( + loaded_weight, + [-1, self.intermediate_size * self.tp_size, self.d_model], + ).transpose(1, 2) + param_data[:] = loaded_weight[:, :, shard] + else: + param_data[:] = loaded_weight class DbrxMoE(nn.Module): @@ -430,14 +442,28 @@ class DbrxForCausalLM(nn.Module, SupportsPP): def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: - expert_params_mapping = [( - "w13_weight" if weight_name in ["w1", "v1"] else "w2_weight", + "w13" if weight_name in ["w1", "v1"] else "w2", f"mlp.{weight_name}", ) for weight_name in ["w1", "v1", "w2"]] params_dict = dict(self.named_parameters(remove_duplicate=False)) loaded_params: Set[str] = set() + for name, loaded_weight in weights: + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + + if name.endswith(("w1", "w2", "v1")): + name = name + "_weight" for param_name, weight_name in expert_params_mapping: if weight_name not in name: continue @@ -446,8 +472,9 @@ class DbrxForCausalLM(nn.Module, SupportsPP): continue param = params_dict[name] weight_loader = param.weight_loader - weight_loader(param, loaded_weight, weight_name) + weight_loader(param, loaded_weight, weight_name, name) break + else: # Remapping the name of FP8 kv-scale. 
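+                # (maybe_remap_kv_scale_name maps checkpoint keys such as
+                # "...self_attn.k_scale" onto the registered
+                # "...self_attn.attn.k_scale" parameter and returns None
+                # when no matching parameter exists.)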
name = maybe_remap_kv_scale_name(name, params_dict) @@ -456,6 +483,9 @@ class DbrxForCausalLM(nn.Module, SupportsPP): if is_pp_missing_parameter(name, self): continue + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 4cf4e6c358bf2..af6810a140b43 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -45,7 +45,8 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors @@ -243,7 +244,11 @@ class DeepseekV2Attention(nn.Module): bias=False, quant_config=quant_config, prefix=f"{prefix}.o_proj") - rope_scaling["rope_type"] = 'deepseek_yarn' + if rope_scaling: + rope_scaling["rope_type"] = 'deepseek_yarn' + self.use_normal_rope = False + else: + self.use_normal_rope = True self.rotary_emb = get_rope(qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, @@ -257,14 +262,8 @@ class DeepseekV2Attention(nn.Module): mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.scaling = self.scaling * mscale * mscale - # self.attn = Attention(self.num_heads, - # self.qk_head_dim, - # self.scaling, - # num_kv_heads=self.num_heads) - - # TODO, support head_size 192 self.attn = Attention(self.num_local_heads, - 256, + self.qk_head_dim, self.scaling, num_kv_heads=self.num_local_heads, cache_config=cache_config, @@ -298,23 +297,30 @@ class DeepseekV2Attention(nn.Module): self.qk_nope_head_dim + self.v_head_dim) k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) k_pe = latent_cache[:, :, self.kv_lora_rank:] + + if self.use_normal_rope: + seq_len = positions.size(0) + ori_q_pe_shape, ori_k_pe_shape = q_pe.shape, k_pe.shape + q_pe = q_pe.reshape(seq_len, -1) + k_pe = k_pe.reshape(seq_len, -1) + q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe) + + if self.use_normal_rope: + q_pe, k_pe = q_pe.view(ori_q_pe_shape), k_pe.view(ori_k_pe_shape) + q[..., self.qk_nope_head_dim:] = q_pe k = torch.empty_like(q) k[..., :self.qk_nope_head_dim] = k_nope k[..., self.qk_nope_head_dim:] = k_pe - q = torch.nn.functional.pad(q, [0, 256 - self.qk_head_dim], - value=0).view(-1, - self.num_local_heads * 256) - k = torch.nn.functional.pad(k, [0, 256 - self.qk_head_dim], - value=0).view(-1, - self.num_local_heads * 256) - v = torch.nn.functional.pad(v, [0, 256 - self.v_head_dim], - value=0).view(-1, - self.num_local_heads * 256) + # padding value to qk_head_dim for alignment + v = torch.nn.functional.pad( + v, [0, self.qk_head_dim - self.v_head_dim], + value=0).view(-1, self.num_local_heads * self.qk_head_dim) attn_output = self.attn(q, k, v, kv_cache, attn_metadata) attn_output = attn_output.view( - -1, self.num_local_heads, 256)[..., :self.v_head_dim].reshape( + -1, self.num_local_heads, + self.qk_head_dim)[..., :self.v_head_dim].reshape( -1, self.num_local_heads * self.v_head_dim) output, _ = 
self.o_proj(attn_output) return output @@ -355,6 +361,7 @@ class DeepseekV2DecoderLayer(nn.Module): quant_config=quant_config, prefix=f"{prefix}.self_attn", ) + if (config.n_routed_experts is not None and layer_idx >= config.first_k_dense_replace and layer_idx % config.moe_layer_freq == 0): @@ -619,6 +626,11 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP): if name.endswith(".bias") and name not in params_dict: continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + if is_pp_missing_parameter(name, self): continue diff --git a/vllm/model_executor/models/deepseek_v3.py b/vllm/model_executor/models/deepseek_v3.py index 333dc019b4d99..0b44f0d062c40 100644 --- a/vllm/model_executor/models/deepseek_v3.py +++ b/vllm/model_executor/models/deepseek_v3.py @@ -251,7 +251,11 @@ class DeepseekV3Attention(nn.Module): bias=False, quant_config=quant_config, prefix=f"{prefix}.o_proj") - rope_scaling["rope_type"] = 'deepseek_yarn' + if rope_scaling: + rope_scaling["rope_type"] = 'deepseek_yarn' + self.use_normal_rope = False + else: + self.use_normal_rope = True self.rotary_emb = get_rope(qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, @@ -265,14 +269,8 @@ class DeepseekV3Attention(nn.Module): mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.scaling = self.scaling * mscale * mscale - # self.attn = Attention(self.num_heads, - # self.qk_head_dim, - # self.scaling, - # num_kv_heads=self.num_heads) - - # TODO, support head_size 192 self.attn = Attention(self.num_local_heads, - 256, + self.qk_head_dim, self.scaling, num_kv_heads=self.num_local_heads, cache_config=cache_config, @@ -306,23 +304,30 @@ class DeepseekV3Attention(nn.Module): self.qk_nope_head_dim + self.v_head_dim) k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) k_pe = latent_cache[:, :, self.kv_lora_rank:] + + if self.use_normal_rope: + seq_len = positions.size(0) + ori_q_pe_shape, ori_k_pe_shape = q_pe.shape, k_pe.shape + q_pe = q_pe.reshape(seq_len, -1) + k_pe = k_pe.reshape(seq_len, -1) + q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe) + + if self.use_normal_rope: + q_pe, k_pe = q_pe.view(ori_q_pe_shape), k_pe.view(ori_k_pe_shape) + q[..., self.qk_nope_head_dim:] = q_pe k = torch.empty_like(q) k[..., :self.qk_nope_head_dim] = k_nope k[..., self.qk_nope_head_dim:] = k_pe - q = torch.nn.functional.pad(q, [0, 256 - self.qk_head_dim], - value=0).view(-1, - self.num_local_heads * 256) - k = torch.nn.functional.pad(k, [0, 256 - self.qk_head_dim], - value=0).view(-1, - self.num_local_heads * 256) - v = torch.nn.functional.pad(v, [0, 256 - self.v_head_dim], - value=0).view(-1, - self.num_local_heads * 256) + # padding value to qk_head_dim for alignment + v = torch.nn.functional.pad( + v, [0, self.qk_head_dim - self.v_head_dim], + value=0).view(-1, self.num_local_heads * self.qk_head_dim) attn_output = self.attn(q, k, v, kv_cache, attn_metadata) attn_output = attn_output.view( - -1, self.num_local_heads, 256)[..., :self.v_head_dim].reshape( + -1, self.num_local_heads, + self.qk_head_dim)[..., :self.v_head_dim].reshape( -1, self.num_local_heads * self.v_head_dim) output, _ = self.o_proj(attn_output) return output @@ -583,7 +588,8 @@ class DeepseekV3ForCausalLM(nn.Module, SupportsPP): continue # TODO(simon): support nextn predict layers - if self.config.num_nextn_predict_layers > 0: + if hasattr(self.config, "num_nextn_predict_layers" + ) and self.config.num_nextn_predict_layers > 0: assert 
self.config.num_nextn_predict_layers == 1
                layer_idx = self.config.num_hidden_layers
                if name.startswith(f"model.layers.{layer_idx}"):
@@ -639,9 +645,6 @@ class DeepseekV3ForCausalLM(nn.Module, SupportsPP):
             if is_pp_missing_parameter(name, self):
                 continue

-            if name not in params_dict:
-                for key in params_dict:
-                    print(key)
             param = params_dict[name]
             weight_loader = getattr(param, "weight_loader",
                                     default_weight_loader)
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
new file mode 100644
index 0000000000000..4d3d1c329a2c0
--- /dev/null
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -0,0 +1,642 @@
+# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py
+"""Inference-only Deepseek-VL2 model compatible with HuggingFace weights."""
+import math
+from functools import cached_property
+from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
+                    TypedDict, Union)
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from transformers import BatchFeature
+
+from vllm.attention import AttentionMetadata
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.model_executor import SamplingMetadata
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
+from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+                                    NestedTensors)
+from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
+                                   ImageSize, MultiModalDataItems)
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.multimodal.utils import cached_get_tokenizer
+from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config,
+                                                          MlpProjectorConfig,
+                                                          VisionEncoderConfig)
+from vllm.transformers_utils.processors.deepseek_vl2 import (
+    DeepseekVLV2Processor)
+from vllm.utils import is_list_of
+
+from .interfaces import SupportsMultiModal, SupportsPP
+from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
+                    init_vllm_registered_model, maybe_prefix,
+                    merge_multimodal_embeddings)
+
+logger = init_logger(__name__)
+
+# The image token id may vary across checkpoints
+_IMAGE_TOKEN = "<image>"
+
+
+class DeepseekVL2ImagePixelInputs(TypedDict):
+    type: Literal["pixel_values"]
+    data: Union[torch.Tensor, List[torch.Tensor]]
+    """
+    Shape: `(batch_size * num_images, num_channels, height, width)`
+    """
+    images_spatial_crop: torch.Tensor
+    """
+    Shape: `(batch_size * num_images, 2)`
+    """
+
+
+class DeepseekVL2VImageEmbeddingInputs(TypedDict):
+    type: Literal["image_embeds"]
+    data: Union[torch.Tensor, List[torch.Tensor]]
+    """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
+
+    `hidden_size` must match the hidden size of language model backbone.
+ """ + + +DeepseekVL2ImageInputs = Union[DeepseekVL2ImagePixelInputs, + DeepseekVL2VImageEmbeddingInputs] + + +class MlpProjector(nn.Module): + + def __init__(self, cfg: MlpProjectorConfig): + + super().__init__() + + self.cfg = cfg + assert not cfg.token_pooling, ( + "Token pooling is not supported currently.") + + if cfg.projector_type == "downsample_mlp_gelu": + mlp_depth = cfg.depth + mlp_ratio = cfg.mlp_ratio + modules = [ + nn.Linear( + cfg.input_dim * cfg.downsample_ratio * + cfg.downsample_ratio, cfg.n_embed * mlp_ratio) + ] + for _ in range(1, mlp_depth - 1): + modules.append(nn.GELU()) + modules.append( + nn.Linear(cfg.n_embed * mlp_ratio, + cfg.n_embed * mlp_ratio)) + modules.append(nn.GELU()) + modules.append(nn.Linear(cfg.n_embed * mlp_ratio, cfg.n_embed)) + modules = nn.Sequential(*modules) + + else: + raise NotImplementedError( + f"Unsupported projector type: {cfg.projector_type}") + + self.layers = modules + + def forward(self, x): + bs, hw, input_dim = x.shape + h = w = int((hw)**0.5) + """compute padding""" + if h % self.cfg.downsample_ratio: + pad = self.cfg.downsample_ratio - h % self.cfg.downsample_ratio + else: + pad = 0 + x = x.reshape(bs, h, w, input_dim) + if pad > 0: + x = F.pad(x, (0, 0, 0, pad, 0, pad), "constant", 0) + """4 to 1 concat""" + x = x.permute(0, 3, 1, 2) # B, C, H, W + x = F.unfold(x, + kernel_size=self.cfg.downsample_ratio, + stride=self.cfg.downsample_ratio, + padding=0) # B, C*4, HW // 4 + x = x.permute(0, 2, 1) + + return self.layers(x) + + +class DeepseekVL2ProcessingInfo(BaseProcessingInfo): + + def get_hf_config(self): + return self.ctx.get_hf_config(DeepseekVLV2Config) + + def get_hf_processor(self) -> DeepseekVLV2Processor: + return self.ctx.get_hf_processor(DeepseekVLV2Processor) + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_num_image_tokens(self, *, image_width: int, + image_height: int) -> int: + hf_processor = self.get_hf_processor() + image_size = hf_processor.image_size + patch_size = hf_processor.patch_size + downsample_ratio = hf_processor.downsample_ratio + + best_width, best_height = hf_processor.select_best_resolution( + (image_width, image_height)) + + num_width_tiles, num_height_tiles = (best_width // image_size, + best_height // image_size) + h = w = math.ceil((image_size // patch_size) / downsample_ratio) + + global_views_tokens = h * (w + 1) + local_views_tokens = (num_height_tiles * h) * (num_width_tiles * w + 1) + return global_views_tokens + local_views_tokens + 1 + + def get_image_size_with_most_features(self) -> ImageSize: + hf_config = self.get_hf_config() + candidate_resolutions = hf_config.candidate_resolutions + height, width = max(candidate_resolutions, + key=lambda x: self.get_num_image_tokens( + image_width=x[1], image_height=x[0])) + return ImageSize(width=width, height=height) + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + max_image_size = self.get_image_size_with_most_features() + max_image_tokens = self.get_num_image_tokens( + image_height=max_image_size.height, + image_width=max_image_size.width) + + return {"image": max_image_tokens} + + +class DeepseekVL2DummyInputsBuilder( + BaseDummyInputsBuilder[DeepseekVL2ProcessingInfo]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + hf_processor = self.info.get_hf_processor() + image_token: str = hf_processor.image_token + + max_image_size = 
self.info.get_image_size_with_most_features() + + mm_data = { + "image": + self._get_dummy_images(width=max_image_size.width, + height=max_image_size.height, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text=image_token * num_images, + mm_data=mm_data, + ) + + +class DeepseekVL2MultiModalProcessor( + BaseMultiModalProcessor[DeepseekVL2ProcessingInfo]): + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + if mm_data: + processed_outputs = self.info.ctx.call_hf_processor( + self.info.get_hf_processor(**mm_kwargs), + dict(prompt=prompt, **mm_data), + mm_kwargs, + ) + target_dtype = self.info.ctx.model_config.dtype + pixel_values = processed_outputs.pop("pixel_values").to( + target_dtype) + # split pixel values into patches corresponding to each image + images_spatial_crop = processed_outputs["images_spatial_crop"] + patches_per_image = [ + x.prod().item() + 1 for x in images_spatial_crop + ] + pixel_values = pixel_values.split(patches_per_image) + processed_outputs["pixel_values"] = pixel_values + else: + tokenizer = self.info.get_tokenizer() + processed_outputs = tokenizer(prompt, + add_special_tokens=True, + return_tensors="pt") + + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + images_spatial_crop=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_processor = self.info.get_hf_processor() + image_token_id: int = hf_processor.image_token_id + + def get_replacement_deepseek_vl2(item_idx: int): + images = mm_items.get_items( + "image", (ImageEmbeddingItems, ImageProcessorItems)) + + if isinstance(images, ImageEmbeddingItems): + num_image_tokens = images.get_feature_size(item_idx) + else: + image_size = images.get_image_size(item_idx) + + num_image_tokens = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + ) + return [image_token_id] * num_image_tokens + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=get_replacement_deepseek_vl2, + ) + ] + + +@MULTIMODAL_REGISTRY.register_processor( + DeepseekVL2MultiModalProcessor, + info=DeepseekVL2ProcessingInfo, + dummy_inputs=DeepseekVL2DummyInputsBuilder) +class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): + + hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={ + "language.": "language_model.", + }) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config: DeepseekVLV2Config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + + self.config = config + self.multimodal_config = multimodal_config + + self.vision_config = config.vision_config + self.projector_config = config.projector_config + self.text_config = config.text_config + + model_config = vllm_config.model_config + tokenizer = cached_get_tokenizer( + model_config.tokenizer, + tokenizer_mode=model_config.tokenizer_mode, + tokenizer_revision=model_config.tokenizer_revision, + 
trust_remote_code=model_config.trust_remote_code, + ) + self.image_token_id = tokenizer.vocab.get(_IMAGE_TOKEN) + + self.vision = self._init_vision_module(self.vision_config, + quant_config, + maybe_prefix(prefix, "vision")) + + self.projector = MlpProjector(self.projector_config) + self.tile_tag = config.tile_tag + self.global_view_pos = config.global_view_pos + + # special token for image token sequence format + embed_std = 1 / torch.sqrt( + torch.tensor(self.projector_config.n_embed, dtype=torch.float32)) + if self.tile_tag == "2D": + # <|view_separator|>, <|\n|> + self.image_newline = nn.Parameter( + torch.randn(self.projector_config.n_embed) * embed_std) + # This is a typo in original implementation + self.view_seperator = nn.Parameter( + torch.randn(self.projector_config.n_embed) * embed_std) + else: + raise ValueError( + f"Only 2D tile_tag is supported currently, got: {self.tile_tag}" + ) + + if self.text_config.topk_method == "noaux_tc": + architectures = ["DeepseekV3ForCausalLM"] + elif not self.text_config.use_mla: + architectures = ["DeepseekForCausalLM"] + else: + architectures = ["DeepseekV2ForCausalLM"] + + self.language_model = init_vllm_registered_model( + vllm_config=vllm_config, + hf_config=self.text_config, + prefix=maybe_prefix(prefix, "language"), + architectures=architectures, + ) + + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors) + + def _init_vision_module( + self, + vision_config: VisionEncoderConfig, + quant_config: Optional[QuantizationConfig], + prefix: str = "", + ) -> nn.Module: + # TODO: refactor vision model through timm wrapper from transformers + try: + import timm + except ImportError: + raise ImportError("Please install timm") from ImportError + + with set_default_torch_dtype(torch.float16): + model = timm.create_model( + "vit_so400m_patch14_siglip_384.webli", + pretrained=False, + num_classes=0, + dynamic_img_size=True, + dynamic_img_pad=True, + ) + + model = model.to(dtype=torch.get_default_dtype()) + return model + + @cached_property + def sampler(self): + if hasattr(self.language_model, "sampler"): + return self.language_model.sampler + + return get_sampler() + + def _validate_pixel_values( + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: + + h = w = self.vision_config.image_size + expected_dims = (3, h, w) + + def _validate_shape(d: torch.Tensor): + actual_dims = tuple(d.shape[1:]) + + if actual_dims != expected_dims: + expected_expr = ("num_patches", *map(str, expected_dims)) + raise ValueError( + "The expected shape of pixel values per image per batch " + f"is {expected_expr}. You supplied {tuple(d.shape)}.") + + for d in data: + _validate_shape(d) + + return data + + def _validate_images_spatial_crop( + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: + expected_dims = 2 + + def _validate_shape(d: torch.Tensor): + actual_dims = d.size(-1) + + if actual_dims != expected_dims: + expected_expr = str(expected_dims) + raise ValueError( + f"The expected shape of image sizes per image per batch " + f"is {expected_expr}. 
You supplied {tuple(d.shape)}.")
+
+        for d in data:
+            _validate_shape(d)
+
+        return data
+
+    def _parse_and_validate_image_input(
+            self, **kwargs: object) -> Optional[DeepseekVL2ImageInputs]:
+        pixel_values = kwargs.pop("pixel_values", None)
+        images_spatial_crop = kwargs.pop("images_spatial_crop", None)
+        image_embeds = kwargs.pop("image_embeds", None)
+
+        if pixel_values is None and image_embeds is None:
+            return None
+
+        if pixel_values is not None:
+            if not isinstance(pixel_values, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of pixel values. "
+                                 f"Got type: {type(pixel_values)}")
+
+            if not isinstance(images_spatial_crop, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of image sizes. "
+                                 f"Got type: {type(images_spatial_crop)}")
+
+            return DeepseekVL2ImagePixelInputs(
+                type="pixel_values",
+                data=self._validate_pixel_values(flatten_bn(pixel_values)),
+                images_spatial_crop=self._validate_images_spatial_crop(
+                    flatten_bn(images_spatial_crop, concat=True)))
+
+        if image_embeds is not None:
+            if not isinstance(image_embeds, torch.Tensor):
+                raise ValueError("Incorrect type of image embeddings. "
+                                 f"Got type: {type(image_embeds)}")
+
+            return DeepseekVL2VImageEmbeddingInputs(
+                type="image_embeds",
+                data=flatten_bn(image_embeds),
+            )
+
+        raise AssertionError("This line should be unreachable.")
+
+    def _pixel_values_to_embedding(
+        self,
+        pixel_values: NestedTensors,
+        images_spatial_crop: torch.Tensor,
+    ) -> NestedTensors:
+        # pixel_values: n_image * batch_size * [patch_per_img, 3, height, width]
+        total_tiles = [x for x in pixel_values]
+
+        # [batch_all_tiles, 3, height, width]
+        total_tiles = torch.cat(total_tiles, dim=0)
+
+        # [batch_all_tiles, vit_seq_len, c]
+        images_feature = self.vision.forward_features(total_tiles)
+
+        # [batch_all_tiles, hw, D]
+        images_embeds = self.projector(images_feature)
+
+        _, hw, n_dim = images_embeds.shape
+        h = w = int(hw**0.5)
+
+        # Fill the image token sequence based on
+        # self.tile_tag & self.global_view_pos
+        tile_index = 0
+        vision_embeddings = []
+        for jdx in range(images_spatial_crop.size(0)):
+            # extract global & local features
+            num_width_tiles, num_height_tiles = images_spatial_crop[jdx]
+            if num_width_tiles == 0 or num_height_tiles == 0:
+                break
+            num_tiles_in_image = num_width_tiles * num_height_tiles
+
+            # [hw, D]
+            global_features = images_embeds[tile_index]
+
+            # [num_height_tiles * num_width_tiles, hw, D]
+            local_features = images_embeds[tile_index + 1:tile_index + 1 +
+                                           num_tiles_in_image]
+            tile_index += num_tiles_in_image + 1
+
+            # format global and local features
+            # ----------------- global view add newline -----------------
+            # [hw, D] -> [h, w, D]
+            global_features = global_features.view(h, w, n_dim)
+
+            # [D] -> [h, 1, D]
+            new_lines_in_global = repeat(self.image_newline, "d -> h 1 d", h=h)
+
+            # cat([h, w, D], [h, 1, D], dim=1) -> [h, w + 1, D]
+            global_features = torch.cat([global_features, new_lines_in_global],
+                                        dim=1)
+
+            # [h, w + 1, D] -> [h * (w + 1), D]
+            global_features = global_features.view(-1, n_dim)
+
+            # ----------------- local view add newline -----------------
+            # [num_height_tiles * num_width_tiles, h * w, D] ->
+            # [num_height_tiles * h, num_width_tiles * w, D]
+            local_features = rearrange(local_features,
+                                       "(th tw) (h w) d -> (th h) (tw w) d",
+                                       th=num_height_tiles,
+                                       tw=num_width_tiles,
+                                       h=h,
+                                       w=w)
+
+            # [D] -> [num_height_tiles * h, 1, D]
+            new_lines_in_local = repeat(self.image_newline,
+                                        "d -> (th h) 1 d",
+                                        th=num_height_tiles,
+                                        h=h)
+
+            # [num_height_tiles * h, num_width_tiles * w + 1, D]
+
local_features = torch.cat([local_features, new_lines_in_local], + dim=1) + + # [num_height_tiles * h, num_width_tiles * w + 1, D] + # --> [(num_height_tiles * h) * (num_width_tiles * w + 1), D] + local_features = local_features.view(-1, n_dim) + + # merge global and local tiles + if self.global_view_pos == "head": + global_local_features = torch.cat([ + global_features, + self.view_seperator[None, :], + local_features, + ]) + else: + global_local_features = torch.cat([ + local_features, + self.view_seperator[None, :], + global_features, + ]) + + vision_embeddings.append(global_local_features) + return vision_embeddings + + def _process_image_input( + self, image_input: DeepseekVL2ImageInputs) -> torch.Tensor: + if image_input["type"] == "image_embeds": + image_data = image_input["data"] + if is_list_of(image_data, torch.Tensor): + # it's already a list of tensors + return image_data + if len(image_data.shape) == 3: + # 3D tensor + return list(torch.unbind(image_data, dim=0)) + raise ValueError( + "We expect batched 2D tensors;" + "this can be either a list of 2D tensors or a single 3D tensor." + ) + + pixel_values = image_input["data"] + images_spatial_crop = image_input["images_spatial_crop"] + + return self._pixel_values_to_embedding( + pixel_values=pixel_values, images_spatial_crop=images_spatial_crop) + + def get_multimodal_embeddings(self, **kwargs: object) -> torch.Tensor: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.image_token_id) + return inputs_embeds + + def forward(self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object): + + if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None + + hidden_states = self.language_model(input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds) + + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + return self.language_model.sample(logits, sampling_metadata) + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + + loader = AutoWeightsLoader(self) + autoloaded_weights = loader.load_weights(weights, + mapper=self.hf_to_vllm_mapper) + return autoloaded_weights diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index 
f138d13630263..948560b4906b8 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -17,14 +17,35 @@ from vllm.sequence import IntermediateTensors from .utils import maybe_prefix +class DummyInputLayerNorm(nn.Module): + + def __init__(self, weight=None, bias=None): + super().__init__() + self.weight = nn.Parameter(weight) if weight is not None else None + self.bias = nn.Parameter(bias) if bias is not None else None + + def forward(self, x): + return x + + +class DummyOutputNorm(nn.Module): + + def forward(self, x, residual): + if residual is None: + return x + else: + return x, residual + + class EAGLE(nn.Module): """This class implements the EAGLE draft model from the paper: https://arxiv.org/pdf/2401.15077 Reference implementation: https://github.com/SafeAILab/EAGLE Differences from reference implementation: 1. In reference, LlamaDecoderLayer implementation doesn't have - input_layernorm for 1st decoder layer (https://github.com/SafeAILab/EAGLE/blob/7d065d084443fbfd386f88839efd7193c12be869/eagle/model/cnets.py#L427) - but we do as HF implementation also does. + input_layernorm for 1st decoder layer (https://github.com/SafeAILab/EAGLE/blob/7d065d084443fbfd386f88839efd7193c12be869/eagle/model/cnets.py#L427). + Following this approach, our implementation also disables + the input_layernorm for the first decoder layer. 2. We allow any decoder layer to be used in EAGLE whereas in reference decoder layer is fixed to be LlamaDecoderLayer. 3. We have an optional token_map which reduces draft vocab to most @@ -46,10 +67,20 @@ class EAGLE(nn.Module): self.model = model_cls(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) + self.fc = nn.Linear(config.model.hidden_size * 2, config.model.hidden_size, bias=getattr(self.config, "eagle_fc_bias", False)) + # Modify layer normalization and residual connections as suggested + # in the EAGLE framework: https://github.com/SafeAILab/EAGLE + # While weights and biases are generally not needed, + # they are retained here to support certain unit tests + # (e.g., spec_decode/e2e/test_eagle_correctness.py). 
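+        # (Both replacements are pass-throughs: DummyInputLayerNorm keeps
+        # the original norm's weight so existing checkpoints still load but
+        # returns its input unchanged, and DummyOutputNorm forwards the
+        # hidden states and residual untouched.)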
+ self.model.model.layers[0].input_layernorm = DummyInputLayerNorm( + weight=self.model.model.layers[0].input_layernorm.weight) + self.model.model.norm = DummyOutputNorm() + self.orig_vocab_size = config.vocab_size self.truncated_vocab_size = config.truncated_vocab_size self.unpadded_vocab_size = self.truncated_vocab_size diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 8324a563edd64..eab3bf0756fca 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -39,8 +39,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -430,14 +428,6 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "lm_head": "output_embeddings", } embedding_padding_modules = ["lm_head"] - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "c_fc_0": ("gate_up_proj", 0), - "c_fc_1": ("gate_up_proj", 1), - } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -447,6 +437,7 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP): self.config = config self.lora_config = lora_config + self.quant_config = quant_config self.transformer = ExaoneModel( vllm_config=vllm_config, @@ -540,12 +531,14 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP): # processed with quantization, LoRA, fine-tuning, etc. if self.config.tie_word_embeddings and "lm_head.weight" in name: continue - if scale_name := get_compressed_tensors_cache_scale(name): - # Loading kv cache scales for compressed-tensors quantization + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales param = params_dict[scale_name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = loaded_weight[0] + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue diff --git a/vllm/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py new file mode 100644 index 0000000000000..b93a68680375d --- /dev/null +++ b/vllm/model_executor/models/fairseq2_llama.py @@ -0,0 +1,151 @@ +# Copyright 2024 The vLLM team. +# Copyright 2024 Meta Platforms, Inc. and affiliates. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
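+# (fairseq2 serializes checkpoints as {"model_key": name, name: state_dict};
+# load_weights below unwraps this wrapper and remaps fairseq2 parameter
+# names onto vLLM's Llama layout.)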
+"""Llama model for fairseq2 weights.""" + +from typing import Iterable, Set, Tuple + +import torch +from torch.nn import Parameter + +from vllm.config import VllmConfig +from vllm.distributed import (get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size) +from vllm.model_executor.layers.linear import set_weight_attrs +from vllm.model_executor.models.llama import LlamaForCausalLM + +from .utils import AutoWeightsLoader, WeightsMapper + + +class Fairseq2LlamaForCausalLM(LlamaForCausalLM): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + self.tp_rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + # For the model loader to read only the relevant checkpoint files + self.allow_patterns_overrides = [ + # either the full checkpoint + "model.pt", + # or the tp-sharded checkpoint of the current rank + f"model.{self.tp_rank}.pt", + ] + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + # fairseq2's serialization adds a wrapper to usual .pt state_dict's: + # { "model_key": my_model_name, "my_model_name": state_dict } + # which we first need to unpack + weights_wrapped = dict(weights) + weights = weights_wrapped[ + weights_wrapped["model_key"]].items() # type: ignore + + # remap keys + fs2_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "decoder_frontend.embed.": "model.embed_tokens.", + "decoder.": "model.", + "final_proj.": "lm_head.", + }, + orig_to_new_substr={ + ".self_attn_layer_norm.": ".input_layernorm.", + ".ffn_layer_norm.": ".post_attention_layernorm.", + ".self_attn.output_proj.": ".self_attn.o_proj.", + ".ffn.gate_proj.": ".mlp.gate_proj.", + ".ffn.inner_proj.": ".mlp.up_proj.", + ".ffn.output_proj.": ".mlp.down_proj.", + ".layer_norm.": ".norm.", + }, + ) + weights = fs2_to_vllm_mapper.apply(weights) + + params = dict(self.named_parameters()) + + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] + if self.config.tie_word_embeddings else None), + ) + return loader.load_weights( + (self.reshape_fairseq2_weights(name, loaded_weight, params) + for name, loaded_weight in weights)) + + def flag_sharded_weights(self, params: dict[str, Parameter]): + """Sets the `is_sharded_weight` flag to True for all sharded weights""" + for name, param in params.items(): + modules = name.split(".") + if "norm" in name and len(param.size()) < 2: + # layer norms are not sharded + continue + elif any(emb in modules for emb in ["embed_tokens", "lm_head"]): + # for now we repeat embedding layers for compatibility + continue + else: + # all other layers are sharded + set_weight_attrs(param, {"is_sharded_weight": True}) + + def reshape_fairseq2_weights( + self, + name: str, + loaded_weight: torch.Tensor, + params: dict[str, Parameter], + ) -> Tuple[str, torch.Tensor]: + """Reshape fairseq2's weights.""" + + def permute(w: torch.Tensor, n_heads: int) -> torch.Tensor: + attn_in = self.config.head_dim * n_heads + # check for a sharded weight on dim 0 + if attn_in // self.tp_size == w.size()[0]: + attn_in //= self.tp_size + n_heads //= self.tp_size + attn_out = self.config.hidden_size + return (w.view(n_heads, attn_in // n_heads // 2, 2, + attn_out).transpose(1, + 2).reshape(attn_in, attn_out)) + + modules = name.split(".") + + # rotary embeds should be sliced + if "k_proj" in modules: + loaded_weight = permute(loaded_weight, + self.config.num_key_value_heads) + + elif "q_proj" in modules: + loaded_weight = 
permute(loaded_weight, + self.config.num_attention_heads) + + # We make the loaded weights compatible with both + # full checkpoints and tp sharded checkpoints. + # Embeddings are repeated to fit the vocab size. + # Other weights are flagged for the weight_loader calls. + if any(emb in modules for emb in ["embed_tokens", "lm_head"]): + # Embeddings are sharded on dim 0 + dim = 0 + # In fairseq2, vocab size has to be divisible by tp_size + # so we don't worry about padding + if self.tp_size > 1 and loaded_weight.shape[ + dim] < self.config.vocab_size: + assert loaded_weight.shape[ + dim] * self.tp_size == self.config.vocab_size, \ + "vocab_size should be divisible by tp_size." + repeats = [1] * len(loaded_weight.size()) + repeats[dim] = self.tp_size + # repeat to match vocab size and to be easily 'narrow'able + loaded_weight = loaded_weight.repeat(repeats) + set_weight_attrs(params[name], {"is_sharded_weight": False}) + # if embeddings are sharded, the rest is too + if "embed_tokens" in modules: + self.flag_sharded_weights(params) + + return name, loaded_weight diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 8660cf79b9cdb..c503a368e8244 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -409,9 +409,9 @@ class FalconModel(nn.Module): class FalconForCausalLM(nn.Module, SupportsPP): - - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = {} + packed_modules_mapping = { + "query_key_value": ["query_key_value"], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 59af5f0b3ae98..63e7147f84e03 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -16,7 +16,7 @@ """ PyTorch Fuyu model.""" import math from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict) + TypedDict, Union) import torch import torch.nn as nn @@ -149,14 +149,10 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]): mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], ) -> BatchFeature: - if not mm_data: # Avoid warning from HF logger for text-only input - # Input_ids format: bos_token_id + prompt_token_ids + boa_token_id - # Tokenizer won't add boa_token_id by default, we add it manually. 
- tokenizer = self.info.get_tokenizer() - boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore - prompt_ids = tokenizer.encode(prompt) + [boa_token_id] + prompt_ids = self.info.get_tokenizer().encode(prompt) + prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") processed_outputs = super()._call_hf_processor( @@ -181,6 +177,16 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]): return processed_outputs + def _apply_hf_processor_tokens_only( + self, + prompt_tokens: list[int], + ) -> list[int]: + # HF processor adds boa_token_id + tokenizer = self.info.get_tokenizer() + boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore + + return prompt_tokens + [boa_token_id] + def _get_mm_fields_config( self, hf_inputs: BatchFeature, @@ -223,11 +229,11 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]): def apply( self, - prompt_text: str, + prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], ) -> MultiModalInputsV2: - result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) # Only |SPEAKER| (image) tokens should be considered as placeholders, # so we ignore the trailing bos_token_id diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index b28715c48adfb..6de0c866bc2f0 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -349,15 +349,6 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "gate_up_proj", "down_proj", ] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } # Gemma does not apply LoRA to the embedding layer. 
embedding_modules = {} diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index f4530e4771960..f0dc7693974be 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -31,8 +31,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -254,6 +252,7 @@ class Gemma2Model(nn.Module): cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config + self.quant_config = quant_config self.embed_tokens = VocabParallelEmbedding( config.vocab_size, @@ -329,7 +328,8 @@ class Gemma2Model(nn.Module): params_dict = dict(self.named_parameters()) loaded_params: Set[str] = set() for name, loaded_weight in weights: - if scale_name := get_compressed_tensors_cache_scale(name): + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): # Loading kv cache scales for compressed-tensors quantization param = params_dict[scale_name] weight_loader = getattr(param, "weight_loader", @@ -399,16 +399,6 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): embedding_modules = {} embedding_padding_modules = [] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py index 39a5736eb199b..51922e6f2d03d 100644 --- a/vllm/model_executor/models/glm4_vision_encoder.py +++ b/vllm/model_executor/models/glm4_vision_encoder.py @@ -42,7 +42,8 @@ class PatchEmbedding(nn.Module): torch.Tensor Transformed tensor with shape (B, L, D) """ - images = images.to(self.proj.weight.device) + images = images.to(device=self.proj.weight.device, + dtype=self.proj.weight.dtype) x = self.proj(images) x = x.flatten(2).transpose(1, 2) cls_token = self.cls_embedding.expand(x.shape[0], -1, -1) diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index fd926ff0254d4..1656a3cc9e46d 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -198,7 +198,10 @@ class GPT2Model(nn.Module): assert not config.scale_attn_by_inverse_layer_idx assert not config.reorder_and_upcast_attn self.embed_dim = config.hidden_size - self.wte = VocabParallelEmbedding(config.vocab_size, self.embed_dim) + self.wte = VocabParallelEmbedding(config.vocab_size, + self.embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.wte") self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, @@ -259,7 +262,9 @@ class GPT2LMHeadModel(nn.Module, SupportsPP): self.lm_head = 
self.transformer.wte else: self.lm_head = ParallelLMHead(self.config.vocab_size, - self.config.hidden_size) + self.config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.lm_head") self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( @@ -304,7 +309,7 @@ class GPT2LMHeadModel(nn.Module, SupportsPP): params_dict = dict(self.named_parameters(remove_duplicate=False)) loaded_params: Set[str] = set() for name, loaded_weight in weights: - if "lm_head.weight" in name: + if name.startswith("lm_head"): # GPT-2 ties the weights of the embedding layer and the final # linear layer. continue diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 4829578a56959..08298cc0db36f 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -313,6 +313,19 @@ class GPTJForCausalLM(nn.Module, SupportsPP): for name, loaded_weight in weights: if "attn.bias" in name or "attn.masked_bias" in name: continue + + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index a91ed4158a73f..ddd2d7a16b242 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -39,8 +39,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -362,14 +360,6 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "lm_head": "output_embeddings", } embedding_padding_modules = ["lm_head"] - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -379,6 +369,7 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP): self.config = config self.lora_config = lora_config + self.quant_config = quant_config self.model = GraniteModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) @@ -482,12 +473,14 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP): # processed with quantization, LoRA, fine-tuning, etc. 
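            # (with tied embeddings the input embedding matrix also serves
            # as the output projection, so the duplicate lm_head entry is
            # safe to drop)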
if self.config.tie_word_embeddings and "lm_head.weight" in name: continue - if scale_name := get_compressed_tensors_cache_scale(name): - # Loading kv cache scales for compressed-tensors quantization + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales param = params_dict[scale_name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = loaded_weight[0] + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 17e772e7faa32..d16a77f862d98 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -662,16 +662,6 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, "down_proj", ] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } - embedding_modules = {} embedding_padding_modules = [] diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index b51cba86ec1a4..c5fd0d9332379 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -39,13 +39,13 @@ class SupportsMultiModal(Protocol): The output embeddings must be one of the following formats: - - A list or tuple of 2D tensors, where each tensor corresponds to - each input multimodal data item (e.g, image). + - A list or tuple of 2D tensors, where each tensor corresponds to + each input multimodal data item (e.g, image). - A single 3D tensor, with the batch dimension grouping the 2D tensors. Note: - The returned multimodal embeddings must be in the same order as - the appearances of their corresponding multimodal data item in the + The returned multimodal embeddings must be in the same order as + the appearances of their corresponding multimodal data item in the input prompt. """ ... 
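The hunks above for commandr, dbrx, exaone, gpt_j, granite, and llama all add the same kv-cache scale branch. Below is a minimal sketch of that shared pattern; `SimpleQuantConfig` and the tensor names are invented for illustration, and only the control flow mirrors the diffs.

```python
# Minimal sketch of the kv-cache scale loading branch added across the
# model files above. SimpleQuantConfig is a hypothetical stand-in for a
# vLLM quantization config; only get_cache_scale's contract (parameter
# name or None) matches what the diffs rely on.
from typing import Dict, Iterable, Optional, Set, Tuple

import torch


class SimpleQuantConfig:
    """Toy stand-in for a quantization config."""

    def get_cache_scale(self, name: str) -> Optional[str]:
        # Map a checkpoint key such as "layers.0.self_attn.k_scale"
        # onto the parameter name the model registered, or return None.
        if name.endswith((".k_scale", ".v_scale")):
            prefix, scale = name.rsplit(".", 1)
            return f"{prefix}.attn.{scale}"
        return None


def load_weights(
    weights: Iterable[Tuple[str, torch.Tensor]],
    params_dict: Dict[str, torch.nn.Parameter],
    quant_config: Optional[SimpleQuantConfig],
) -> Set[str]:
    loaded_params: Set[str] = set()
    for name, loaded_weight in weights:
        if (quant_config is not None
                and (scale_name := quant_config.get_cache_scale(name))):
            param = params_dict[scale_name]
            # Scales may be stored as 0-dim tensors or 1-element vectors;
            # both collapse to a scalar, matching the diffs above.
            loaded_weight = (loaded_weight if loaded_weight.dim() == 0
                             else loaded_weight[0])
            param.data.copy_(loaded_weight)
            loaded_params.add(scale_name)
            continue
        # ... regular stacked-parameter handling elided ...
    return loaded_params
```

Routing the lookup through `quant_config` rather than the removed `get_compressed_tensors_cache_scale` helper lets each quantization backend decide how checkpoint scale names map onto registered parameters, which is why the identical branch can be pasted into every model's `load_weights`.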
diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 7ff68bd60e8ad..8ad009d5101e4 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -271,7 +271,7 @@ class InternSdpaAttention(nn.Module): v = v.transpose(1, 2) x = F.scaled_dot_product_attention(q, k, v, scale=self.scale) - x = x.transpose(1, 2).view(B, N, -1) + x = x.transpose(1, 2).reshape(B, N, -1) x = self.proj(x) return x diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 8623da99574bb..a5bd418801f2c 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -38,8 +38,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -97,20 +95,19 @@ class LlamaMLP(nn.Module): class LlamaAttention(nn.Module): - def __init__( - self, - config: LlamaConfig, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, - max_position_embeddings: int = 8192, - quant_config: Optional[QuantizationConfig] = None, - bias: bool = False, - cache_config: Optional[CacheConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, + config: LlamaConfig, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + bias_o_proj: bool = False, + cache_config: Optional[CacheConfig] = None, + prefix: str = "") -> None: super().__init__() layer_idx = extract_layer_index(prefix) self.hidden_size = hidden_size @@ -150,13 +147,14 @@ class LlamaAttention(nn.Module): self.o_proj = RowParallelLinear( input_size=self.total_num_heads * self.head_dim, output_size=hidden_size, - bias=bias, + bias=bias_o_proj, quant_config=quant_config, prefix=f"{prefix}.o_proj", ) is_neox_style = True - if quant_config is not None and quant_config.get_name() == "gguf": + is_gguf = quant_config and quant_config.get_name() == "gguf" + if is_gguf and config.model_type == "llama": is_neox_style = False self.rotary_emb = get_rope( @@ -230,6 +228,11 @@ class LlamaDecoderLayer(nn.Module): # Support internlm/internlm-7b with bias attention_bias = getattr(config, "attention_bias", False) or getattr( config, "bias", False) + bias_o_proj = attention_bias + # support internlm/internlm3-8b with qkv_bias + if hasattr(config, 'qkv_bias'): + attention_bias = config.qkv_bias + self.self_attn = LlamaAttention( config=config, hidden_size=self.hidden_size, @@ -241,6 +244,7 @@ class LlamaDecoderLayer(nn.Module): max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, + bias_o_proj=bias_o_proj, cache_config=cache_config, prefix=f"{prefix}.self_attn", ) @@ -300,6 +304,7 @@ class LlamaModel(nn.Module): lora_config = vllm_config.lora_config self.config = config + self.quant_config = quant_config self.padding_idx = 
config.pad_token_id lora_vocab = (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) if lora_config else 0 @@ -390,12 +395,14 @@ class LlamaModel(nn.Module): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue - if scale_name := get_compressed_tensors_cache_scale(name): - # Loading kv cache scales for compressed-tensors quantization + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales param = params_dict[scale_name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = loaded_weight[0] + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue @@ -477,16 +484,6 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): } embedding_padding_modules = ["lm_head"] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } - # Mistral/Llama models can also be loaded with --load-format mistral # from consolidated.safetensors checkpoints mistral_mapping = { diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 8d94acf3b21d5..6cceded43a79d 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -5,9 +5,11 @@ from typing import (Final, Iterable, List, Literal, Mapping, Optional, import torch import torch.nn as nn +from packaging.version import Version from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig, PixtralVisionConfig, PretrainedConfig, SiglipVisionConfig) +from transformers import __version__ as TRANSFORMERS_VERSION from transformers.models.llava import LlavaProcessor from transformers.models.pixtral import PixtralProcessor @@ -463,14 +465,10 @@ def init_vision_tower_for_llava( info=_build_llava_or_pixtral_hf_info, dummy_inputs=LlavaDummyInputsBuilder) class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), + + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"] } def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: @@ -720,11 +718,32 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): return loader.load_weights(weights) +class MantisProcessingInfo(LlavaProcessingInfo): + + def get_hf_processor(self): + hf_config = self.get_hf_config() + vision_info = self.get_vision_encoder_info() + + if Version(TRANSFORMERS_VERSION) < Version("4.48"): + # BUG: num_additional_image_tokens = 0 but treated as 1, + # so we set vision_feature_select_strategy to None to offset this + vision_feature_select_strategy = None + else: + # FIXED: https://github.com/huggingface/transformers/pull/33424/files#diff-6a37acc21efcadaae622b079b2712a131131448ff64262bd219aa346aeec38faL150 + vision_feature_select_strategy = hf_config.vision_feature_select_strategy # noqa: E501 + + return self.ctx.get_hf_processor( + LlavaProcessor, + 
patch_size=vision_info.get_patch_size(), + vision_feature_select_strategy=vision_feature_select_strategy, + ) + + class MantisMultiModalProcessor(LlavaMultiModalProcessor): def apply( self, - prompt_text: str, + prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], ) -> MultiModalInputsV2: @@ -737,7 +756,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): image_height=-1, ) - result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) mm_items = self._to_mm_items(mm_data) mm_item_counts = mm_items.get_all_counts() @@ -760,7 +779,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): ) ]) - prompt_ids, prompt_text, _ = self._apply_prompt_replacements( + prompt_ids, prompt, _ = self._apply_prompt_replacements( result["prompt_token_ids"], mantis_mm_repls, mm_item_counts, @@ -788,7 +807,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): return MultiModalInputsV2( type="multimodal", - prompt=prompt_text, + prompt=prompt, prompt_token_ids=prompt_ids, mm_kwargs=mm_kwargs, mm_placeholders=mm_placeholder_ranges, @@ -798,7 +817,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): # To use this model, please use # `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` @MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor, - info=LlavaProcessingInfo, + info=MantisProcessingInfo, dummy_inputs=LlavaDummyInputsBuilder) class MantisForConditionalGeneration(LlavaForConditionalGeneration): pass diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 78a47e64d9afc..6faa79f65d8de 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -19,8 +19,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, NestedTensors) -from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems, - VideoProcessorItems) +from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, + VideoEmbeddingItems, VideoProcessorItems) from vllm.multimodal.processing import PromptReplacement from vllm.multimodal.profiling import ProcessorInputs from vllm.sequence import IntermediateTensors @@ -145,6 +145,10 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo): return (unpadded_features, newline_features) + def get_image_size_with_most_features(self) -> ImageSize: + # NOTE: This hardcoded value is found via processor tests + return ImageSize(width=1153, height=944) + def _get_num_frame_tokens( self, *, @@ -550,10 +554,12 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, # Preserve the order of modalities if there are multiple of them # from the order of kwargs. 
for input_key in kwargs: - if input_key == "pixel_values" and "images" not in modalities: + if input_key in ("pixel_values", + "image_embeds") and "images" not in modalities: modalities["images"] = self._parse_and_validate_image_input( **kwargs) - if input_key == "pixel_values_videos" and "videos" not in modalities: # noqa E501 + if input_key in ("pixel_values_videos", + "video_embeds") and "videos" not in modalities: modalities["videos"] = self._parse_and_validate_video_input( **kwargs) diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 5a0f202364f26..6254d26c7060d 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -534,16 +534,6 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): } embedding_padding_modules = ["lm_head"] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index e9d7eada1d16c..5e1e6c6fa6141 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -241,11 +241,5 @@ class MiniCPM3ForCausalLM(MiniCPMForCausalLM): # `embedding_modules` and `embedding_padding_modules` # are inherited from MiniCPMForCausalLM - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } - def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""): return MiniCPM3Model(vllm_config=vllm_config, prefix=prefix) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 8f36437d47d9e..1aa529056893b 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -657,7 +657,7 @@ class MiniCPMV2_0(MiniCPMVBaseModel): quant_config: Optional[QuantizationConfig], prefix: str = "", ) -> nn.Module: - # TODO: refactor this vision model + # TODO: refactor vision model through timm wrapper from transformers try: import timm except ImportError: @@ -761,16 +761,6 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA): "kv_proj", ] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } - embedding_modules = {} embedding_padding_modules = [] @@ -881,16 +871,6 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): "kv_proj", ] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } - embedding_modules = {} embedding_padding_modules = [] diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index a5b364fe5ec85..da415cdae96ed 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -347,6 +347,7 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, 
SupportsPP): lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config + self.quant_config = quant_config self.model = MixtralModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) @@ -428,6 +429,18 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP): if "rotary_emb.inv_freq" in name: continue + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index c5046e06edecb..2554281610a30 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -123,6 +123,13 @@ def input_processor_for_mllama( assert is_list_of(image_data, Image.Image) + num_image_tokens = dec_inputs['prompt_token_ids'].count( + MLLAMA_IMAGE_TOKEN_ID) + if num_image_tokens != len(image_data): + raise ValueError( + f"The number of image tokens ({num_image_tokens}) must be" + f" the same as the number of images ({len(image_data)})") + # Since only the last group of consecutive images # are attended by the decoded tokens, we only need to # get the number of tiles for those images. @@ -1100,20 +1107,16 @@ class MllamaForCausalLM(nn.Module): @INPUT_REGISTRY.register_dummy_encoder_data(dummy_encoder_data_for_mllama) @INPUT_REGISTRY.register_input_processor(input_processor_for_mllama) class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal): - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"] } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config + self.quant_config = quant_config self.vocab_size = config.text_config.vocab_size self.hidden_size = config.text_config.hidden_size self.max_num_tiles = config.vision_config.max_num_tiles @@ -1427,6 +1430,17 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal): name = name.replace('patch_embedding.weight', 'patch_embedding._linear.weight') loaded_weight = loaded_weight.view(loaded_weight.shape[0], -1) + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + updated_params.add(scale_name) + continue for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue @@ -1493,6 +1507,8 @@ def convert_sparse_cross_attention_mask_to_dense( dense_mask[seq_start + start:seq_start + end, tile_start:tile_start + tile] = 1 tile_start += 
tile + assert ts != -1 + assert td != 0 tile_range_for_decode.append((ts, ts + td)) seq_start += length diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 2e60bc719f096..5c7ae0deefcd8 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -23,7 +23,8 @@ from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul +from vllm.model_executor.layers.activation import (MulAndSilu, QuickGELU, + SiluAndMul) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, @@ -462,15 +463,6 @@ class MolmoAttention(nn.Module): return output -class SwiGLU(nn.Module): - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x, gate = x.chunk(2, dim=-1) - # Note that the order is reversed compared to - # SiluAndMul. - return x * F.silu(gate) - - class LanuageModelMLP(nn.Module): """Molmo's LLM mlp.""" @@ -489,7 +481,7 @@ class LanuageModelMLP(nn.Module): quant_config=quant_config, ) # Activation function. - self.act_fn = SwiGLU() + self.act_fn = MulAndSilu() # Feed-forward output projection. self.down_proj = RowParallelLinear( self.intermediate_size, @@ -1068,7 +1060,7 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs): trust_remote_code=model_config.trust_remote_code) # NOTE: message formatting for raw text prompt is only applied for - # offline inference; for online inference, the prompt is always in + # offline inference; for online serving, the prompt is always in # instruction format and tokenized. if prompt is not None and re.match(r"^User:[\s\S]*?(Assistant:)*$", prompt): @@ -1193,12 +1185,6 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, embedding_modules = {} embedding_padding_modules = [] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - "gate_proj": ("merged_linear", 0), - "up_proj": ("merged_linear", 1), - } - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 34cb9981c167b..2340283b69665 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -395,12 +395,6 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "lm_head": "output_embeddings", } embedding_padding_modules = ["lm_head"] - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -411,6 +405,7 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP): self.config = config self.lora_config = lora_config + self.quant_config = quant_config self.model = NemotronModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) @@ -495,6 +490,17 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. 
continue + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 7edafcd20b5db..ea1185aa80dc6 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -329,13 +329,9 @@ class OPTModel(nn.Module): class OPTForCausalLM(nn.Module, SupportsPP): - - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"] } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index f9e972688ddd1..59b7508a370f8 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -279,14 +279,6 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "fc2", ] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - } - embedding_modules = {} embedding_padding_modules = [] diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py index 937858ee3b8c2..34141511ea791 100644 --- a/vllm/model_executor/models/phi3.py +++ b/vllm/model_executor/models/phi3.py @@ -14,7 +14,3 @@ class Phi3ForCausalLM(LlamaForCausalLM): "gate_up_proj", ], } - - # BitandBytes specific attributes - # Initialize an empty dict when there is no stacked parameter mapping. 
- bitsandbytes_stacked_params_mapping = {} diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index da7e4cdbc6940..f47676b934e4e 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -20,6 +20,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -54,12 +55,12 @@ class HeadMajorColumnParallelLinear(MergedColumnParallelLinear): return load_column_parallel_weight(param, loaded_weight) -@torch.compile(dynamic=True) +@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) def quick_gelu(x): return x * torch.sigmoid(1.702 * x) -@torch.compile(dynamic=True) +@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) def gegelu(input, limit: Optional[float] = None): a_gelu, a_linear = input[..., ::2], input[..., 1::2] if limit is not None: diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index a1b1af35604db..7a230e5beb367 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -481,11 +481,11 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]): def apply( self, - prompt_text: str, + prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], ) -> MultiModalInputsV2: - result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) # Only <|image|> tokens should be considered as placeholders, # so we ignore the trailing bos_token_id diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 1febd62f2f705..881c09ea9db99 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -546,6 +546,7 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP): lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config + self.quant_config = vllm_config.quant_config self.model = PhiMoEModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) @@ -623,6 +624,18 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP): if "rotary_emb.inv_freq" in name: continue + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index baf955f6b515d..1345b381f0a99 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -1028,13 +1028,6 @@ class QWenLLM(QWenBaseModel): embedding_modules = {} embedding_padding_modules = [] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, 
weight_name, index - "w2": ("gate_up_proj", 0), - "w1": ("gate_up_proj", 1), - } - class QWenVL(QWenBaseModel, SupportsMultiModal): packed_modules_mapping = { diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index d20fb150f7e39..82de1c3574090 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -256,7 +256,15 @@ class Qwen2DecoderLayer(nn.Module): return hidden_states, residual -@support_torch_compile +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl, + # otherwise (seq_len, ). + "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + }) class Qwen2Model(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -279,6 +287,7 @@ class Qwen2Model(nn.Module): )) self.config = config + self.quant_config = quant_config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -364,6 +373,17 @@ class Qwen2Model(nn.Module): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue @@ -418,16 +438,6 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): embedding_modules = {} embedding_padding_modules = [] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 0dff9595c6c08..47d56175261e4 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -36,8 +36,9 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, - NestedTensors) +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputsV2, MultiModalKwargs, + NestedTensors, PlaceholderRange) from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -153,29 +154,24 @@ class Qwen2AudioMultiModalProcessor( mm_data: Mapping[str, object], mm_kwargs: Mapping[str, Any], ) -> BatchFeature: - mm_data = dict(mm_data) - audios = mm_data.pop("audios", []) + # Text-only input not supported in composite processor + if not mm_data or not mm_data.get("audios", []): + prompt_ids = self.info.get_tokenizer().encode(prompt) + prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) + return 
BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") - if audios: - mm_data["audios"] = audios + feature_extractor = self.info.get_feature_extractor(**mm_kwargs) + mm_kwargs = dict( + **mm_kwargs, + sampling_rate=feature_extractor.sampling_rate, + ) - feature_extractor = self.info.get_feature_extractor(**mm_kwargs) - mm_kwargs = dict( - **mm_kwargs, - sampling_rate=feature_extractor.sampling_rate, - ) - else: - # NOTE: WhisperFeatureExtractor cannot handle empty list of audios - pass - - processed_outputs = super()._call_hf_processor( + return super()._call_hf_processor( prompt=prompt, mm_data=mm_data, mm_kwargs=mm_kwargs, ) - return processed_outputs - def _get_mm_fields_config( self, hf_inputs: BatchFeature, @@ -192,8 +188,14 @@ class Qwen2AudioMultiModalProcessor( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self.info.get_hf_config() - placeholder = hf_config.audio_token_index + processor = self.info.get_hf_processor() + + # Use getattr with default to be compatible with transformers<4.48 + audio_token = getattr(processor, "audio_token", "<|AUDIO|>") + audio_bos_token = getattr(processor, "audio_bos_token", + "<|audio_bos|>") + audio_eos_token = getattr(processor, "audio_eos_token", + "<|audio_eos|>") feature_attention_mask = out_mm_kwargs.get("feature_attention_mask") if feature_attention_mask is None: @@ -214,12 +216,16 @@ class Qwen2AudioMultiModalProcessor( f"The audio {audio} (len={len(audio)}) is too short " "to be represented inside the model") - return [placeholder] * num_placeholders + return "".join([ + audio_bos_token, + audio_token * num_placeholders, + audio_eos_token, + ]) return [ PromptReplacement( modality="audio", - target=[placeholder], + target=audio_token, replacement=get_replacement_qwen2_audio, ) ] @@ -234,6 +240,26 @@ class Qwen2AudioMultiModalProcessor( # tokens than the number of audio items) return not hasattr(self.info.get_hf_processor(), "audio_token") + def apply( + self, + prompt: Union[str, list[int]], + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) + + # Only <|AUDIO|> tokens should be considered as placeholders, + # so we ignore the audio_bos_token and audio_eos_token + result["mm_placeholders"] = { + modality: [ + PlaceholderRange(offset=p["offset"] + 1, + length=p["length"] - 2) for p in ps + ] + for modality, ps in result["mm_placeholders"].items() + } + + return result + @MULTIMODAL_REGISTRY.register_processor( Qwen2AudioMultiModalProcessor, diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 988d682d36be3..593ce4857af0f 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -12,7 +12,7 @@ from vllm.attention import AttentionMetadata from vllm.config import VllmConfig from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import Pooler, PoolingType +from vllm.model_executor.layers.pooler import Pooler, PoolingType, SimplePooler from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput @@ -32,7 +32,7 @@ class ReLU(nn.Module): return self.activation(input) -class Qwen2ForRewardModel(nn.Module, SupportsLoRA, SupportsPP): +class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP): packed_modules_mapping = { 
"qkv_proj": [ "q_proj", @@ -60,7 +60,6 @@ class Qwen2ForRewardModel(nn.Module, SupportsLoRA, SupportsPP): config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config - pooler_config = vllm_config.model_config.pooler_config self.config = config self.lora_config = lora_config @@ -74,14 +73,11 @@ class Qwen2ForRewardModel(nn.Module, SupportsLoRA, SupportsPP): config.hidden_size, quant_config=quant_config), ReLU(), - RowParallelLinear(config.hidden_size, 1, + RowParallelLinear(config.hidden_size, + config.num_labels, quant_config=quant_config), ) - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.ALL, - normalize=False, - softmax=False) + self._pooler: SimplePooler self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -115,3 +111,31 @@ class Qwen2ForRewardModel(nn.Module, SupportsLoRA, SupportsPP): loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["lm_head."]) return loader.load_weights(weights) + + +class Qwen2ForRewardModel(Qwen2RewardBaseModel): + + def __init__(self, *, vllm_config, prefix=""): + vllm_config.model_config.hf_config.num_labels = 1 + super().__init__(vllm_config=vllm_config, prefix=prefix) + pooler_config = vllm_config.model_config.pooler_config + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.ALL, + normalize=False, + softmax=False) + + +class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel): + + def __init__(self, *, vllm_config, prefix=""): + vllm_config.model_config.hf_config.num_labels = 2 + super().__init__(vllm_config=vllm_config, prefix=prefix) + pooler_config = vllm_config.model_config.pooler_config + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.STEP, + normalize=False, + softmax=True, + step_tag_id=151651, + ) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 76a810e8f0c20..34d5c8ad089a3 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -67,11 +67,15 @@ from vllm.transformers_utils.config import uses_mrope from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, - init_vllm_registered_model, maybe_prefix) + init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings) from .vision import get_vit_attn_backend logger = init_logger(__name__) +# For profile run +_MAX_FRAMES_PER_VIDEO = 16 + # === Vision Inputs === # @@ -135,7 +139,7 @@ class Qwen2VLVideoEmbeddingInputs(TypedDict): - List[`torch.Tensor`]: A list of tensors holding all videos' features. Each tensor holds an video's features. - `torch.Tensor`: A tensor holding all videos' features - (concatenation of all videos' feature tensors). + (concatenation of all videos' feature tensors). 
Tensor shape: `(num_image_features, hidden_size)` - `num_image_features` varies based on @@ -611,6 +615,7 @@ class Qwen2VisionTransformer(nn.Module): # adapter x = self.merger(x) + return x def load_weights(self, weights: Iterable[Tuple[str, @@ -874,8 +879,8 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): max_image_tokens = self.get_max_image_tokens() * max_images max_total_frames = self._get_max_video_frames(seq_len - max_image_tokens) - - num_frames = max(max_total_frames // max(max_videos, 1), 1) + num_frames = min(max(max_total_frames // max(max_videos, 1), 1), + _MAX_FRAMES_PER_VIDEO) # Temporary workaround for https://github.com/huggingface/transformers/issues/35412 if num_frames > 1 and num_frames % 2 == 1: @@ -955,13 +960,14 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo] "image": hf_processor.image_token, "video": hf_processor.video_token, } + merge_length = image_processor.merge_size**2 def get_replacement_qwen2vl(item_idx: int, modality: str): grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx] assert isinstance(grid_thw, torch.Tensor) - num_tokens = grid_thw.prod() // merge_length + num_tokens = grid_thw.prod().item() // merge_length return placeholder[modality] * num_tokens return [ @@ -1038,16 +1044,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, embedding_modules = {} embedding_padding_modules = [] - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } - # To ensure correct weight loading and mapping. hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={ "lm_head.": "language_model.lm_head.", @@ -1057,11 +1053,8 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config: Qwen2VLConfig = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config - assert not cache_config.enable_prefix_caching, \ - "Qwen2-VL currently does not support prefix caching" self.config = config self.multimodal_config = multimodal_config @@ -1183,59 +1176,82 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, video_embeds=video_embeds, video_grid_thw=video_grid_thw) - def _process_image_input(self, - image_input: Qwen2VLImageInputs) -> torch.Tensor: + def _process_image_input( + self, image_input: Qwen2VLImageInputs) -> tuple[torch.Tensor, ...]: + + grid_thw = image_input["image_grid_thw"] + assert grid_thw.ndim == 2 + if image_input["type"] == "image_embeds": - return image_input["image_embeds"].type(self.visual.dtype) + image_embeds = image_input["image_embeds"].type(self.visual.dtype) + else: + pixel_values = image_input["pixel_values"].type(self.visual.dtype) + image_embeds = self.visual(pixel_values, grid_thw=grid_thw) - pixel_values = image_input["pixel_values"].type(self.visual.dtype) - image_embeds = self.visual(pixel_values, - grid_thw=image_input["image_grid_thw"]) - return image_embeds + # Split concatenated embeddings for each image item. 
+ merge_size = self.visual.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size + + return image_embeds.split(sizes.tolist()) + + def _process_video_input( + self, video_input: Qwen2VLVideoInputs) -> tuple[torch.Tensor, ...]: + + grid_thw = video_input["video_grid_thw"] + assert grid_thw.ndim == 2 - def _process_video_input(self, - video_input: Qwen2VLVideoInputs) -> torch.Tensor: if video_input["type"] == "video_embeds": - return video_input["video_embeds"].type(self.visual.dtype) + video_embeds = video_input["video_embeds"].type(self.visual.dtype) + else: + pixel_values_videos = video_input["pixel_values_videos"].type( + self.visual.dtype) + video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) - pixel_values_videos = video_input["pixel_values_videos"].type( - self.visual.dtype) - video_embeds = self.visual(pixel_values_videos, - grid_thw=video_input["video_grid_thw"]) - return video_embeds + # Split concatenated embeddings for each video item. + merge_size = self.visual.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size - def _merge_multimodal_embeddings( - self, - input_ids: torch.Tensor, - inputs_embeds: torch.Tensor, - multimodal_embeddings: torch.Tensor, - placeholder_token_id: int, - ) -> torch.Tensor: - mask = (input_ids == placeholder_token_id) - inputs_embeds[mask, :] = multimodal_embeddings - return inputs_embeds + return video_embeds.split(sizes.tolist()) + + def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: + modalities = {} + + # Preserve the order of modalities if there are multiple of them + # from the order of kwargs. + for input_key in kwargs: + if input_key in ("pixel_values", + "image_embeds") and "images" not in modalities: + modalities["images"] = self._parse_and_validate_image_input( + **kwargs) + if input_key in ("pixel_values_videos", + "video_embeds") and "videos" not in modalities: + modalities["videos"] = self._parse_and_validate_video_input( + **kwargs) + + return modalities def get_multimodal_embeddings( self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]: - image_input = self._parse_and_validate_image_input(**kwargs) - video_input = self._parse_and_validate_video_input(**kwargs) - if image_input is None and video_input is None: + modalities = self._parse_and_validate_multimodal_inputs(**kwargs) + if not modalities: return None - # We make a tuple of each embedding with its modality string. This is a - # temporary workaround for models to handle mixed modalities when - # get_multimodal_embeddings and get_input_embeddings are called - # separately. - # TODO(ywang96): Add support for mixed-modality inference for v1. - multimodal_embeddings: List[Tuple[NestedTensors, str]] = [] + # The result multimodal_embeddings is a tuple of tensors, with each + # tensor corresponding to a multimodal data item (image or video). + multimodal_embeddings: tuple[torch.Tensor, ...] = () - if image_input is not None: - image_embeds = self._process_image_input(image_input) - multimodal_embeddings.append((image_embeds, "image")) - if video_input is not None: - video_embeds = self._process_video_input(video_input) - multimodal_embeddings.append((video_embeds, "video")) + # NOTE: It is important to iterate over the keys in this dictionary + # to preserve the order of the modalities.
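To make the split arithmetic above concrete: with `spatial_merge_size = 2`, every 2x2 block of vision patches is merged into one embedding row, so each item contributes `t * h * w / merge_size**2` rows to the concatenated tensor. A toy, self-contained example (all shapes invented):

```python
import torch

# Two images with (t, h, w) patch grids of (1, 4, 6) and (1, 8, 4).
grid_thw = torch.tensor([[1, 4, 6], [1, 8, 4]])
merge_size = 2
sizes = grid_thw.prod(-1) // merge_size // merge_size  # tensor([6, 8])

hidden_size = 16
image_embeds = torch.randn(int(sizes.sum()), hidden_size)  # concatenated
per_item = image_embeds.split(sizes.tolist())
assert [t.shape[0] for t in per_item] == [6, 8]
```

Returning one tensor per item, rather than a single concatenated tensor tagged with a modality string, is what allows the kwargs-ordered loop that follows to interleave image and video items in their original order.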
+ for modality in modalities: + if modality == "images": + image_input = modalities["images"] + vision_embeddings = self._process_image_input(image_input) + multimodal_embeddings += vision_embeddings + if modality == "videos": + video_input = modalities["videos"] + video_embeddings = self._process_video_input(video_input) + multimodal_embeddings += video_embeddings return multimodal_embeddings @@ -1247,21 +1263,9 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: - for embeddings, modality in multimodal_embeddings: - if modality == "image": - inputs_embeds = self._merge_multimodal_embeddings( - input_ids, - inputs_embeds, - embeddings, - placeholder_token_id=self.config.image_token_id, - ) - if modality == "video": - inputs_embeds = self._merge_multimodal_embeddings( - input_ids, - inputs_embeds, - embeddings, - placeholder_token_id=self.config.video_token_id, - ) + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + [self.config.image_token_id, self.config.video_token_id]) return inputs_embeds def forward( diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 62840b8c1bcda..8d2719ca2d00d 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -47,6 +47,7 @@ _TEXT_GENERATION_MODELS = { "DeepseekV3ForCausalLM": ("deepseek_v3", "DeepseekV3ForCausalLM"), "ExaoneForCausalLM": ("exaone", "ExaoneForCausalLM"), "FalconForCausalLM": ("falcon", "FalconForCausalLM"), + "Fairseq2LlamaForCausalLM": ("fairseq2_llama", "Fairseq2LlamaForCausalLM"), "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"), "GlmForCausalLM": ("glm", "GlmForCausalLM"), @@ -60,6 +61,7 @@ _TEXT_GENERATION_MODELS = { "InternLMForCausalLM": ("llama", "LlamaForCausalLM"), "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"), "InternLM2VEForCausalLM": ("internlm2_ve", "InternLM2VEForCausalLM"), + "InternLM3ForCausalLM": ("llama", "LlamaForCausalLM"), "JAISLMHeadModel": ("jais", "JAISLMHeadModel"), "JambaForCausalLM": ("jamba", "JambaForCausalLM"), "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), @@ -125,6 +127,7 @@ _EMBEDDING_MODELS = { "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), + "Qwen2ForProcessRewardModel": ("qwen2_rm", "Qwen2ForProcessRewardModel"), "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), # [Multimodal] "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 @@ -149,6 +152,7 @@ _MULTIMODAL_MODELS = { "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501 "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"), + "DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"), "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"), diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index ba1a78ac640fd..5997a76890c9d 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -1,3 +1,4 @@ +import itertools from typing import 
Iterable, List, Optional, Tuple import torch @@ -20,6 +21,30 @@ from vllm.transformers_utils.config import ( from .interfaces import SupportsCrossEncoding +def roberta_task_weights_filter( + all_weights: Iterable[Tuple[str, torch.Tensor]] +) -> Tuple[Iterable[Tuple[str, torch.Tensor]], Iterable[Tuple[str, + torch.Tensor]]]: + """ + Separate task-specific weights that are applied on top + of the encoder-decoder bert base. + To do so, return two generators over the original iterator. + Also, remove the "roberta." prefix to make it loadable + from vanilla BertModel. + """ + # Duplicate the lazy iterator so both views can be consumed + # independently; tee buffers items until the slower view reads them. + all_weights1, all_weights2 = itertools.tee(all_weights) + + def encoder_decoder_weights(): + for name, weight in all_weights1: + if name.startswith("roberta."): + yield (name[len("roberta."):], weight) + + return encoder_decoder_weights(), ((n, w) for n, w in all_weights2 + if not n.startswith("roberta.")) + + class RobertaEmbedding(nn.Module): def __init__(self, config: RobertaConfig): @@ -152,6 +177,18 @@ class RobertaEmbeddingModel(BertEmbeddingModel): prefix=prefix, embedding_class=RobertaEmbedding) + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + weights = self.hf_to_vllm_mapper.apply(weights) + # Split into "roberta."-prefixed weights and all else, lazily. + # For use with models like FacebookAI/roberta-base. + bert_weights, task_weights = roberta_task_weights_filter(weights) + loaded = self.model.load_weights(bert_weights) + if not len(loaded): + # Fix for models like `sentence-transformers/stsb-roberta-base-v2` + # which use the same architecture, but have no "roberta" prefix. + loaded = self.model.load_weights(task_weights) + assert len(loaded), "Unable to load RobertaEmbeddingModel" + class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding): """A model that uses Roberta to provide embedding functionalities.
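A small, self-contained illustration of the `itertools.tee` split used by `roberta_task_weights_filter` (weight names and values invented): both generators draw from the same underlying iterator, with `tee` buffering whatever one side has read and the other has not.

```python
import itertools

# Stand-in for a lazy Iterable[Tuple[str, torch.Tensor]] weight stream.
weights = iter([
    ("roberta.encoder.layer.0.output.dense.weight", "W0"),
    ("classifier.dense.weight", "W1"),
])

w1, w2 = itertools.tee(weights)


def encoder_weights():
    for name, weight in w1:
        if name.startswith("roberta."):
            yield name[len("roberta."):], weight


task_weights = ((n, w) for n, w in w2 if not n.startswith("roberta."))

print(list(encoder_weights()))
# [('encoder.layer.0.output.dense.weight', 'W0')]
print(list(task_weights))
# [('classifier.dense.weight', 'W1')]
```

Note that when the first generator is drained completely before the second starts (as in `load_weights` above), `tee` ends up holding references to every pair for the second pass; that is the price of splitting one lazy stream two ways.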
@@ -181,20 +218,12 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding): def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - self_weights = [] - - def weight_filter(): - for name, weight in weights: - if name.startswith("roberta."): - yield (name[len("roberta."):], weight) - else: - self_weights.append((name, weight)) - - self.roberta.load_weights(weight_filter()) + bert_weights, task_weights = roberta_task_weights_filter(weights) + self.roberta.load_weights(bert_weights) params_dict = dict(self.named_parameters()) - for name, loaded_weight in self_weights: + for name, loaded_weight in task_weights: if name.startswith("classifier"): param = params_dict[name] weight_loader = getattr(param, "weight_loader", diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index a7cf65a0e36e4..37c5a4b5713b8 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -39,8 +39,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( - get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -401,14 +399,6 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "lm_head": "output_embeddings", } embedding_padding_modules = ["lm_head"] - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -417,6 +407,7 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP): lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config + self.quant_config = quant_config self.model = SolarModel( vllm_config=vllm_config, @@ -499,12 +490,14 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. 
continue - if scale_name := get_compressed_tensors_cache_scale(name): - # Loading kv cache scales for compressed-tensors quantization + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales param = params_dict[scale_name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = loaded_weight[0] + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 6b2107bef0a66..c9d1af78246a6 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -22,7 +22,7 @@ from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn -from transformers import PretrainedConfig +from transformers import StableLmConfig from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, VllmConfig @@ -50,8 +50,9 @@ from .utils import (is_pp_missing_parameter, class StablelmMLP(nn.Module): def __init__(self, - config: PretrainedConfig, - quant_config: Optional[QuantizationConfig] = None) -> None: + config: StableLmConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> None: super().__init__() self.config = config self.hidden_size = config.hidden_size @@ -59,10 +60,13 @@ class StablelmMLP(nn.Module): self.gate_up_proj = MergedColumnParallelLinear( config.hidden_size, [config.intermediate_size] * 2, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") self.down_proj = RowParallelLinear(config.intermediate_size, config.hidden_size, - bias=False) + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") self.act_fn = SiluAndMul() def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -75,7 +79,7 @@ class StablelmMLP(nn.Module): class StablelmAttention(nn.Module): def __init__(self, - config: PretrainedConfig, + config: StableLmConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "") -> None: @@ -116,11 +120,13 @@ class StablelmAttention(nn.Module): self.total_num_heads, self.total_num_key_value_heads, self.qkv_bias, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj") self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim, self.hidden_size, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.o_proj") self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.rotary_ndims, @@ -154,7 +160,7 @@ class StablelmDecoderLayer(nn.Module): def __init__( self, - config: PretrainedConfig, + config: StableLmConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -164,7 +170,7 @@ class StablelmDecoderLayer(nn.Module): cache_config, quant_config, prefix=f"{prefix}.self_attn") - self.mlp = StablelmMLP(config, quant_config) + self.mlp = StablelmMLP(config, quant_config, prefix=f"{prefix}.mlp") norm_eps = getattr(config, "norm_eps", getattr(config, "layer_norm_eps", 1e-05)) self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps) @@ -210,6 +216,8 @@ class StableLMEpochModel(nn.Module): self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, + quant_config=quant_config, + 
prefix=f"{prefix}.embed_tokens", ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, @@ -270,7 +278,8 @@ class StablelmForCausalLM(nn.Module, SupportsPP): prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.lm_head") if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 22189a517d313..1cd0dedfed2cb 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -88,12 +88,14 @@ class Starcoder2Attention(nn.Module): self.total_num_kv_heads, bias=self.use_bias, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, self.hidden_size, bias=self.use_bias, quant_config=quant_config, + prefix=f"{prefix}.o_proj", ) self.rotary_emb = get_rope( self.head_dim, @@ -129,19 +131,22 @@ class Starcoder2MLP(nn.Module): def __init__(self, config: Starcoder2Config, - quant_config: Optional[QuantizationConfig] = None): + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): super().__init__() self.c_fc = ColumnParallelLinear( config.hidden_size, config.intermediate_size, bias=config.use_bias, quant_config=quant_config, + prefix=f"{prefix}.c_fc", ) self.c_proj = RowParallelLinear( config.intermediate_size, config.hidden_size, bias=config.use_bias, quant_config=quant_config, + prefix=f"{prefix}.c_proj", ) self.act = get_act_fn(config.hidden_act) @@ -165,7 +170,9 @@ class Starcoder2DecoderLayer(nn.Module): cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn") - self.mlp = Starcoder2MLP(config, quant_config=quant_config) + self.mlp = Starcoder2MLP(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon) self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, @@ -213,8 +220,11 @@ class Starcoder2Model(nn.Module): self.vocab_size = config.vocab_size # TODO: consider padding_idx (currently removed) - self.embed_tokens = VocabParallelEmbedding(config.vocab_size, - config.hidden_size) + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.embed_tokens") self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Starcoder2DecoderLayer( @@ -279,6 +289,7 @@ class Starcoder2ForCausalLM(nn.Module, SupportsPP): org_num_embeddings=config.vocab_size, padding_size=DEFAULT_VOCAB_PADDING_SIZE, quant_config=quant_config, + prefix=f"{prefix}.lm_head", ) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index fada22d685dd6..9301422383696 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -16,7 +16,7 @@ from transformers.models.whisper.modeling_whisper import WhisperEncoder from vllm import envs from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn +from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn from 
vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.loader import DefaultModelLoader @@ -137,26 +137,15 @@ class UltravoxMultiModalProcessor( mm_kwargs: Mapping[str, object], ) -> BatchFeature: # Text-only input not supported in composite processor - if not mm_data: - tokenizer = self.info.get_tokenizer() - - prompt_ids = tokenizer.encode( - prompt, - add_special_tokens=False, # type: ignore - ) + if not mm_data or not mm_data.get("audios", []): + prompt_ids = self.info.get_tokenizer().encode(prompt) + prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") mm_data = dict(mm_data) audios = mm_data.pop("audios", []) assert isinstance(audios, list) - if not audios: - return super()._call_hf_processor( - prompt=prompt, - mm_data=mm_data, - mm_kwargs=mm_kwargs, - ) - feature_extractor = self.info.get_feature_extractor() mm_kwargs = dict( **mm_kwargs, @@ -188,6 +177,16 @@ class UltravoxMultiModalProcessor( ) return BatchFeature(combined_outputs) + def _apply_hf_processor_tokens_only( + self, + prompt_tokens: list[int], + ) -> list[int]: + # HF processor omits bos_token_id by setting add_special_tokens=False + tokenizer = self.info.get_tokenizer() + assert prompt_tokens[0] == tokenizer.bos_token_id + + return prompt_tokens[1:] + def _get_mm_fields_config( self, hf_inputs: BatchFeature, @@ -242,15 +241,6 @@ class StackAudioFrames(nn.Module): return audio_embeds -class FlippedSiluAndMul(SiluAndMul): - """Ultravox is trained with SwiGLU with flipped halves.""" - - def forward(self, x: torch.Tensor): - a, b = x.chunk(2, dim=-1) - flipped = torch.cat((b, a), dim=-1) - return super().forward(flipped) - - class UltravoxProjector(nn.Module): def __init__(self, config: UltravoxConfig): @@ -263,7 +253,7 @@ class UltravoxProjector(nn.Module): dim = self.hidden_dim if config.projector_act == "swiglu": - self.act = FlippedSiluAndMul() + self.act = MulAndSilu() dim = dim // 2 else: self.act = get_act_fn(config.projector_act) diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py index fc5a3e7fba674..a9ce8af15d3bb 100644 --- a/vllm/model_executor/parameter.py +++ b/vllm/model_executor/parameter.py @@ -56,8 +56,14 @@ class BasevLLMParameter(Parameter): def weight_loader(self): return self._weight_loader + def _is_1d_and_scalar(self, loaded_weight: torch.Tensor): + cond1 = self.data.ndim == 1 and self.data.numel() == 1 + cond2 = loaded_weight.ndim == 0 and loaded_weight.numel() == 1 + return (cond1 and cond2) + def _assert_and_load(self, loaded_weight: torch.Tensor): - assert self.data.shape == loaded_weight.shape + assert (self.data.shape == loaded_weight.shape + or self._is_1d_and_scalar(loaded_weight)) self.data.copy_(loaded_weight) def load_column_parallel_weight(self, loaded_weight: torch.Tensor): diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 343b9322ecc5e..1d7f5d57fa24e 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -8,10 +8,10 @@ from .registry import MultiModalRegistry MULTIMODAL_REGISTRY = MultiModalRegistry() """ The global :class:`~MultiModalRegistry` is used by model runners to -dispatch data processing according to its modality and the target model. +dispatch data processing according to the target model. 
See also: - :ref:`input-processing-pipeline` + :ref:`mm-processing` """ __all__ = [ diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 4941fbac963ca..fd3ec7e0ec8ce 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -90,10 +90,6 @@ class MultiModalPlugin(ABC): invoked to transform the data into a dictionary of model inputs. If `None` is provided, then the default input mapper is used instead. - - See also: - - :ref:`input-processing-pipeline` - - :ref:`enabling-multimodal-inputs` """ def wrapper(model_cls: N) -> N: @@ -126,10 +122,6 @@ class MultiModalPlugin(ABC): Raises: TypeError: If the data type is not supported. - - See also: - - :ref:`input-processing-pipeline` - - :ref:`enabling-multimodal-inputs` """ # Avoid circular import @@ -186,9 +178,6 @@ class MultiModalPlugin(ABC): for a model class. If `None` is provided, then the default calculation is used instead. - - See also: - :ref:`enabling-multimodal-inputs` """ def wrapper(model_cls: N) -> N: @@ -218,9 +207,6 @@ class MultiModalPlugin(ABC): If this registry is not applicable to the model, `0` is returned. The model is identified by ``model_config``. - - See also: - :ref:`enabling-multimodal-inputs` """ # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 8680e4175593b..4b63703585214 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -493,7 +493,8 @@ A dictionary containing placeholder ranges for each modality. class MultiModalInputsV2(TypedDict): """ - Represents the outputs of :class:`vllm.multimodal.MultiModalProcessor`, + Represents the outputs of + :class:`vllm.multimodal.processing.BaseMultiModalProcessor`, ready to be passed to vLLM internals. """ diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 07d883d5d7295..fa199a07b4cf8 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -404,71 +404,60 @@ def replace_text_matches( return "".join(texts) -def _iter_modality_placeholders( - prompt: list[int], - modality: str, - modality_repls: Sequence[BoundPromptReplacement], - modal_item_count: int, -) -> Iterable[PlaceholderInfo]: - if modal_item_count == 0: - return - - prompt_len = len(prompt) - item_idx = 0 - - start_idx = 0 - while start_idx < prompt_len: - found = False - - for repl_info in modality_repls: - replacement = repl_info.get_replacement(item_idx) - repl_tokens = replacement.token_ids - repl_len = len(repl_tokens) - end_idx = start_idx + repl_len - - if repl_len == 0 or end_idx > prompt_len: - continue - - if prompt[start_idx:end_idx] == repl_tokens: - yield PlaceholderInfo( - modality=modality, - item_idx=item_idx, - start_idx=start_idx, - replacement=repl_tokens, - ) - - item_idx += 1 - if item_idx >= modal_item_count: - return - - # Exclude overlapping matches - start_idx = end_idx - found = True - break - - if not found: - start_idx += 1 - - def _iter_placeholders( mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], prompt: list[int], mm_item_counts: Mapping[str, int], ) -> Iterable[PlaceholderInfo]: """ - For each modality, yield each set of placeholder tokens found in - :code:`prompt`. + Yield each set of placeholder tokens found in :code:`prompt`. + + Matches are exclusive even when multiple modalities share + the same placeholder tokens. In that case, the modality that + appears earlier in `mm_prompt_repls` takes priority. Note that empty matches are ignored. 
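+ For example, if two modalities share the same placeholder tokens and ``mm_item_counts = {"image": 1, "audio": 1}``, the first match in ``prompt`` is assigned to whichever modality appears first in ``mm_prompt_repls``, and the next non-overlapping match to the other.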
""" - for modality, modal_item_count in mm_item_counts.items(): - if modality in mm_prompt_repls: - yield from _iter_modality_placeholders( - prompt, - modality, - mm_prompt_repls[modality], - modal_item_count, - ) + prompt_len = len(prompt) + item_idx_by_modality = defaultdict[str, int](lambda: 0) + + start_idx = 0 + while start_idx < prompt_len: + found = False + + for modality, modality_repls in mm_prompt_repls.items(): + item_idx = item_idx_by_modality[modality] + if item_idx >= mm_item_counts.get(modality, 0): + continue + + for repl_info in modality_repls: + replacement = repl_info.get_replacement(item_idx) + repl_tokens = replacement.token_ids + repl_len = len(repl_tokens) + end_idx = start_idx + repl_len + + if repl_len == 0 or end_idx > prompt_len: + continue + + if prompt[start_idx:end_idx] == repl_tokens: + yield PlaceholderInfo( + modality=modality, + item_idx=item_idx, + start_idx=start_idx, + replacement=repl_tokens, + ) + + # Exclude overlapping matches + start_idx = end_idx + item_idx_by_modality[modality] += 1 + found = True + break + + if found: + break # Go back to the outer while loop + + if not found: + start_idx += 1 def find_mm_placeholders( @@ -725,15 +714,15 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_kwargs, ) - def _apply_hf_processor( + def _apply_hf_processor_text_mm( self, prompt_text: str, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], ) -> tuple[list[int], MultiModalKwargs]: """ - Wrapper of :meth:`_call_hf_processor` that applies - additional pre-processing and post-processing. + Apply the HF processor on the prompt text and multi-modal data + together. """ processor_data, passthrough_data = self._get_hf_mm_data(mm_items) @@ -753,40 +742,93 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): return prompt_ids, mm_kwargs - def _apply_hf_processor_missing( - self, - prompt_text: str, - mm_missing_data_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - ): + def _apply_hf_processor_text_only(self, prompt_text: str) -> list[int]: """ - Apply the HF processor on the full prompt text, but only on the - multi-modal data that are missing from the cache. + Apply the HF processor on the prompt text only. - Note: - We pass prompt text and multi-modal data into the HF processor - in separate calls to avoid HF prompt replacement being done for - cached items; instead, we rely on our own prompt replacement logic - (:meth:`_get_prompt_replacements`) for the full text. + Since HF processor requires that text and multi-modal items + correspond to each other, we create dummy multi-modal items + to go along with the text. """ - mm_missing_counts = mm_missing_data_items.get_all_counts() - - prompt_ids, _ = self._apply_hf_processor( + prompt_ids, _ = self._apply_hf_processor_text_mm( prompt_text=prompt_text, mm_items=MultiModalDataItems({}), hf_processor_mm_kwargs={}, ) - # Some HF processors (e.g. Qwen2-VL) expect corresponding - # multi-modal tokens to be in the prompt text + return prompt_ids + + def _apply_hf_processor_tokens_only( + self, + prompt_tokens: list[int], + ) -> list[int]: + """ + Apply the HF processor on the prompt tokens only. + + Most HF processors accept prompt text but not prompt tokens. + If the HF processor adds or removes tokens that are not related to + multi-modal data, you should override this method so it is consistent + with the output of :meth:`_apply_hf_processor_text_only` on the + corresponding text. 
+ """ + return prompt_tokens + + def _apply_hf_processor_mm_only( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalKwargs: + """ + Apply the HF processor on the multi-modal data only. + + Since HF processor requires that text and multi-modal items + correspond to each other, we generate dummy text using + :class:`DummyInputsBuilder` to go along with the multi-modal data. + """ + mm_counts = mm_items.get_all_counts() + dummy_inputs = self.dummy_inputs.get_dummy_processor_inputs( self.info.ctx.model_config.max_model_len, - mm_missing_counts, + mm_counts, ) - _, mm_missing_kwargs = self._apply_hf_processor( + _, mm_kwargs = self._apply_hf_processor_text_mm( prompt_text=dummy_inputs.prompt_text, - mm_items=mm_missing_data_items, + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) + + return mm_kwargs + + def _apply_hf_processor_main( + self, + prompt: Union[str, list[int]], + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + *, + enable_hf_prompt_replacement: bool, + ) -> tuple[list[int], MultiModalKwargs]: + """ + Apply the HF processor on the prompt text and multi-modal data. + + Note: + If :code:`enable_hf_prompt_replacement=False`, the prompt should + correspond to the multi-modal items. + """ + if isinstance(prompt, str): + if enable_hf_prompt_replacement: + return self._apply_hf_processor_text_mm( + prompt_text=prompt, + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) + + prompt_ids = self._apply_hf_processor_text_only(prompt) + else: + prompt_ids = self._apply_hf_processor_tokens_only(prompt) + + mm_missing_kwargs = self._apply_hf_processor_mm_only( + mm_items=mm_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, ) @@ -794,7 +836,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): def _cached_apply_hf_processor( self, - prompt_text: str, + prompt: Union[str, list[int]], mm_data_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], ) -> tuple[list[int], MultiModalKwargs]: @@ -807,10 +849,11 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): _, passthrough_data = self._get_hf_mm_data(mm_data_items) if cache is None or passthrough_data: - return self._apply_hf_processor( - prompt_text=prompt_text, + return self._apply_hf_processor_main( + prompt=prompt, mm_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, + enable_hf_prompt_replacement=True, ) mm_maybe_cached_kw_items = { @@ -832,10 +875,13 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): } mm_missing_data_items = self._to_mm_items(mm_missing_data) - prompt_ids, mm_missing_kwargs = self._apply_hf_processor_missing( - prompt_text=prompt_text, - mm_missing_data_items=mm_missing_data_items, + # NOTE: `prompt` does not correspond to `mm_missing_data_items`, + # so we need to pass `enable_hf_prompt_replacement=False` + prompt_ids, mm_missing_kwargs = self._apply_hf_processor_main( + prompt=prompt, + mm_items=mm_missing_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, + enable_hf_prompt_replacement=False, ) mm_missing_next_idx = { @@ -1018,7 +1064,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): def apply( self, - prompt_text: str, + prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], ) -> MultiModalInputsV2: @@ -1056,7 +1102,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_hashes = None prompt_ids, mm_kwargs = self._cached_apply_hf_processor( - prompt_text, + prompt, mm_items, 
hf_processor_mm_kwargs, ) @@ -1099,14 +1145,14 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): # If HF processor already inserts placeholder tokens, # there is no need for us to insert them - if all(len(repls) == 0 for repls in mm_missing_repls.items()): + if all(len(repls) == 0 for repls in mm_missing_repls.values()): tokenizer = self.info.get_tokenizer() - prompt_text = decode_tokens(tokenizer, prompt_ids) + prompt = decode_tokens(tokenizer, prompt_ids) mm_placeholders = hf_mm_placeholders else: ( prompt_ids, - prompt_text, + prompt, missing_mm_placeholders, ) = self._apply_prompt_replacements( prompt_ids, @@ -1125,7 +1171,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): return MultiModalInputsV2( type="multimodal", - prompt=prompt_text, + prompt=prompt, prompt_token_ids=prompt_ids, mm_kwargs=mm_kwargs, mm_hashes=mm_hashes, diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 6f7da1509990f..ec580cd6ecddd 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -137,7 +137,7 @@ class MultiModalProfiler(Generic[_I]): seq_len, mm_counts) return self.processor.apply( - prompt_text=processor_inputs.prompt_text, + prompt=processor_inputs.prompt_text, mm_data=processor_inputs.mm_data, hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, ) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 9eceefb08c93f..aaf7ff34ca573 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -100,8 +100,7 @@ class _MultiModalLimits(UserDict["ModelConfig", Dict[str, int]]): class MultiModalRegistry: """ - A registry that dispatches data processing to the - :class:`~vllm.multimodal.MultiModalPlugin` for each modality. + A registry that dispatches data processing according to the model. """ DEFAULT_PLUGINS = (ImagePlugin(), AudioPlugin(), VideoPlugin()) @@ -253,14 +252,14 @@ class MultiModalRegistry: model_config: "ModelConfig", ) -> Mapping[str, int]: """ - Get the maximum number of tokens per data item from each modality - for profiling the memory usage of a model. - - Note: - This is currently directly used only in V1. + Get the maximum number of tokens per data item from each modality based + on the underlying model configuration. """ if self.has_processor(model_config): - tokenizer = cached_get_tokenizer(model_config.tokenizer) + tokenizer = cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code, + ) processor = self.create_processor(model_config, tokenizer) seq_len = model_config.max_model_len return processor.info.get_mm_max_tokens_per_item(seq_len) @@ -270,6 +269,28 @@ class MultiModalRegistry: for key, plugin in self._plugins.items() } + def get_max_tokens_per_item_by_nonzero_modality( + self, + model_config: "ModelConfig", + ) -> Mapping[str, int]: + """ + Get the maximum number of tokens per data item from each modality based + on the underlying model configuration, excluding modalities that the + user explicitly disabled via `limit_mm_per_prompt`. + + Note: + This is currently used directly only in V1, for profiling the memory + usage of a model. 
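+ For example, with ``limit_mm_per_prompt={"image": 0}``, the ``image`` entry is dropped from the returned mapping even if the model itself supports image inputs.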
+ """ + limits_per_plugin = self._limits_by_model[model_config] + + return { + key: max_tokens_per_mm_item + for key, max_tokens_per_mm_item in + self.get_max_tokens_per_item_by_modality(model_config).items() + if limits_per_plugin[key] > 0 + } + def get_max_tokens_by_modality( self, model_config: "ModelConfig", @@ -367,8 +388,7 @@ class MultiModalRegistry: invoked to transform the data into a dictionary of model inputs. See also: - - :ref:`input-processing-pipeline` - - :ref:`enabling-multimodal-inputs` + :ref:`mm-processing` """ def wrapper(model_cls: N) -> N: @@ -398,6 +418,9 @@ class MultiModalRegistry: def has_processor(self, model_config: "ModelConfig") -> bool: """ Test whether a multi-modal processor is defined for a specific model. + + See also: + :ref:`mm-processing` """ return self._get_model_cls(model_config) in self._processor_factories @@ -408,6 +431,9 @@ class MultiModalRegistry: ) -> BaseMultiModalProcessor[BaseProcessingInfo]: """ Create a multi-modal processor for a specific model and tokenizer. + + See also: + :ref:`mm-processing` """ model_cls = self._get_model_cls(model_config) factories = self._processor_factories[model_cls] diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index f6ac14446c021..6ca95b41dbb07 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -179,7 +179,7 @@ def resolve_current_platform_cls_qualname() -> str: logger.info("Automatically detected platform %s.", activated_builtin_plugins[0]) else: - platform_cls_qualname = "vllm.interface.UnspecifiedPlatform" + platform_cls_qualname = "vllm.platforms.interface.UnspecifiedPlatform" logger.info( "No platform detected, vLLM is running on UnspecifiedPlatform") return platform_cls_qualname diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index eb3e269cac285..74948202cbe48 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -1,3 +1,4 @@ +import os from typing import TYPE_CHECKING, Optional import psutil @@ -105,7 +106,37 @@ class CpuPlatform(Platform): else: parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker" + assert vllm_config.device_config.device_type == "cpu" + + # + # Environment variables for CPU executor + # + + # Disable torch async compiling which won't work with daemonic processes + os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" + + # Intel OpenMP setting + ld_prealod_str = os.getenv("LD_PRELOAD", "") + if "libiomp5.so" in ld_prealod_str: + # The time(milliseconds) that a thread should wait after + # completing the execution of a parallel region, before sleeping. 
+ os.environ['KMP_BLOCKTIME'] = "1" + # Prevent the CPU from entering a low-performance state + os.environ['KMP_TPAUSE'] = "0" + # Provide fine-grained parallelism + os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist" + os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist" + os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist" + + # Hint IPEX to use its shared-memory-based AllReduce + os.environ["LOCAL_WORLD_SIZE"] = str( + vllm_config.parallel_config.tensor_parallel_size) + @classmethod def is_pin_memory_available(cls) -> bool: logger.warning("Pin memory is not supported on CPU.") return False + + @classmethod + def get_punica_wrapper(cls) -> str: + return "vllm.lora.punica_wrapper.punica_cpu.PunicaWrapperCPU" diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 23ceac83e49de..2587e3a11dde3 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -77,6 +77,8 @@ class CudaPlatformBase(Platform): device_name: str = "cuda" device_type: str = "cuda" dispatch_key: str = "CUDA" + ray_device_key: str = "GPU" + device_control_env_var: str = "CUDA_VISIBLE_DEVICES" @classmethod def get_device_capability(cls, @@ -141,6 +143,13 @@ class CudaPlatformBase(Platform): if cache_config and cache_config.block_size is None: cache_config.block_size = 16 + @classmethod + def get_current_memory_usage(cls, + device: Optional[torch.types.Device] = None + ) -> float: + torch.cuda.reset_peak_memory_stats(device) + return torch.cuda.max_memory_allocated(device) + @classmethod def get_attn_backend_cls(cls, selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1) -> str: @@ -216,6 +225,10 @@ class CudaPlatformBase(Platform): logger.info("Using Flash Attention backend.") return "vllm.attention.backends.flash_attn.FlashAttentionBackend" + @classmethod + def get_punica_wrapper(cls) -> str: + return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU" + # NVML utils # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 8152d881fa8d9..242c2c127979a 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -19,6 +19,8 @@ class HpuPlatform(Platform): device_name: str = "hpu" device_type: str = "hpu" dispatch_key: str = "HPU" + ray_device_key: str = "HPU" + device_control_env_var: str = "HABANA_VISIBLE_MODULES" @classmethod def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, @@ -61,3 +63,7 @@ class HpuPlatform(Platform): def is_pin_memory_available(cls): logger.warning("Pin memory is not supported on HPU.") return False + + @classmethod + def get_punica_wrapper(cls) -> str: + return "vllm.lora.punica_wrapper.punica_hpu.PunicaWrapperHPU" diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index f440358f65fbb..f2ecec3203fb7 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -33,6 +33,7 @@ class _Backend(enum.Enum): HPU_ATTN = enum.auto() PALLAS = enum.auto() IPEX = enum.auto() + BLOCK_SPARSE_FLASH_ATTN = enum.auto() NO_ATTENTION = enum.auto() @@ -45,6 +46,7 @@ class PlatformEnum(enum.Enum): CPU = enum.auto() NEURON = enum.auto() OPENVINO = enum.auto() + OOT = enum.auto() UNSPECIFIED = enum.auto() @@ -77,10 +79,30 @@ class Platform: _enum: PlatformEnum device_name: str device_type: str + # available dispatch keys: # check https://github.com/pytorch/pytorch/blob/313dac6c1ca0fa0cde32477509cce32089f8532a/torchgen/model.py#L134 # noqa # use "CPU" as a fallback for platforms not registered in PyTorch dispatch_key: str = "CPU" + + # 
available ray device keys: + # https://github.com/ray-project/ray/blob/10ba5adadcc49c60af2c358a33bb943fb491a171/python/ray/_private/ray_constants.py#L438 # noqa + # an empty string means the device does not support Ray + ray_device_key: str = "" + + # platform-agnostic way to specify the device control environment variable, + # e.g. CUDA_VISIBLE_DEVICES for CUDA. + # hint: search for "get_visible_accelerator_ids_env_var" in + # https://github.com/ray-project/ray/tree/master/python/ray/_private/accelerators # noqa + device_control_env_var: str = "VLLM_DEVICE_CONTROL_ENV_VAR_PLACEHOLDER" + + # The torch.compile backend for compiling simple and + # standalone functions. The default value is "inductor" to keep + # the same behavior as PyTorch. + # NOTE: for the forward part of the model, vLLM has another separate + # compilation strategy. + simple_compile_backend: str = "inductor" + supported_quantization: list[str] = [] def is_cuda(self) -> bool: @@ -107,6 +129,9 @@ class Platform: def is_openvino(self) -> bool: return self._enum == PlatformEnum.OPENVINO + def is_out_of_tree(self) -> bool: + return self._enum == PlatformEnum.OOT + def is_cuda_alike(self) -> bool: """Stateless version of :func:`torch.cuda.is_available`.""" return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM) @@ -252,6 +277,22 @@ class Platform: return False return True + @classmethod + def get_current_memory_usage(cls, + device: Optional[torch.types.Device] = None + ) -> float: + """ + Return the memory usage in bytes. + """ + raise NotImplementedError + + @classmethod + def get_punica_wrapper(cls) -> str: + """ + Return the punica wrapper for the current platform. + """ + raise NotImplementedError + class UnspecifiedPlatform(Platform): _enum = PlatformEnum.UNSPECIFIED diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index a4bbbd27c8a89..ead3dab05a6b1 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -16,7 +16,9 @@ class NeuronPlatform(Platform): _enum = PlatformEnum.NEURON device_name: str = "neuron" device_type: str = "neuron" + ray_device_key: str = "neuron_cores" supported_quantization: list[str] = ["neuron_quant"] + device_control_env_var: str = "NEURON_RT_VISIBLE_CORES" @classmethod def get_device_name(cls, device_id: int = 0) -> str: @@ -33,6 +35,14 @@ class NeuronPlatform(Platform): parallel_config.worker_cls = \ "vllm.worker.neuron_worker.NeuronWorker" + if parallel_config.world_size > 1: + parallel_config.distributed_executor_backend = "uni" + + assert (vllm_config.lora_config is + None), "LoRA is not supported for Neuron backend." + assert (not vllm_config.speculative_config + ), "Speculative decoding is not yet supported for the Neuron backend." + cache_config = vllm_config.cache_config if cache_config: # neuron needs block_size = max_model_len diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py index 9390eda535c8f..7d414165a8188 100644 --- a/vllm/platforms/openvino.py +++ b/vllm/platforms/openvino.py @@ -66,9 +66,8 @@ class OpenVinoPlatform(Platform): from vllm.utils import GiB_bytes parallel_config = vllm_config.parallel_config - assert ( - parallel_config.world_size == 1 - ), "OpenVINOExecutor only supports single CPU socket currently." + assert (parallel_config.world_size == 1 + ), "OpenVINO currently only supports a single CPU socket." 
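Taken together, the hooks added above (ray_device_key, device_control_env_var, get_punica_wrapper, get_current_memory_usage) give a platform a single surface to fill in. A minimal sketch of a hypothetical out-of-tree platform, assuming only the interface shown in this diff; every name below is illustrative and not part of this patch:

from typing import Optional

import torch

from vllm.platforms.interface import Platform, PlatformEnum


class MyAccelPlatform(Platform):
    _enum = PlatformEnum.OOT  # out-of-tree platforms use the new OOT member
    device_name: str = "myaccel"
    device_type: str = "myaccel"
    # Hypothetical Ray resource name and device-visibility env var.
    ray_device_key: str = "GPU"
    device_control_env_var: str = "MYACCEL_VISIBLE_DEVICES"

    @classmethod
    def get_punica_wrapper(cls) -> str:
        # Returned as a qualified name and resolved lazily, mirroring the
        # built-in platforms above (hypothetical module path).
        return "my_plugin.lora.punica_myaccel.PunicaWrapperMyAccel"

    @classmethod
    def get_current_memory_usage(
            cls, device: Optional[torch.types.Device] = None) -> float:
        # Sketch: report peak allocated bytes, as the CUDA/XPU overrides do.
        torch.cuda.reset_peak_memory_stats(device)
        return torch.cuda.max_memory_allocated(device)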
if parallel_config.worker_cls == "auto": parallel_config.worker_cls = \ @@ -141,3 +140,10 @@ class OpenVinoPlatform(Platform): raise RuntimeError( "Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE" f" {kv_cache_space}, expect a positive integer value.") + + assert vllm_config.device_config.device_type == "openvino" + assert vllm_config.lora_config is None, \ + "OpenVINO backend doesn't support LoRA" + assert cls.is_openvino_cpu() or \ + cls.is_openvino_gpu(), \ + "OpenVINO backend supports only CPU and GPU devices" diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 1c2f602efc856..5ef56406e1935 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -64,9 +64,13 @@ class RocmPlatform(Platform): device_name: str = "rocm" device_type: str = "cuda" dispatch_key: str = "CUDA" + ray_device_key: str = "GPU" + # rocm shares the same device control env var as CUDA + device_control_env_var: str = "CUDA_VISIBLE_DEVICES" + supported_quantization: list[str] = [ "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors", - "fbgemm_fp8", "gguf" + "fbgemm_fp8", "gguf", "quark" ] @classmethod @@ -149,3 +153,14 @@ class RocmPlatform(Platform): "Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ" " is not set, enabling VLLM_USE_TRITON_AWQ.") envs.VLLM_USE_TRITON_AWQ = True + + @classmethod + def get_punica_wrapper(cls) -> str: + return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU" + + @classmethod + def get_current_memory_usage(cls, + device: Optional[torch.types.Device] = None + ) -> float: + torch.cuda.reset_peak_memory_stats(device) + return torch.cuda.max_memory_allocated(device) diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 8a59b53ca4b15..05a3aa4305cfa 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -19,6 +19,9 @@ class TpuPlatform(Platform): device_name: str = "tpu" device_type: str = "tpu" dispatch_key: str = "XLA" + ray_device_key: str = "TPU" + device_control_env_var: str = "TPU_VISIBLE_CHIPS" + supported_quantization: list[str] = [ "tpu_int8", "compressed-tensors", "compressed_tensors" ] @@ -69,6 +72,16 @@ class TpuPlatform(Platform): assert vllm_config.speculative_config is None, \ "TPU does not support speculative decoding" + assert not vllm_config.scheduler_config.chunked_prefill_enabled, ( + "Chunked prefill is not yet supported for TPU backend") + assert not vllm_config.speculative_config, ( + "Speculative decoding is not yet supported for TPU backend") + if vllm_config.model_config.dtype in (torch.float16, torch.float32): + logger.warning( + "The TPU backend currently does not support %s. " + "Using bfloat16 instead.", vllm_config.model_config.dtype) + vllm_config.model_config.dtype = torch.bfloat16 + parallel_config = vllm_config.parallel_config scheduler_config = vllm_config.scheduler_config if parallel_config.worker_cls == "auto": diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 00692a5d23031..c34b5b58672e7 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -19,6 +19,10 @@ class XPUPlatform(Platform): device_name: str = "xpu" device_type: str = "xpu" dispatch_key: str = "XPU" + # Intel XPU's device key is "GPU" for Ray. 
+ # see https://github.com/ray-project/ray/blob/6a5eb5865eeb9ccf058a79b44f107e327e360673/python/ray/_private/accelerators/intel_gpu.py#L20 # noqa: E501 + ray_device_key: str = "GPU" + device_control_env_var: str = "ONEAPI_DEVICE_SELECTOR" @classmethod def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, @@ -74,19 +78,40 @@ raise NotImplementedError( "XPU does not support speculative decoding") + if vllm_config.device_config is not None: + assert vllm_config.device_config.device_type == "xpu" + + # check and update parallel config parallel_config = vllm_config.parallel_config - if (parallel_config.distributed_executor_backend is not None - and parallel_config.distributed_executor_backend != "ray"): + if parallel_config.worker_cls == "auto": + parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker" + + if parallel_config.distributed_executor_backend is None: + parallel_config.distributed_executor_backend = "ray" + elif parallel_config.distributed_executor_backend == "mp": + # FIXME(kunshang): + # spawn requires calling `if __name__ == '__main__':` in the + # entrypoint, and fork is not supported for starting new + # processes on XPU. + logger.error( + "Both start methods (spawn and fork) have issues " + "on XPU when using the mp backend, setting it to ray instead.") + parallel_config.distributed_executor_backend = "ray" + + elif parallel_config.distributed_executor_backend != "ray": logger.warning( "%s is not supported on XPU, fallback to ray distributed" " executor backend.", parallel_config.distributed_executor_backend) parallel_config.distributed_executor_backend = "ray" - if parallel_config.worker_cls == "auto": - parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker" @classmethod def is_pin_memory_available(cls): logger.warning("Pin memory is not supported on XPU.") return False + + @classmethod + def get_current_memory_usage(cls, + device: Optional[torch.types.Device] = None + ) -> float: + torch.xpu.reset_peak_memory_stats(device) + return torch.xpu.max_memory_allocated(device) diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index e5fa4f0e4a2f6..ff54174f634af 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -1,9 +1,6 @@ import logging -import os from typing import Callable, Dict -import torch - import vllm.envs as envs logger = logging.getLogger(__name__) @@ -50,34 +47,6 @@ def load_general_plugins(): processes. They should be designed in a way that they can be loaded multiple times without causing issues. """ - - # all processes created by vllm will load plugins, - # and here we can inject some common environment variables - # for all processes. 
- - # see https://github.com/vllm-project/vllm/issues/10480 - os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1' - # see https://github.com/vllm-project/vllm/issues/10619 - torch._inductor.config.compile_threads = 1 - - from vllm.platforms import current_platform - - if current_platform.is_xpu(): - # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158 - torch._dynamo.config.disable = True - if current_platform.is_hpu(): - # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1) - # does not support torch.compile - # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for - # torch.compile support - is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1' - if is_lazy: - torch._dynamo.config.disable = True - # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only) - # requires enabling lazy collectives - # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501 - os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true' - global plugins_loaded if plugins_loaded: return diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py index 33babfebdca1e..29c0edd0ee535 100644 --- a/vllm/profiler/layerwise_profile.py +++ b/vllm/profiler/layerwise_profile.py @@ -1,7 +1,7 @@ import copy from collections import defaultdict from dataclasses import asdict, dataclass, field -from typing import Callable, Dict, List, Optional, Tuple, TypeAlias, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, TypeAlias, Union import pandas as pd from torch._C._autograd import DeviceType, _KinetoEvent, _ProfilerResult @@ -128,7 +128,7 @@ class LayerwiseProfileResults(profile): ]) df.to_csv(filename) - def convert_stats_to_dict(self) -> str: + def convert_stats_to_dict(self) -> dict[str, Any]: return { "metadata": { "num_running_seqs": self.num_running_seqs @@ -227,7 +227,7 @@ class LayerwiseProfileResults(profile): [self._cumulative_cuda_time(root) for root in self._module_tree]) def _build_stats_trees(self): - summary_dict: Dict[str, self.StatsTreeNode] = {} + summary_dict: Dict[str, _StatsTreeNode] = {} total_cuda_time = self._total_cuda_time() def pct_cuda_time(cuda_time_us): diff --git a/vllm/spec_decode/medusa_worker.py b/vllm/spec_decode/medusa_worker.py index 1ab691a7ef047..21a58fc426275 100644 --- a/vllm/spec_decode/medusa_worker.py +++ b/vllm/spec_decode/medusa_worker.py @@ -9,17 +9,15 @@ from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer -from vllm.worker.worker_base import WorkerWrapperBase +from vllm.worker.worker_base import DelegateWorkerBase -class MedusaWorker(NonLLMProposerWorkerBase, WorkerWrapperBase): +class MedusaWorker(NonLLMProposerWorkerBase, DelegateWorkerBase): """Worker for Medusa. """ def __init__(self, *args, **kwargs): - super().__init__(kwargs.get("vllm_config")) - self.init_worker(*args, **kwargs) - + DelegateWorkerBase.__init__(self, *args, **kwargs) # Lazy initialization list. 
self._proposer: Top1Proposer diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 676ac5eb3609d..32197f8cc8f2f 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -16,10 +16,10 @@ from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeProposer) from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer -from vllm.worker.worker_base import WorkerWrapperBase +from vllm.worker.worker_base import DelegateWorkerBase -class MultiStepWorker(ProposerWorkerBase, WorkerWrapperBase): +class MultiStepWorker(ProposerWorkerBase, DelegateWorkerBase): """The MultiStepWorker is equivalent to a Worker except that it allows multiple forward passes in a single call, assuming the scheduler has allocated enough space to store the additional KV. This reduces overhead @@ -32,15 +32,12 @@ class MultiStepWorker(ProposerWorkerBase, WorkerWrapperBase): """ def __init__(self, *args, **kwargs): - super().__init__(kwargs.get("vllm_config")) - self.init_worker(*args, **kwargs) - + DelegateWorkerBase.__init__(self, *args, **kwargs) # Lazy initialization list. self._proposer: SpeculativeProposer def init_device(self) -> None: self.worker.init_device() - self._proposer = Top1Proposer( weakref.proxy(self), # type: ignore[arg-type] self.device, @@ -56,18 +53,6 @@ class MultiStepWorker(ProposerWorkerBase, WorkerWrapperBase): self.model_runner.model.sampler.should_modify_greedy_probs_inplace = ( True) - def determine_num_available_blocks(self) -> Tuple[int, int]: - return self.worker.determine_num_available_blocks() - - def get_cache_block_size_bytes(self) -> int: - return self.worker.get_cache_block_size_bytes() - - def initialize_cache(self, *args, **kwargs) -> None: - self.worker.initialize_cache(*args, **kwargs) - - def execute_model(self, *args, **kwargs) -> List[SamplerOutput]: - return self.worker.execute_model(*args, **kwargs) - @torch.inference_mode() def sampler_output( self, diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index bb6b99135580e..e906b1789cde8 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -2,6 +2,7 @@ import weakref from typing import List, Optional, Set, Tuple import torch +import torch.nn as nn from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest @@ -10,6 +11,10 @@ from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer +class _DummyModel(nn.Module): + pass + + class NGramWorker(NonLLMProposerWorkerBase): """NGramWorker provides a light drafter without need for model. 
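The forwarding methods deleted from MultiStepWorker above suggest that DelegateWorkerBase now supplies them generically. A minimal sketch of that delegation pattern, illustrative only; the actual class in vllm/worker/worker_base.py may differ:

class DelegatingWorkerSketch:
    """Sketch of delegate-style forwarding (not vLLM's actual class)."""

    def __init__(self, worker):
        self.worker = worker

    def determine_num_available_blocks(self):
        return self.worker.determine_num_available_blocks()

    def execute_model(self, *args, **kwargs):
        return self.worker.execute_model(*args, **kwargs)

    def __getattr__(self, name):
        # Anything not overridden falls through to the wrapped worker.
        return getattr(self.worker, name)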
@@ -36,7 +41,6 @@ class NGramWorker(NonLLMProposerWorkerBase): def init_device(self): self.device = torch.device(f"{self.device_type}:{self.local_rank}") - self.load_model = lambda *args, **kwargs: None # Current NGramWorker only supports Top1Proposer self._proposer = Top1Proposer( @@ -45,6 +49,12 @@ class NGramWorker(NonLLMProposerWorkerBase): vocab_size=self.vocab_size, ) + def load_model(self) -> None: + pass # Dummy + + def get_model(self) -> nn.Module: + return _DummyModel() + def sampler_output( self, execute_model_req: ExecuteModelRequest, diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 8896b7dbc6b8a..c6ff5e52f9388 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -1,6 +1,7 @@ from typing import List, Optional, Set, Tuple import torch +import torch.nn as nn from vllm.distributed.parallel_state import (get_tp_group, init_model_parallel_group, @@ -15,6 +16,10 @@ from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase logger = init_logger(__name__) +class _DummyModel(nn.Module): + pass + + class SmallerTpProposerWorker(ProposerWorkerBase): """Class which allows a speculative draft model to run with smaller tensor parallel degree than target model. @@ -139,6 +144,13 @@ class SmallerTpProposerWorker(ProposerWorkerBase): return self._worker.get_spec_proposals( execute_model_req, seq_ids_with_bonus_token_in_last_step) + def get_model(self) -> nn.Module: + if self._is_dummy: + return _DummyModel() + + with self._patch_tensor_parallel_group(): + return self._worker.get_model() + def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index e369da1a70c23..0d66ede3d907a 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -4,6 +4,7 @@ from functools import cached_property from typing import Any, Dict, List, Optional, Set, Tuple, Type import torch +import torch.nn as nn from vllm.config import ParallelConfig, SpeculativeConfig, VllmConfig from vllm.distributed.communication_op import broadcast_tensor_dict @@ -40,8 +41,8 @@ from vllm.spec_decode.util import (Timer, create_logprobs_output, get_all_num_logprobs, get_sampled_token_logprobs, nvtx_range, split_batch_by_proposal_len) -from vllm.worker.worker_base import (LoraNotSupportedWorkerBase, WorkerBase, - WorkerWrapperBase) +from vllm.utils import resolve_obj_by_qualname +from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase logger = init_logger(__name__) @@ -64,8 +65,9 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": target_worker_config = copy.deepcopy(vllm_config) target_worker_config.parallel_config.worker_cls =\ target_worker_config.parallel_config.sd_worker_cls - target_worker = WorkerWrapperBase(vllm_config=target_worker_config) - target_worker.init_worker(*args, **kwargs) + cls = resolve_obj_by_qualname( + target_worker_config.parallel_config.worker_cls) + target_worker = cls(*args, **kwargs) # Set the disable_logprobs variable in the TargetModelRunner instance # as per its value specified in the SpeculativeConfig. 
target_worker.model_runner.disable_logprobs =\ @@ -402,6 +404,9 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): self.proposer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) + def get_model(self) -> nn.Module: + return self.scorer_worker.get_model() + @torch.inference_mode() def execute_model( self, diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 58417980e7b47..f57dfded0a62f 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -22,8 +22,9 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger # yapf conflicts with isort for this block # yapf: disable -from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config, - DbrxConfig, EAGLEConfig, +from vllm.transformers_utils.configs import (AriaConfig, ChatGLMConfig, + Cohere2Config, DbrxConfig, + DeepseekVLV2Config, EAGLEConfig, ExaoneConfig, H2OVLChatConfig, InternVLChatConfig, JAISConfig, MedusaConfig, MllamaConfig, @@ -51,9 +52,11 @@ _CONFIG_REGISTRY_OVERRIDE_HF: Dict[str, Type[PretrainedConfig]] = { } _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { + "aria": AriaConfig, "chatglm": ChatGLMConfig, "cohere2": Cohere2Config, "dbrx": DbrxConfig, + "deepseek_vl_v2": DeepseekVLV2Config, "mpt": MPTConfig, "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index a41a35c88b3a1..807ef4fbfd0c0 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,6 +1,8 @@ +from vllm.transformers_utils.configs.aria import AriaConfig from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.cohere2 import Cohere2Config from vllm.transformers_utils.configs.dbrx import DbrxConfig +from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config from vllm.transformers_utils.configs.eagle import EAGLEConfig from vllm.transformers_utils.configs.exaone import ExaoneConfig # RWConfig is for the original tiiuae/falcon-40b(-instruct) and @@ -22,9 +24,11 @@ from vllm.transformers_utils.configs.telechat2 import Telechat2Config from vllm.transformers_utils.configs.ultravox import UltravoxConfig __all__ = [ + "AriaConfig", "ChatGLMConfig", "Cohere2Config", "DbrxConfig", + "DeepseekVLV2Config", "MPTConfig", "RWConfig", "H2OVLChatConfig", diff --git a/vllm/transformers_utils/configs/aria.py b/vllm/transformers_utils/configs/aria.py index d253da0d96a34..f4b531225b5d0 100644 --- a/vllm/transformers_utils/configs/aria.py +++ b/vllm/transformers_utils/configs/aria.py @@ -1,7 +1,32 @@ +# Copyright 2024 Rhymes AI. All rights reserved. +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +from typing import Mapping + +from transformers import PretrainedConfig from transformers.models.idefics2.configuration_idefics2 import ( Idefics2VisionConfig) from transformers.models.llama.configuration_llama import LlamaConfig +from vllm.logger import init_logger + +logger = init_logger(__name__) + class AriaVisionConfig(Idefics2VisionConfig): model_type = "aria_vision_model" @@ -45,3 +70,96 @@ class AriaMoELMConfig(LlamaConfig): self.moe_num_experts = moe_num_experts self.moe_topk = moe_topk self.moe_num_shared_experts = moe_num_shared_experts + + +class AriaConfig(PretrainedConfig): + """ + Configuration class for Aria model. + This class handles the configuration for both vision and text components of + the Aria model, + as well as additional parameters for image token handling and projector + mapping. + + Args: + vision_config (AriaVisionConfig or dict): Configuration for the vision + component. + text_config (AriaMoELMConfig or dict): Configuration for the text + component. + projector_patch_to_query_dict (dict): Mapping of patch sizes to query + dimensions. + ignore_index (int): Index to ignore in loss calculation. + image_token_index (int): Index used to represent image tokens. + **kwargs: Additional keyword arguments passed to the parent class. + Attributes: + model_type (str): Type of the model, set to "aria". + is_composition (bool): Whether the model is a composition of multiple + components. + ignore_index (int): Index to ignore in loss calculation. + image_token_index (int): Index used to represent image tokens. + projector_patch_to_query_dict (dict): Mapping of patch sizes to query + dimensions. + vision_config (AriaVisionConfig): Configuration for the vision + component. + text_config (AriaMoELMConfig): Configuration for the text component. 
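+ Note: ``projector_patch_to_query_dict`` may be provided with string keys and values (e.g. when parsed from a JSON config); ``__init__`` normalizes both to ``int``.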
+ """ + + model_type = "aria" + is_composition = False + + def __init__( + self, + vision_config: AriaVisionConfig = AriaVisionConfig(), # noqa: B008 + text_config: AriaMoELMConfig = AriaMoELMConfig(), # noqa: B008 + projector_patch_to_query_dict: Mapping[int, int] = { + 1225: 128, + 4900: 256, + }, + ignore_index=-100, + image_token_index=32000, + tie_word_embeddings=False, + **kwargs, + ): + super().__init__(**kwargs) + self.ignore_index = ignore_index + self.image_token_index = image_token_index + self.tie_word_embeddings = tie_word_embeddings + attn_implementation = kwargs.pop("attn_implementation", None) + + # Set the default attention implementation to flash_attention_2 if not + # specified + self._attn_implementation = ("flash_attention_2" + if attn_implementation is None else + attn_implementation) + + # Convert the keys and values of projector_patch_to_query_dict to + # integers + # This ensures consistency even if they were provided as strings + self.projector_patch_to_query_dict = { + int(k): int(v) + for k, v in projector_patch_to_query_dict.items() + } + + if isinstance(vision_config, dict) and "model_type" in vision_config: + vision_config = AriaVisionConfig(**vision_config) + if attn_implementation is None: + vision_attn_implementation = "flash_attention_2" + elif attn_implementation == "sdpa": + logger.warning("SDPA is not supported for vit, using " + "flash_attention_2 instead") + vision_attn_implementation = "flash_attention_2" + else: + vision_attn_implementation = attn_implementation + vision_config._attn_implementation = vision_attn_implementation + + self.vision_config = vision_config + + if isinstance(text_config, dict) and "model_type" in text_config: + text_attn_implementation = ("sdpa" if attn_implementation is None + else attn_implementation) + text_config = AriaMoELMConfig(**text_config) + text_config._attn_implementation = text_attn_implementation + + self.text_config = text_config + + # This is needed for the static kv cache + self.num_hidden_layers = self.text_config.num_hidden_layers diff --git a/vllm/transformers_utils/configs/deepseek_vl2.py b/vllm/transformers_utils/configs/deepseek_vl2.py new file mode 100644 index 0000000000000..681528c3c0116 --- /dev/null +++ b/vllm/transformers_utils/configs/deepseek_vl2.py @@ -0,0 +1,214 @@ +# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268 +from typing import Tuple + +from transformers.configuration_utils import PretrainedConfig + + +class VisionEncoderConfig(PretrainedConfig): + model_type: str = "vision" + + model_name: str = "vit_so400m_patch14_siglip_384.webli" + image_size: int = 384 + patch_size: int = 16 + width: int = 1024 + layers: int = 24 + heads: int = 16 + mlp_ratio: int = 4 + global_pool: str = "map" + ignore_head: bool = True + class_token: bool = False + num_classes: int = 0 + use_checkpoint: bool = False + weight_init: str = "skip" + deterministic: bool = False + num_recomputing_layers: int = 0 + + def __init__(self, + model_name: str = "vit_so400m_patch14_siglip_384.webli", + image_size: int = 384, + patch_size: int = 16, + width: int = 1024, + layers: int = 24, + heads: int = 16, + mlp_ratio: int = 4, + global_pool: str = "map", + ignore_head: bool = True, + class_token: bool = False, + num_classes: int = 0, + use_checkpoint: bool = False, + **kwargs): + self.model_name = model_name + self.image_size = image_size + self.patch_size = patch_size + self.width = width + self.layers = layers 
+ self.heads = heads + self.mlp_ratio = mlp_ratio + self.global_pool = global_pool + self.ignore_head = ignore_head + self.class_token = class_token + self.num_classes = num_classes + self.use_checkpoint = use_checkpoint + + super().__init__(**kwargs) + + +class MlpProjectorConfig(PretrainedConfig): + model_type = "mlp_projector" + projector_type: str = "downsample_mlp_gelu" + input_dim: int = 1152 + n_embed: int = 2048 + depth: int = 2 + mlp_ratio: int = 1 + downsample_ratio: int = 2 + token_pooling: bool = False + + def __init__(self, + projector_type: str = "downsample_mlp_gelu", + input_dim: int = 1152, + n_embed: int = 2048, + depth: int = 2, + mlp_ratio: int = 1, + downsample_ratio: int = 2, + **kwargs): + self.projector_type = projector_type + self.input_dim = input_dim + self.n_embed = n_embed + self.depth = depth + self.mlp_ratio = mlp_ratio + self.downsample_ratio = downsample_ratio + + super().__init__(**kwargs) + + +class DeepseekV2Config(PretrainedConfig): + + model_type = "deepseek_v2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=102400, + hidden_size=4096, + intermediate_size=11008, + moe_intermediate_size=1407, + num_hidden_layers=30, + num_attention_heads=32, + num_key_value_heads=32, + n_shared_experts=None, + n_routed_experts=None, + ep_size=1, + routed_scaling_factor=1.0, + kv_lora_rank=512, + q_lora_rank=1536, + qk_rope_head_dim=64, + v_head_dim=128, + qk_nope_head_dim=128, + topk_method='gready', + n_group=None, + topk_group=None, + num_experts_per_tok=None, + moe_layer_freq=1, + first_k_dense_replace=0, + norm_topk_prob=False, + scoring_func='softmax', + aux_loss_alpha=0.001, + seq_aux=True, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=100000, + eos_token_id=100001, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + use_mla=True, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.moe_intermediate_size = moe_intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.n_shared_experts = n_shared_experts + self.n_routed_experts = n_routed_experts + self.ep_size = ep_size + self.routed_scaling_factor = routed_scaling_factor + self.kv_lora_rank = kv_lora_rank + self.q_lora_rank = q_lora_rank + self.qk_rope_head_dim = qk_rope_head_dim + self.v_head_dim = v_head_dim + self.qk_nope_head_dim = qk_nope_head_dim + self.topk_method = topk_method + self.n_group = n_group + self.topk_group = topk_group + self.num_experts_per_tok = num_experts_per_tok + self.moe_layer_freq = moe_layer_freq + self.first_k_dense_replace = first_k_dense_replace + self.norm_topk_prob = norm_topk_prob + self.scoring_func = scoring_func + self.aux_loss_alpha = aux_loss_alpha + self.seq_aux = seq_aux + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = float(rms_norm_eps) + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.attention_bias = attention_bias + self.attention_dropout = 
attention_dropout + self.use_mla = use_mla + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +class DeepseekVLV2Config(PretrainedConfig): + model_type = "deepseek_vl_v2" + vision_config: VisionEncoderConfig + projector_config: MlpProjectorConfig + + tile_tag: str = "2D" + global_view_pos: str = "head" + candidate_resolutions: Tuple[Tuple[int, int]] = ((384, 384), ) + + def __init__(self, + tile_tag: str = "tile_tag", + global_view_pos: str = "head", + candidate_resolutions: Tuple[Tuple[int, + int]] = ((384, 384), ), + **kwargs): + super().__init__(**kwargs) + + vision_config = kwargs.get("vision_config", {}) + self.vision_config = VisionEncoderConfig(**vision_config) + + projector_config = kwargs.get("projector_config", {}) + self.projector_config = MlpProjectorConfig(**projector_config) + + language_config = kwargs.get("language_config", {}) + self.text_config = DeepseekV2Config(**language_config) + + self.tile_tag = tile_tag + self.global_view_pos = global_view_pos + self.candidate_resolutions = candidate_resolutions + self.vocab_size = self.text_config.vocab_size diff --git a/vllm/transformers_utils/processors/__init__.py b/vllm/transformers_utils/processors/__init__.py new file mode 100644 index 0000000000000..9c71b8cada32e --- /dev/null +++ b/vllm/transformers_utils/processors/__init__.py @@ -0,0 +1,4 @@ +from vllm.transformers_utils.processors.deepseek_vl2 import ( + DeepseekVLV2Processor) + +__all__ = ["DeepseekVLV2Processor"] diff --git a/vllm/transformers_utils/processors/deepseek_vl2.py b/vllm/transformers_utils/processors/deepseek_vl2.py new file mode 100644 index 0000000000000..27cdf6bc22d0e --- /dev/null +++ b/vllm/transformers_utils/processors/deepseek_vl2.py @@ -0,0 +1,361 @@ +# yapf: disable +# ruff: noqa: E501 +# coding=utf-8 +# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py +# Copyright (c) 2023-2024 DeepSeek. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
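As a quick aside before the processor below: DeepseekVLV2Config above builds its three sub-configs from plain dicts pulled out of kwargs. A small usage sketch, with illustrative field values:

from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config

# Each sub-config is built from the matching kwargs dict; omitted fields
# keep the defaults defined in the config classes.
cfg = DeepseekVLV2Config(
    vision_config={"image_size": 384, "patch_size": 16},
    projector_config={"input_dim": 1152, "n_embed": 2048},
    language_config={"hidden_size": 4096, "num_hidden_layers": 30},
)
assert cfg.vocab_size == cfg.text_config.vocab_size  # mirrored for convenience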
+
+import math
+from typing import List, Tuple
+
+import torch
+import torchvision.transforms as T
+from PIL import Image, ImageOps
+from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast
+from transformers.processing_utils import ProcessorMixin
+
+
+class ImageTransform:
+
+    def __init__(self,
+                 mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+                 std: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+                 normalize: bool = True):
+        self.mean = mean
+        self.std = std
+        self.normalize = normalize
+
+        transform_pipelines = [T.ToTensor()]
+
+        if normalize:
+            transform_pipelines.append(T.Normalize(mean, std))
+
+        self.transform = T.Compose(transform_pipelines)
+
+    def __call__(self, pil_img: Image.Image):
+        x = self.transform(pil_img)
+        return x
+
+
+class DeepseekVLV2Processor(ProcessorMixin):
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+    attributes = ["tokenizer"]
+
+    def __init__(
+        self,
+        tokenizer: LlamaTokenizerFast,
+        candidate_resolutions: Tuple[Tuple[int, int]],
+        patch_size: int,
+        downsample_ratio: int,
+        image_mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+        image_std: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+        normalize: bool = True,
+        image_token: str = "<image>",
+        pad_token: str = "<|▁pad▁|>",
+        add_special_token: bool = False,
+        sft_format: str = "deepseek",
+        mask_prompt: bool = True,
+        ignore_id: int = -100,
+        **kwargs,
+    ):
+
+        self.candidate_resolutions = candidate_resolutions
+        self.image_size = candidate_resolutions[0][0]
+        self.patch_size = patch_size
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.normalize = normalize
+        self.downsample_ratio = downsample_ratio
+
+        self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize)
+        self.tokenizer = tokenizer
+        self.tokenizer.padding_side = 'left'  # must set this; the padding side makes a difference in batch inference
+
+        # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
+        if tokenizer.pad_token is None:
+            self.tokenizer.add_special_tokens({'pad_token': pad_token})
+
+        # add image token
+        image_token_id = self.tokenizer.vocab.get(image_token)
+        if image_token_id is None:
+            special_tokens = [image_token]
+            special_tokens_dict = {"additional_special_tokens": special_tokens}
+            self.tokenizer.add_special_tokens(special_tokens_dict)
+        self.image_token_id = self.tokenizer.vocab.get(image_token)
+
+        # add five special tokens for grounding-related tasks
+        # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
+        special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>']
+        special_tokens_dict = {"additional_special_tokens": special_tokens}
+        self.tokenizer.add_special_tokens(special_tokens_dict)
+
+        # add special tokens for SFT data
+        special_tokens = ["<|User|>", "<|Assistant|>"]
+        special_tokens_dict = {"additional_special_tokens": special_tokens}
+        self.tokenizer.add_special_tokens(special_tokens_dict)
+
+        self.image_token = image_token
+        self.pad_token = pad_token
+        self.add_special_token = add_special_token
+        self.sft_format = sft_format
+        self.mask_prompt = mask_prompt
+        self.ignore_id = ignore_id
+
+        super().__init__(
+            tokenizer,
+            **kwargs,
+        )
+
+    def select_best_resolution(self, image_size):
+        # used for cropping
+        original_width, original_height = image_size
+        best_fit = None
+        max_effective_resolution = 0
+        min_wasted_resolution = float("inf")
+
+        for width, height in self.candidate_resolutions:
+            scale = min(width / original_width, height / original_height)
+            downscaled_width, downscaled_height = int(
+                original_width * scale), int(original_height * scale)
+            effective_resolution = min(downscaled_width * downscaled_height,
+                                       original_width * original_height)
+            wasted_resolution = (width * height) - effective_resolution
+
+            if effective_resolution > max_effective_resolution or (
+                    effective_resolution == max_effective_resolution
+                    and wasted_resolution < min_wasted_resolution):
+                max_effective_resolution = effective_resolution
+                min_wasted_resolution = wasted_resolution
+                best_fit = (width, height)
+
+        return best_fit
+
+    @property
+    def bos_id(self):
+        return self.tokenizer.bos_token_id
+
+    @property
+    def eos_id(self):
+        return self.tokenizer.eos_token_id
+
+    @property
+    def pad_id(self):
+        return self.tokenizer.pad_token_id
+
+    def encode(self, text: str, bos: bool = True, eos: bool = False):
+        t = self.tokenizer.encode(text, add_special_tokens=False)
+
+        if bos:
+            t = [self.bos_id] + t
+        if eos:
+            t = t + [self.eos_id]
+
+        return t
+
+    def decode(self, t: List[int], **kwargs) -> str:
+        return self.tokenizer.decode(t, **kwargs)
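A quick standalone illustration of the selection rule above, with a hypothetical 800x600 input and a two-entry candidate list (not part of the diff): the candidate that preserves the most effective pixels wins, and ties are broken by least wasted area.

    # Standalone sketch of the rule in select_best_resolution (hypothetical inputs).
    # (384, 384): scale = min(384/800, 384/600) = 0.48 -> 384x288,
    #             effective = 110592, wasted = 147456 - 110592 = 36864
    # (768, 384): scale = min(768/800, 384/600) = 0.64 -> 512x384,
    #             effective = 196608, wasted = 294912 - 196608 = 98304
    # (768, 384) wins on effective resolution despite wasting more area.
    def select_best_resolution(image_size, candidate_resolutions):
        original_width, original_height = image_size
        best_fit, best_effective, least_wasted = None, 0, float("inf")
        for width, height in candidate_resolutions:
            scale = min(width / original_width, height / original_height)
            down_w, down_h = int(original_width * scale), int(original_height * scale)
            effective = min(down_w * down_h, original_width * original_height)
            wasted = width * height - effective
            if effective > best_effective or (effective == best_effective
                                              and wasted < least_wasted):
                best_effective, least_wasted = effective, wasted
                best_fit = (width, height)
        return best_fit

    assert select_best_resolution((800, 600), [(384, 384), (768, 384)]) == (768, 384)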
+
+    def process_one(
+        self,
+        prompt: str,
+        images: List[Image.Image],
+        inference_mode: bool = True,
+        **kwargs,
+    ):
+        """
+
+        Args:
+            prompt (str): the formatted prompt;
+            images (List[ImageType]): the list of images;
+            inference_mode (bool): if True, then remove the last eos token;
+            **kwargs:
+
+        Returns:
+            outputs (BaseProcessorOutput): the output of the processor,
+                - input_ids (torch.LongTensor): [N + image tokens]
+                - target_ids (torch.LongTensor): [N + image tokens]
+                - pixel_values (torch.FloatTensor): [n_patches, 3, H, W]
+                - image_id (int): the id of the image token
+                - num_image_tokens (List[int]): the number of image tokens
+        """
+
+        assert (prompt is not None and images is not None
+                ), "prompt and images must be used at the same time."
+
+        sft_format = prompt
+        tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens = self.tokenize_with_images(
+            sft_format, images, bos=True, eos=True, cropping=len(images) <= 2)
+        masked_tokenized_str = []
+        for token_index in tokenized_str:
+            if token_index != self.image_token_id:
+                masked_tokenized_str.append(token_index)
+            else:
+                masked_tokenized_str.append(self.ignore_id)
+
+        assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \
+            (f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
+             f"images_seq_mask's length {len(images_seq_mask)}, are not equal")
+
+        input_ids = torch.LongTensor(tokenized_str)
+        target_ids = torch.LongTensor(masked_tokenized_str)
+        images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
+
+        # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
+        target_ids[(input_ids < 0) |
+                   (input_ids == self.image_token_id)] = self.ignore_id
+        input_ids[input_ids < 0] = self.pad_id
+
+        if inference_mode:
+            # remove the trailing eos token
+            assert input_ids[-1] == self.eos_id
+            input_ids = input_ids[:-1]
+            target_ids = target_ids[:-1]
+            images_seq_mask = images_seq_mask[:-1]
+
+        if len(images_list) == 0:
+            pixel_values = torch.zeros((1, 3, self.image_size, self.image_size))
+            images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
+        else:
+            pixel_values = torch.stack(images_list, dim=0)
+            images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
+
+        input_ids = input_ids.unsqueeze(0)
+
+        prepare = BatchFeature(
+            data=dict(
+                input_ids=input_ids,
+                pixel_values=pixel_values,
+                images_seq_mask=images_seq_mask,
+                images_spatial_crop=images_spatial_crop,
+                num_image_tokens=num_image_tokens,
+            ),
+            tensor_type="pt",
+        )
+        return prepare
+
+    def __call__(
+        self,
+        *,
+        prompt: str,
+        images: List[Image.Image],
+        inference_mode: bool = True,
+        **kwargs,
+    ):
+        """
+
+        Args:
+            prompt (str): the formatted prompt;
+            images (List[ImageType]): the list of images;
+            inference_mode (bool): if True, then remove the last eos token;
+            **kwargs:
+
+        Returns:
+            outputs (BaseProcessorOutput): the output of the processor,
+                - input_ids (torch.LongTensor): [N + image tokens]
+                - images (torch.FloatTensor): [n_images, 3, H, W]
+                - image_id (int): the id of the image token
+                - num_image_tokens (List[int]): the number of image tokens
+        """
+
+        prepare = self.process_one(
+            prompt=prompt,
+            images=images,
+            inference_mode=inference_mode,
+        )
+
+        return prepare
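For orientation, a minimal usage sketch of the processor defined above. The checkpoint name, patch size, and candidate resolutions are illustrative assumptions, not values taken from this diff:

    from PIL import Image
    from transformers import AutoTokenizer

    # Hypothetical wiring; real parameter values come from the checkpoint's
    # processor config, and the tokenizer must be Llama-compatible.
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-vl2-tiny")
    processor = DeepseekVLV2Processor(
        tokenizer,
        candidate_resolutions=((384, 384), (768, 384)),
        patch_size=16,
        downsample_ratio=2,
    )
    out = processor(
        prompt="<|User|>: <image>\nDescribe this image.\n\n<|Assistant|>:",
        images=[Image.open("example.jpg")],
        inference_mode=True,
    )
    print(out["input_ids"].shape)      # [1, N + image tokens]
    print(out["pixel_values"].shape)   # [1 + num_tiles, 3, 384, 384] (global view + crops)
    print(out["images_spatial_crop"])  # [[num_width_tiles, num_height_tiles]] per image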
+
+    def tokenize_with_images(
+        self,
+        conversation: str,
+        images: List[Image.Image],
+        bos: bool = True,
+        eos: bool = True,
+        cropping: bool = True,
+    ):
+        """Tokenize text with <image> tags."""
+        assert conversation.count(self.image_token) == len(images)
+        text_splits = conversation.split(self.image_token)
+        images_list, images_seq_mask, images_spatial_crop = [], [], []
+        num_image_tokens = []
+        tokenized_str = []
+        for text_sep, image in zip(text_splits, images):
+            """encode text_sep"""
+            tokenized_sep = self.encode(text_sep, bos=False, eos=False)
+            tokenized_str += tokenized_sep
+            images_seq_mask += [False] * len(tokenized_sep)
+
+            """select best resolution for anyres"""
+            if cropping:
+                best_width, best_height = self.select_best_resolution(image.size)
+            else:
+                best_width, best_height = self.image_size, self.image_size
+
+            """process the global view"""
+            global_view = ImageOps.pad(image, (self.image_size, self.image_size),
+                                       color=tuple(int(x * 255) for x in self.image_transform.mean))
+            images_list.append(self.image_transform(global_view))
+
+            """process the local views"""
+            local_view = ImageOps.pad(image, (best_width, best_height),
+                                      color=tuple(int(x * 255) for x in self.image_transform.mean))
+            for i in range(0, best_height, self.image_size):
+                for j in range(0, best_width, self.image_size):
+                    images_list.append(
+                        self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size))))
+
+            """record height / width crop num"""
+            num_width_tiles, num_height_tiles = best_width // self.image_size, best_height // self.image_size
+            images_spatial_crop.append([num_width_tiles, num_height_tiles])
+
+            """add image tokens"""
+            h = w = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio)
+            # global views tokens h * (w + 1), 1 is for line separator
+            tokenized_image = [self.image_token_id] * h * (w + 1)
+            # add a separator between global and local views
+            tokenized_image += [self.image_token_id]
+            # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
+            tokenized_image += [self.image_token_id] * (num_height_tiles * h) * (num_width_tiles * w + 1)
+
+            tokenized_str += tokenized_image
+            images_seq_mask += [True] * len(tokenized_image)
+            num_image_tokens.append(len(tokenized_image))
+
+        """process the last text split"""
+        tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
+        tokenized_str += tokenized_sep
+        images_seq_mask += [False] * len(tokenized_sep)
+
+        """add the bos and eos tokens"""
+        if bos:
+            tokenized_str = [self.bos_id] + tokenized_str
+            images_seq_mask = [False] + images_seq_mask
+        if eos:
+            tokenized_str = tokenized_str + [self.eos_id]
+            images_seq_mask = images_seq_mask + [False]
+
+        assert len(tokenized_str) == len(
+            images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to images_seq_mask's length {len(images_seq_mask)}"
+
+        return tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens
+
+
+AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)
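To make the token-count comments above concrete, a worked example. The parameter values are illustrative; real ones come from the checkpoint's processor config:

    import math

    # Illustrative values, not taken from any specific checkpoint config.
    image_size, patch_size, downsample_ratio = 384, 16, 2
    num_width_tiles, num_height_tiles = 2, 1  # e.g., a wide image cropped into 2x1 tiles

    h = w = math.ceil((image_size // patch_size) / downsample_ratio)  # 24 / 2 = 12

    global_tokens = h * (w + 1)  # 12 * 13 = 156 (the +1 per row is a line separator)
    separator = 1                # one token between global and local views
    local_tokens = (num_height_tiles * h) * (num_width_tiles * w + 1)  # 12 * 25 = 300

    print(global_tokens + separator + local_tokens)  # 457 image tokens for this image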
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 97920f42ec52f..294262484f2fb 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -1,3 +1,4 @@
+import contextlib
 import os
 import warnings
 from pathlib import Path
@@ -67,7 +68,15 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
         tokenizer.all_special_tokens_extended)
     tokenizer_all_special_tokens = set(tokenizer.all_special_tokens)
     tokenizer_len = len(tokenizer)
+    max_token_id = max(tokenizer.get_vocab().values())
+    # Some tokenizers (e.g., QwenTokenizer) have special tokens that
+    # are added and included in the implementation of the vocab_size
+    # property, but not in get_vocab(); if there is an implementation
+    # of vocab size, we should take the greater value.
+    if hasattr(tokenizer, "vocab_size"):
+        with contextlib.suppress(NotImplementedError):
+            max_token_id = max(max_token_id, tokenizer.vocab_size)

     class CachedTokenizer(tokenizer.__class__):  # type: ignore
diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py
index d400276796996..09569c564a58d 100644
--- a/vllm/transformers_utils/tokenizer_group/__init__.py
+++ b/vllm/transformers_utils/tokenizer_group/__init__.py
@@ -24,7 +24,8 @@ def init_tokenizer_from_configs(model_config: ModelConfig,
         max_input_length=None,
         tokenizer_mode=model_config.tokenizer_mode,
         trust_remote_code=model_config.trust_remote_code,
-        revision=model_config.tokenizer_revision)
+        revision=model_config.tokenizer_revision,
+        truncation_side=model_config.truncation_side)
     return get_tokenizer_group(parallel_config.tokenizer_pool_config,
                                **init_kwargs)
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 17d722e3d88fe..d801cf4e4c7b1 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -18,6 +18,7 @@ from mistral_common.tokens.tokenizers.tekken import (SpecialTokenPolicy,
                                                      Tekkenizer)

 from vllm.logger import init_logger
+from vllm.utils import is_list_of

 if TYPE_CHECKING:
     from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
@@ -27,7 +28,7 @@ logger = init_logger(__name__)

 @dataclass
 class Encoding:
-    input_ids: List[int]
+    input_ids: Union[List[int], List[List[int]]]


 def maybe_serialize_tool_calls(request: ChatCompletionRequest):
@@ -223,17 +224,25 @@ class MistralTokenizer:

     def __call__(
         self,
-        prompt: str,
+        prompt: Union[str, List[str], List[int]],
         add_special_tokens: bool = False,
         truncation: bool = False,
         max_length: Optional[int] = None,
     ):
-        # Mistral Tokenizers should not add special tokens
-        input_ids = self.encode(prompt)
-
-        if truncation:
-            input_ids = input_ids[:max_length]
-
+        input_ids: Union[List[int], List[List[int]]]
+        # For List[str], original prompt text
+        if is_list_of(prompt, str):
+            input_ids_: List[List[int]] = []
+            for p in prompt:
+                each_input_ids = self.encode_one(p, truncation, max_length)
+                input_ids_.append(each_input_ids)
+            input_ids = input_ids_
+        # For List[int], the output of apply_chat_template; already tokenized.
+        elif is_list_of(prompt, int):
+            input_ids = prompt
+        # For str, single prompt text
+        else:
+            input_ids = self.encode_one(prompt, truncation, max_length)
         return Encoding(input_ids=input_ids)

     def get_vocab(self) -> Dict[str, int]:
@@ -245,6 +254,19 @@ class MistralTokenizer:
         # Mistral tokenizers have no added vocabulary
         return {}

+    def encode_one(
+        self,
+        prompt: str,
+        truncation: bool = False,
+        max_length: Optional[int] = None,
+    ) -> List[int]:
+        # Mistral Tokenizers should not add special tokens
+        input_ids = self.encode(prompt)
+
+        if truncation:
+            input_ids = input_ids[:max_length]
+        return input_ids
+
     def encode(self, prompt: str) -> List[int]:
         # `encode` should only be used for prompt completion
         # it should never be used for chat_completion.
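The `__call__` above dispatches purely on the runtime type of `prompt`. A condensed, standalone sketch of that pattern, with a simplified stand-in for `vllm.utils.is_list_of`:

    from typing import List, Union

    def is_list_of(value, typ) -> bool:
        # Simplified stand-in for vllm.utils.is_list_of.
        return isinstance(value, list) and all(isinstance(v, typ) for v in value)

    def tokenize_like_mistral(prompt: Union[str, List[str], List[int]]):
        # Mirrors the dispatch in MistralTokenizer.__call__ above.
        if is_list_of(prompt, str):    # batch of prompt texts
            return [f"ids({p})" for p in prompt]
        elif is_list_of(prompt, int):  # already-tokenized chat-template output
            return prompt
        else:                          # single prompt text
            return f"ids({prompt})"

    print(tokenize_like_mistral(["hi", "yo"]))  # ['ids(hi)', 'ids(yo)']
    print(tokenize_like_mistral([1, 2, 3]))     # [1, 2, 3]
    print(tokenize_like_mistral("hi"))          # ids(hi)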
diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index a9deee881f41a..841df3994fba2 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -27,6 +27,17 @@ _USAGE_STATS_SERVER = envs.VLLM_USAGE_STATS_SERVER _GLOBAL_RUNTIME_DATA: Dict[str, Union[str, int, bool]] = {} +_USAGE_ENV_VARS_TO_COLLECT = [ + "VLLM_USE_MODELSCOPE", + "VLLM_USE_TRITON_FLASH_ATTN", + "VLLM_ATTENTION_BACKEND", + "VLLM_USE_FLASHINFER_SAMPLER", + "VLLM_PP_LAYER_PARTITION", + "VLLM_USE_TRITON_AWQ", + "VLLM_USE_V1", + "VLLM_ENABLE_V1_MULTIPROCESSING", +] + def set_runtime_usage_data(key: str, value: Union[str, int, bool]) -> None: """Set global usage data that will be sent with every usage heartbeat.""" @@ -122,6 +133,7 @@ class UsageMessage: self.gpu_count: Optional[int] = None self.gpu_type: Optional[str] = None self.gpu_memory_per_device: Optional[int] = None + self.env_var_json: Optional[str] = None # vLLM Information self.model_architecture: Optional[str] = None @@ -176,6 +188,12 @@ class UsageMessage: self.vllm_version = VLLM_VERSION self.model_architecture = model_architecture + # Environment variables + self.env_var_json = json.dumps({ + env_var: getattr(envs, env_var) + for env_var in _USAGE_ENV_VARS_TO_COLLECT + }) + # Metadata self.log_time = _get_current_timestamp_ns() self.source = envs.VLLM_USAGE_SOURCE diff --git a/vllm/utils.py b/vllm/utils.py index 487088591ebc2..17bffd2846b46 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -36,6 +36,7 @@ from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, overload) from uuid import uuid4 +import cloudpickle import numpy as np import numpy.typing as npt import psutil @@ -446,7 +447,7 @@ def get_ip() -> str: logger.warning( "The environment variable HOST_IP is deprecated and ignored, as" " it is often used by Docker and other software to" - "interact with the container's network stack. Please" + "interact with the container's network stack. Please " "use VLLM_HOST_IP instead to set the IP address for vLLM processes" " to communicate with each other.") if host_ip: @@ -710,13 +711,7 @@ class DeviceMemoryProfiler: def current_memory_usage(self) -> float: # Return the memory usage in bytes. from vllm.platforms import current_platform - if current_platform.is_cuda_alike(): - torch.cuda.reset_peak_memory_stats(self.device) - mem = torch.cuda.max_memory_allocated(self.device) - elif current_platform.is_xpu(): - torch.xpu.reset_peak_memory_stats(self.device) # type: ignore - mem = torch.xpu.max_memory_allocated(self.device) # type: ignore - return mem + return current_platform.get_current_memory_usage(self.device) def __enter__(self): self.initial_memory = self.current_memory_usage() @@ -1928,36 +1923,57 @@ def kill_process_tree(pid: int): @dataclass class MemorySnapshot: """Memory snapshot.""" - torch_peak_in_bytes: int = 0 - torch_memory_in_bytes: int = 0 + torch_peak: int = 0 + cuda_memory: int = 0 + torch_memory: int = 0 + non_torch_memory: int = 0 timestamp: float = 0.0 + auto_measure: bool = True + + def __post_init__(self): + if self.auto_measure: + self.measure() def measure(self): - self.torch_peak_in_bytes = torch.cuda.max_memory_reserved() + # we measure the torch peak memory usage via allocated_bytes, + # rather than `torch.cuda.memory_reserved()` . + # After `torch.cuda.reset_peak_memory_stats()`, + # `torch.cuda.memory_reserved()` will keep growing, and only shrink + # when we call `torch.cuda.empty_cache()` or OOM happens. 
+ self.torch_peak = torch.cuda.memory_stats().get( + "allocated_bytes.all.peak", 0) + + self.cuda_memory = torch.cuda.mem_get_info( + )[1] - torch.cuda.mem_get_info()[0] + # torch.cuda.memory_reserved() is how many bytes # PyTorch gets from cuda (by calling cudaMalloc, etc.) - self.torch_memory_in_bytes = torch.cuda.memory_reserved() + # this is used to measure the non-torch memory usage + self.torch_memory = torch.cuda.memory_reserved() + + self.non_torch_memory = self.cuda_memory - self.torch_memory self.timestamp = time.time() def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot": - """support a - b""" return MemorySnapshot( - torch_peak_in_bytes=self.torch_peak_in_bytes - - other.torch_peak_in_bytes, - torch_memory_in_bytes=self.torch_memory_in_bytes - - other.torch_memory_in_bytes, - timestamp=self.timestamp - other.timestamp) + torch_peak=self.torch_peak - other.torch_peak, + cuda_memory=self.cuda_memory - other.cuda_memory, + torch_memory=self.torch_memory - other.torch_memory, + non_torch_memory=self.non_torch_memory - other.non_torch_memory, + timestamp=self.timestamp - other.timestamp, + auto_measure=False, + ) @dataclass class MemoryProfilingResult: - """Memory profiling result. - """ # noqa - baseline_memory_in_bytes: int = 0 - non_kv_cache_memory_in_bytes: int = 0 - torch_peak_increase_in_bytes: int = 0 - non_torch_increase_in_bytes: int = 0 - weights_memory_in_bytes: float = 0 + """Memory profiling result. All numbers are in bytes. + """ + non_kv_cache_memory: int = 0 + torch_peak_increase: int = 0 + non_torch_increase: int = 0 + weights_memory: float = 0 + before_create: MemorySnapshot = field(default_factory=MemorySnapshot) before_profile: MemorySnapshot = field(default_factory=MemorySnapshot) after_profile: MemorySnapshot = field(default_factory=MemorySnapshot) profile_time: float = 0.0 @@ -1965,18 +1981,14 @@ class MemoryProfilingResult: @contextlib.contextmanager def memory_profiling( - baseline_memory_in_bytes: int, weights_memory_in_bytes: int -) -> Generator[MemoryProfilingResult, None, None]: + baseline_snapshot: MemorySnapshot, + weights_memory: int) -> Generator[MemoryProfilingResult, None, None]: """Memory profiling context manager. - baseline_memory_in_bytes: memory used by all the components other than - the current vLLM instance. It contains: memory used by other processes, memory - used by another vLLM instance in the same process, etc. It is usually measured - before the current vLLM instance initialize the device. And we assume it is - constant during the profiling of the current vLLM instance. - weights_memory_in_bytes: memory used by PyTorch when loading the model weights. + baseline_snapshot: the memory snapshot before the current vLLM instance. + weights_memory: memory used by PyTorch when loading the model weights. Note that, before loading the model weights, we also initialize the device and distributed environment, which may consume some memory. This part is not - included in the weights_memory_in_bytes because PyTorch does not control it. + included in the weights_memory because PyTorch does not control it. The memory in one GPU can be classified into 3 categories: 1. memory used by anything other than the current vLLM instance. @@ -2011,20 +2023,21 @@ def memory_profiling( b. 2 GiB reserved for the peak activation tensors (category 2) c. 1 GiB used by non-torch components (category 3) - The memory used for loading weights (a.) is directly given from the argument `weights_memory_in_bytes`. + The memory used for loading weights (a.) 
is directly given from the argument `weights_memory`.

-    The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` after profiling gives (b.).
+    The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` during profiling gives (b.).

-    (c.) is tricky. We measure the total memory used in this GPU (`torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]`),
-    subtract the baseline memory, the memory used by the model weights, and diff of `torch.cuda.memory_reserved()`.
+    The increase of `non_torch_memory` from creating the current vLLM instance until after profiling gives (c.).
     """ # noqa
+    gc.collect()
+    torch.cuda.empty_cache()
     torch.cuda.reset_peak_memory_stats()

     result = MemoryProfilingResult()

-    result.baseline_memory_in_bytes = baseline_memory_in_bytes
+    result.before_create = baseline_snapshot
     # the part of memory used for holding the model weights
-    result.weights_memory_in_bytes = weights_memory_in_bytes
+    result.weights_memory = weights_memory

     result.before_profile.measure()
@@ -2035,13 +2048,12 @@ def memory_profiling(
     result.after_profile.measure()

-    diff = result.after_profile - result.before_profile
-    result.torch_peak_increase_in_bytes = diff.torch_peak_in_bytes
-    current_cuda_memory_bytes = torch.cuda.mem_get_info(
-    )[1] - torch.cuda.mem_get_info()[0]
-    result.non_torch_increase_in_bytes = current_cuda_memory_bytes - baseline_memory_in_bytes - weights_memory_in_bytes - diff.torch_memory_in_bytes # noqa
-    result.profile_time = diff.timestamp
-    result.non_kv_cache_memory_in_bytes = result.non_torch_increase_in_bytes + result.torch_peak_increase_in_bytes + result.weights_memory_in_bytes # noqa
+    diff_profile = result.after_profile - result.before_profile
+    diff_from_create = result.after_profile - result.before_create
+    result.torch_peak_increase = diff_profile.torch_peak
+    result.non_torch_increase = diff_from_create.non_torch_memory
+    result.profile_time = diff_profile.timestamp
+    result.non_kv_cache_memory = result.non_torch_increase + result.torch_peak_increase + result.weights_memory # noqa


# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501
@@ -2138,3 +2150,59 @@ def get_mp_context():
     _check_multiproc_method()
     mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
     return multiprocessing.get_context(mp_method)
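Putting the new bookkeeping together, a worked example. The 2 GiB and 1 GiB figures echo the docstring's scenario above; the weights size and usable-memory figures are assumptions for illustration:

    GiB = 1024**3

    # Illustrative numbers (the 8 GiB weights and 40 GiB usable figures are
    # assumptions, not values from the diff):
    weights_memory = 8 * GiB        # given to memory_profiling() by the caller
    torch_peak_increase = 2 * GiB   # peak activation tensors during the profile run
    non_torch_increase = 1 * GiB    # NCCL buffers, CUDA context, etc., since creation

    non_kv_cache_memory = non_torch_increase + torch_peak_increase + weights_memory
    print(non_kv_cache_memory / GiB)            # 11.0 -> memory the KV cache cannot use

    # With 40 GiB usable (total * gpu_memory_utilization), the KV cache would get:
    print((40 * GiB - non_kv_cache_memory) / GiB)  # 29.0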
+
+
+def bind_kv_cache(
+    ctx: Dict[str, Any],
+    kv_cache: List[List[torch.Tensor]],  # [virtual_engine][layer_index]
+) -> None:
+    # Bind the kv_cache tensor to Attention modules, similar to
+    # ctx[layer_name].kv_cache[ve]=kv_cache[ve][extract_layer_index(layer_name)]
+    # Special things handled here:
+    # 1. Some models have non-attention layers, e.g., Jamba
+    # 2. Pipeline parallelism, each rank only has a subset of layers
+    # 3. Encoder attention has no kv cache
+    # 4. Encoder-decoder models, where the encoder-decoder attention and
+    #    decoder-only attention of the same layer (e.g., bart's
+    #    decoder.layers.1.self_attn and decoder.layers.1.encoder_attn)
+    #    are mapped to the same kv cache tensor
+    from vllm.attention import AttentionType
+    from vllm.model_executor.models.utils import extract_layer_index
+    layer_need_kv_cache = [
+        layer_name for layer_name in ctx
+        if ctx[layer_name].attn_type in (AttentionType.DECODER,
+                                         AttentionType.ENCODER_DECODER)
+    ]
+    layer_index_sorted = sorted(
+        set(
+            extract_layer_index(layer_name)
+            for layer_name in layer_need_kv_cache))
+    for layer_name in layer_need_kv_cache:
+        kv_cache_idx = layer_index_sorted.index(
+            extract_layer_index(layer_name))
+        forward_ctx = ctx[layer_name]
+        assert len(forward_ctx.kv_cache) == len(kv_cache)
+        for ve, ve_kv_cache in enumerate(kv_cache):
+            forward_ctx.kv_cache[ve] = ve_kv_cache[kv_cache_idx]
+
+
+def run_method(obj: Any, method: Union[str, bytes, Callable], args: Tuple[Any],
+               kwargs: Dict[str, Any]) -> Any:
+    """
+    Run a method of an object with the given arguments and keyword arguments.
+    If the method is a string, it is resolved to a bound method via getattr.
+    If the method is serialized bytes, it is deserialized using cloudpickle.
+    If the method is a callable, it is called directly.
+    """
+    if isinstance(method, bytes):
+        func = partial(cloudpickle.loads(method), obj)
+    elif isinstance(method, str):
+        try:
+            func = getattr(obj, method)
+        except AttributeError:
+            raise NotImplementedError(f"Method {method!r} is not"
+                                      " implemented.") from None
+    else:
+        func = partial(method, obj)  # type: ignore
+    return func(*args, **kwargs)
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index b02bc9ffde538..7b0786261a6a6 100644
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -15,6 +15,8 @@ from vllm.vllm_flash_attn import flash_attn_varlen_func

 class FlashAttentionBackend(AttentionBackend):

+    accept_output_buffer: bool = True
+
     @staticmethod
     def get_supported_head_sizes() -> List[int]:
         return [32, 64, 96, 128, 160, 192, 224, 256]
diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py
index 845bd5ea05e3c..0cd8c806a3e47 100644
--- a/vllm/v1/core/encoder_cache_manager.py
+++ b/vllm/v1/core/encoder_cache_manager.py
@@ -1,7 +1,14 @@
-from typing import Dict, List, Set, Tuple
+from typing import TYPE_CHECKING, Dict, List, Set, Tuple

+from vllm.logger import init_logger
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.v1.request import Request

+if TYPE_CHECKING:
+    from vllm.config import ModelConfig, SchedulerConfig
+
+logger = init_logger(__name__)
+

 class EncoderCacheManager:
@@ -46,3 +53,72 @@ class EncoderCacheManager:
         freed = self.freed
         self.freed = []
         return freed
+
+
+def compute_encoder_budget(
+    model_config: "ModelConfig",
+    scheduler_config: "SchedulerConfig",
+) -> Tuple[int, int]:
+    """Compute the encoder cache budget based on the model and scheduler
+    configurations.
+
+    Args:
+        model_config: Model configuration.
+        scheduler_config: Scheduler configuration.
+
+    Returns:
+        - Compute budget for encoder execution, in unit of number of tokens
+          in the input sequence.
+        - Space budget for encoder cache size, in unit of number of tokens
+          in the input sequence.
+    """
+
+    if not model_config.is_multimodal_model:
+        return 0, 0
+
+    # TODO: handle encoder-decoder models once we support them.
+ ( + encoder_compute_budget, + encoder_cache_size, + ) = _compute_encoder_budget_multimodal(model_config, scheduler_config) + + return encoder_compute_budget, encoder_cache_size + + +def _compute_encoder_budget_multimodal( + model_config: "ModelConfig", + scheduler_config: "SchedulerConfig", +) -> Tuple[int, int]: + """Compute the encoder cache budget based on the model and scheduler + configurations for a multimodal model. + + Args: + model_config: Model configuration. + scheduler_config: Scheduler configuration. + + Returns: + - Compute budget for encoder execution, in unit of number of tokens + in the input sequence. + - Space budget for encoder cache size, in unit of number of tokens + in the input sequence. + """ + + max_tokens_by_modality_dict = MULTIMODAL_REGISTRY.get_max_tokens_per_item_by_nonzero_modality( # noqa: E501 + model_config) + + if not max_tokens_by_modality_dict: + logger.warning( + "All non-text modalities supported by the model have been " + "explicitly disabled via limit_mm_per_prompt. Encoder cache will " + "not be initialized.") + return 0, 0 + + _, max_tokens_per_mm_item = max(max_tokens_by_modality_dict.items(), + key=lambda item: item[1]) + + encoder_compute_budget = max(scheduler_config.max_num_encoder_input_tokens, + max_tokens_per_mm_item) + encoder_cache_size = max(scheduler_config.encoder_cache_size, + max_tokens_per_mm_item) + + return encoder_compute_budget, encoder_cache_size diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 1cbff1e2d767e..bac77443c8560 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -1,5 +1,5 @@ from collections import defaultdict -from typing import Dict, Iterable, List, Optional +from typing import Dict, Iterable, List, Optional, Tuple from vllm.logger import init_logger from vllm.utils import cdiv @@ -69,7 +69,8 @@ class KVCacheManager: # is finished. self.req_to_blocks: Dict[str, List[KVCacheBlock]] = {} - def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]: + def get_computed_blocks( + self, request: Request) -> Tuple[List[KVCacheBlock], int]: """Get the computed (cached) blocks for the request. Note that the computed blocks must be full. @@ -77,11 +78,13 @@ class KVCacheManager: request: The request to get the computed blocks. Returns: - A list of blocks that are computed for the request. + A tuple containing: + - A list of blocks that are computed for the request. + - The number of computed tokens. """ if not self.enable_caching: # Prefix caching is disabled. - return [] + return [], 0 computed_blocks = [] @@ -101,7 +104,11 @@ class KVCacheManager: else: break - return computed_blocks + # NOTE(woosuk): Since incomplete blocks are not eligible for + # sharing, `num_computed_tokens` is always a multiple of + # `block_size`. 
+        num_computed_tokens = len(computed_blocks) * self.block_size
+        return computed_blocks, num_computed_tokens

     def append_slots(
         self,
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 22a5d2fb08a48..bab99fe37caee 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -3,7 +3,10 @@ from collections.abc import Sequence
 from dataclasses import dataclass
 from typing import Any, List, NamedTuple, Optional, Tuple

+from vllm.config import VllmConfig
 from vllm.logger import init_logger
+from vllm.v1.kv_cache_interface import (KVCacheConfig, KVCacheSpec,
+                                        KVCacheTensor)
 from vllm.v1.request import Request

 logger = init_logger(__name__)
@@ -305,3 +308,124 @@ def hash_request_tokens(block_size: int,
         ret.append(block_hash)
         parent_block_hash_value = block_hash.hash_value
     return ret
+
+
+def check_enough_kv_cache_memory(vllm_config: VllmConfig,
+                                 kv_cache_spec: KVCacheSpec,
+                                 available_memory: int):
+    """
+    Checks whether `available_memory` is enough for the KV cache to hold at
+    least one request with the model's max_model_len.
+
+    Args:
+        vllm_config: The global VllmConfig
+        kv_cache_spec: The kv cache spec of the model
+        available_memory: Memory available for KV cache in bytes.
+
+    Raises:
+        ValueError: If there is not enough memory available for the KV cache.
+    """
+
+    if available_memory <= 0:
+        raise ValueError("No available memory for the cache blocks. "
+                         "Try increasing `gpu_memory_utilization` when "
+                         "initializing the engine.")
+
+    max_model_len = vllm_config.model_config.max_model_len
+    needed_memory = 0
+    for layer_spec in kv_cache_spec.values():
+        needed_memory += layer_spec.bytes_for_tokens(max_model_len)
+
+    if needed_memory > available_memory:
+        raise ValueError(
+            f"To serve at least one request with the model's max seq len "
+            f"({max_model_len}), {needed_memory/1024/1024/1024:.2f} GB KV "
+            f"cache is needed, which is larger than the available KV cache "
+            f"memory ({available_memory/1024/1024/1024:.2f} GB). Try "
+            f"increasing `gpu_memory_utilization` or decreasing "
+            f"`max_model_len` when initializing the engine.")
+
+
+def is_kv_cache_type_uniform(kv_cache_spec: KVCacheSpec) -> bool:
+    """
+    Whether all layers in the given KVCacheSpec have the same type of KV cache.
+
+    Args:
+        kv_cache_spec: The KVCacheSpec of the model
+
+    Returns:
+        True if all layers have the same type, False otherwise.
+    """
+
+    layer_keys = set(layer.type_id for layer in kv_cache_spec.values())
+    return len(layer_keys) == 1
+
+
+def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
+                                      kv_cache_spec: KVCacheSpec,
+                                      available_memory: int) -> KVCacheConfig:
+    """
+    Generates the KV cache configuration for a model with one type of KV cache.
+    Divide the available memory equally among all layers.
+
+    Args:
+        vllm_config: The global VllmConfig
+        kv_cache_spec: The kv cache spec of the model
+        available_memory: Memory available for KV cache in bytes.
+ + Returns: + The generated KVCacheConfig + """ + + page_sizes = {layer.page_size_bytes for layer in kv_cache_spec.values()} + assert len(page_sizes) == 1 + page_size = page_sizes.pop() + + num_blocks = int(available_memory // page_size // len(kv_cache_spec)) + num_blocks = max(num_blocks, 0) + + if vllm_config.cache_config.num_gpu_blocks_override is not None: + num_gpu_blocks_override = \ + vllm_config.cache_config.num_gpu_blocks_override + logger.info( + "Overriding num_gpu_blocks=%d with " + "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override) + num_blocks = num_gpu_blocks_override + + logger.info("# GPU blocks: %d", num_blocks) + + per_layer_size = page_size * num_blocks + + kv_cache_config = KVCacheConfig( + num_blocks=num_blocks, + tensors={ + layer_name: KVCacheTensor(size=per_layer_size) + for layer_name in kv_cache_spec + }, + groups=[[layer_name for layer_name in kv_cache_spec]], + kv_cache_spec=kv_cache_spec) + return kv_cache_config + + +def get_kv_cache_config(vllm_config: VllmConfig, kv_cache_spec: KVCacheSpec, + available_memory: int) -> KVCacheConfig: + """ + Generates the KV cache configuration for a model + TODO: support hybrid models with more than one type of KV cache. + + Args: + vllm_config: The global VllmConfig + kv_cache_spec: The kv cache spec of the model + available_memory: Memory available for KV cache in bytes. + + Returns: + The generated KVCacheConfig + """ + check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory) + if is_kv_cache_type_uniform(kv_cache_spec): + # KV cache of all layers are the same, which is true for most models. + # Allocate the same amount of memory for each layer. + return _get_kv_cache_config_uniform_type(vllm_config, kv_cache_spec, + available_memory) + else: + raise NotImplementedError diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index b26716f5c02e6..64df21d59fef4 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -3,12 +3,14 @@ from dataclasses import dataclass from typing import (TYPE_CHECKING, Deque, Dict, Iterable, List, Optional, Set, Tuple, Union) -from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig +from vllm.config import CacheConfig, LoRAConfig, ModelConfig, SchedulerConfig from vllm.logger import init_logger from vllm.sampling_params import SamplingParams -from vllm.v1.core.encoder_cache_manager import EncoderCacheManager +from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager, + compute_encoder_budget) from vllm.v1.core.kv_cache_manager import KVCacheManager -from vllm.v1.engine import EngineCoreOutput +from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs +from vllm.v1.metrics.stats import SchedulerStats from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus @@ -24,6 +26,7 @@ class Scheduler: def __init__( self, scheduler_config: SchedulerConfig, + model_config: ModelConfig, cache_config: CacheConfig, lora_config: Optional[LoRAConfig], ) -> None: @@ -68,16 +71,24 @@ class Scheduler: self.running_reqs_data: Dict[str, RunningRequestData] = {} # Encoder-related. + # Calculate encoder cache size if applicable + # NOTE: For now we use the same budget for both compute and space. + # This can be changed when we make encoder cache for embedding caching + # across requests. 
+ encoder_compute_budget, encoder_cache_size = compute_encoder_budget( + model_config=model_config, + scheduler_config=scheduler_config, + ) + # NOTE(woosuk): Here, "encoder" includes the vision encoder (and # projector if needed). Currently, we assume that the encoder also # has the Transformer architecture (e.g., ViT). - self.max_num_encoder_input_tokens = self.scheduler_config.max_num_encoder_input_tokens #noqa: E501 - # NOTE(woosuk): For the models without encoder (e.g., text-only models), - # the encoder cache will not be initialized and used, regardless of - # the cache size. This is because the memory space for the encoder cache - # is preallocated in the profiling run. + self.max_num_encoder_input_tokens = encoder_compute_budget + # NOTE: For the models without encoder (e.g., text-only models), + # the encoder cache will not be initialized because cache size is 0 + # for these models. self.encoder_cache_manager = EncoderCacheManager( - cache_size=self.scheduler_config.encoder_cache_size) + cache_size=encoder_cache_size) def schedule(self) -> "SchedulerOutput": # NOTE(woosuk) on the scheduling algorithm: @@ -183,12 +194,8 @@ class Scheduler: request = self.waiting[0] # Get already-cached tokens. - computed_blocks = self.kv_cache_manager.get_computed_blocks( - request) - # NOTE(woosuk): Since incomplete blocks are not eligible for - # sharing, `num_computed_tokens` is always a multiple of - # `block_size`. - num_computed_tokens = len(computed_blocks) * self.block_size + computed_blocks, num_computed_tokens = \ + self.kv_cache_manager.get_computed_blocks(request) # Number of tokens to be scheduled. # We use `request.num_tokens` instead of # `request.num_prompt_tokens` to consider the resumed requests, @@ -372,18 +379,22 @@ class Scheduler: if self.encoder_cache_manager.has_cache(request, i): # The encoder input is already computed and cached. continue - if not self.encoder_cache_manager.can_allocate(request, i): - # The encoder cache is full. We can only schedule the decoder - # tokens just before the encoder input. - num_new_tokens = start_pos - num_computed_tokens - break - if num_encoder_tokens > encoder_budget: - # The encoder budget is exhausted. We can only schedule the - # decoder tokens up until the encoder input. - # NOTE(woosuk): We assume that the encoder tokens should be - # processed altogether, as the encoder usually uses + if (not self.encoder_cache_manager.can_allocate(request, i) + or num_encoder_tokens > encoder_budget): + # The encoder cache is full or the encoder budget is exhausted. + # NOTE(woosuk): We assume that the encoder input tokens should + # be processed altogether, as the encoder usually uses # bidirectional attention. - num_new_tokens = start_pos - num_computed_tokens + if num_computed_tokens < start_pos: + # We only schedule the decoder tokens just before the + # encoder input. + num_new_tokens = start_pos - num_computed_tokens + else: + # Because of prefix caching, num_computed_tokens is greater + # than start_pos even though its encoder input is not + # available. In this case, we can't schedule any token for + # the request in this step. + num_new_tokens = 0 break encoder_budget -= num_encoder_tokens @@ -394,12 +405,12 @@ class Scheduler: self, scheduler_output: "SchedulerOutput", model_runner_output: "ModelRunnerOutput", - ) -> List[EngineCoreOutput]: + ) -> EngineCoreOutputs: # NOTE(woosuk): This method doesn't consider speculative decoding. 
sampled_token_ids = model_runner_output.sampled_token_ids num_scheduled_tokens = scheduler_output.num_scheduled_tokens new_running: List[Request] = [] - engine_core_outputs: List[EngineCoreOutput] = [] + outputs: List[EngineCoreOutput] = [] for request in self.running: req_id = request.request_id request.num_computed_tokens += num_scheduled_tokens[req_id] @@ -438,7 +449,7 @@ class Scheduler: finished=request.is_finished(), finish_reason=request.get_finished_reason(), stop_reason=request.stop_reason) - engine_core_outputs.append(output) + outputs.append(output) # Breakout of the loop. if stopped: @@ -446,7 +457,10 @@ class Scheduler: new_running.append(request) self.running = new_running - return engine_core_outputs + return EngineCoreOutputs( + outputs=outputs, + scheduler_stats=self.make_stats(), + ) def _check_stop(self, request: Request) -> bool: if (request.num_tokens >= self.max_model_len @@ -515,6 +529,12 @@ class Scheduler: def has_unfinished_requests(self) -> bool: return self.get_num_unfinished_requests() > 0 + def make_stats(self) -> SchedulerStats: + return SchedulerStats( + num_running_reqs=len(self.running), + num_waiting_reqs=len(self.waiting), + ) + @dataclass class NewRequestData: diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 5e3c5e327ef63..6d90c38c72cf5 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -4,6 +4,8 @@ from typing import TYPE_CHECKING, List, Optional, Union import msgspec +from vllm.v1.metrics.stats import SchedulerStats + if TYPE_CHECKING: from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs @@ -19,8 +21,8 @@ class EngineCoreRequest: # due to circular imports and typing we have in data.py request_id: str - #NOTE(Nick): I don't think we need to pass prompt here since it should - # always be tokenized? + # NOTE(ywang96): original text prompt is needed when a request is added to + # Detokenizer, but set to None when it is added to EngineCoreClient. 
prompt: Optional[str] prompt_token_ids: List[int] mm_inputs: Optional[List[Optional["MultiModalKwargs"]]] @@ -56,6 +58,7 @@ class EngineCoreOutputs( # [num_reqs] outputs: List[EngineCoreOutput] + scheduler_stats: SchedulerStats @dataclass diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b963ba74f13f0..a74699f7513e6 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,10 +1,9 @@ import asyncio import os -from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union +from typing import AsyncGenerator, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.metrics_types import StatLoggerBase from vllm.engine.protocol import EngineClient from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType from vllm.inputs.preprocess import InputPreprocessor @@ -19,9 +18,11 @@ from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext from vllm.utils import kill_process_tree from vllm.v1.engine.core_client import EngineCoreClient -from vllm.v1.engine.detokenizer import Detokenizer +from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor +from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase +from vllm.v1.metrics.stats import IterationStats, SchedulerStats logger = init_logger(__name__) @@ -34,7 +35,6 @@ class AsyncLLM(EngineClient): executor_class: Type[Executor], log_stats: bool, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, input_registry: InputRegistry = INPUT_REGISTRY, use_cached_outputs: bool = False, log_requests: bool = True, @@ -45,7 +45,10 @@ class AsyncLLM(EngineClient): self.log_requests = log_requests self.log_stats = log_stats - self.stat_loggers = stat_loggers + self.stat_loggers: List[StatLoggerBase] = [ + LoggingStatLogger(), + # TODO(rob): PrometheusStatLogger(), + ] self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). @@ -56,9 +59,6 @@ class AsyncLLM(EngineClient): lora_config=vllm_config.lora_config) self.tokenizer.ping() - # Request streams (map of request_id -> queue). - self.rid_to_queue: Dict[str, asyncio.Queue] = {} - # Processor (converts Inputs --> EngineCoreRequests). self.processor = Processor( model_config=vllm_config.model_config, @@ -68,13 +68,9 @@ class AsyncLLM(EngineClient): input_registry=input_registry, ) - # Detokenizer (converts EngineCoreOutputs --> RequestOutput). - self.detokenizer = Detokenizer( - tokenizer_name=vllm_config.model_config.tokenizer, - tokenizer_mode=vllm_config.model_config.tokenizer_mode, - trust_remote_code=vllm_config.model_config.trust_remote_code, - revision=vllm_config.model_config.tokenizer_revision, - ) + # OutputProcessor (converts EngineCoreOutputs --> RequestOutput). + self.output_processor = OutputProcessor(self.tokenizer, + log_stats=self.log_stats) # EngineCore (starts the engine in background process). 
self.engine_core = EngineCoreClient.make_client( @@ -82,7 +78,6 @@ class AsyncLLM(EngineClient): asyncio_mode=True, vllm_config=vllm_config, executor_class=executor_class, - log_stats=self.log_stats, ) self.output_handler: Optional[asyncio.Task] = None @@ -94,7 +89,6 @@ class AsyncLLM(EngineClient): engine_config: Optional[VllmConfig] = None, start_engine_loop: bool = True, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, ) -> "AsyncLLM": """Create an AsyncLLM from the EngineArgs.""" @@ -114,7 +108,6 @@ class AsyncLLM(EngineClient): log_stats=not engine_args.disable_log_stats, start_engine_loop=start_engine_loop, usage_context=usage_context, - stat_loggers=stat_loggers, ) def shutdown(self): @@ -140,9 +133,9 @@ class AsyncLLM(EngineClient): """Add new request to the AsyncLLM.""" # 1) Create a new output queue for the request. - if request_id in self.rid_to_queue: + if self.output_processor.is_request_active(request_id): raise ValueError(f"Request id {request_id} already running.") - self.rid_to_queue[request_id] = asyncio.Queue() + queue: asyncio.Queue[RequestOutput] = asyncio.Queue() # 2) Convert Input --> Request. request = self.processor.process_inputs(request_id, prompt, params, @@ -151,8 +144,8 @@ class AsyncLLM(EngineClient): prompt_adapter_request, priority) - # 3) Add the request to Detokenizer (this process). - self.detokenizer.add_request(request) + # 3) Add the request to OutputProcessor (this process). + self.output_processor.add_request(request, queue) # 4) Add the EngineCoreRequest to EngineCore (separate process). await self.engine_core.add_request_async(request) @@ -160,7 +153,7 @@ class AsyncLLM(EngineClient): if self.log_requests: logger.info("Added request %s.", request_id) - return self.rid_to_queue[request_id] + return queue # TODO: we should support multiple prompts in one call, as you # can do with LLM.generate. So that for multi-prompt completion @@ -217,10 +210,9 @@ class AsyncLLM(EngineClient): # task switching under load which helps performance). out = q.get_nowait() if q.qsize() > 0 else await q.get() - # Note: both Detokenizer and EngineCore handle their + # Note: both OutputProcessor and EngineCore handle their # own request cleanup based on finished. if out.finished: - del self.rid_to_queue[request_id] yield out break @@ -233,50 +225,56 @@ class AsyncLLM(EngineClient): await self.abort(request_id) raise - def _process_request_outputs(self, request_outputs: List[RequestOutput]): - """Process outputs by putting them into per-request queues.""" - - for request_output in request_outputs: - request_id = request_output.request_id - - # Note: it is possible a request was aborted and removed from - # the state due to client cancellations, so if we encounter a - # request id not in the state, we skip. - if request_id in self.rid_to_queue: - self.rid_to_queue[request_id].put_nowait(request_output) - async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" try: while True: - # 1) Pull EngineCoreOutput from the EngineCore. + # 1) Pull EngineCoreOutputs from the EngineCore. outputs = await self.engine_core.get_output_async() - # 2) Detokenize based on the output. - request_outputs, reqs_to_abort = self.detokenizer.step(outputs) + # 2) Process EngineCoreOutputs. + processed_outputs = self.output_processor.process_outputs( + outputs.outputs) + # NOTE: RequestOutputs are pushed to their queues. 
+ assert len(processed_outputs.request_outputs) == 0 - # 3) Put the RequestOutputs into the per-request queues. - self._process_request_outputs(request_outputs) + # 3) Abort any reqs that finished due to stop strings. + await self.engine_core.abort_requests_async( + processed_outputs.reqs_to_abort) - # 4) Abort any requests that finished due to stop strings. - await self.engine_core.abort_requests_async(reqs_to_abort) + # 4) Logging. + # TODO(rob): make into a coroutine and launch it in + # background thread once we add Prometheus. + self._log_stats( + scheduler_stats=outputs.scheduler_stats, + iteration_stats=processed_outputs.iteration_stats, + ) except Exception as e: logger.exception("EngineCore output handler hit an error: %s", e) kill_process_tree(os.getpid()) async def abort(self, request_id: str) -> None: - """Abort RequestId in self, detokenizer, and engine core.""" + """Abort RequestId in OutputProcessor and EngineCore.""" request_ids = [request_id] await self.engine_core.abort_requests_async(request_ids) - self.detokenizer.abort_requests(request_ids) + self.output_processor.abort_requests(request_ids) - # If a request finishes while we await then the request_id - # will be removed from the tracked queues before we get here. - if request_id in self.rid_to_queue: - del self.rid_to_queue[request_id] + if self.log_requests: + logger.info("Aborted request %s.", request_id) + + def _log_stats( + self, + scheduler_stats: SchedulerStats, + iteration_stats: IterationStats, + ): + if not self.log_stats: + return + + for logger in self.stat_loggers: + logger.log(scheduler_stats=scheduler_stats) def encode( self, @@ -302,8 +300,7 @@ class AsyncLLM(EngineClient): self, lora_request: Optional[LoRARequest] = None, ) -> AnyTokenizer: - assert lora_request is None - return self.detokenizer.tokenizer + return self.tokenizer.get_lora_tokenizer(lora_request) async def is_tracing_enabled(self) -> bool: return False @@ -339,3 +336,7 @@ class AsyncLLM(EngineClient): @property def dead_error(self) -> BaseException: return Exception() # TODO: implement + + async def add_lora(self, lora_request: LoRARequest) -> None: + """Load a new LoRA adapter into the engine for future requests.""" + raise NotImplementedError("LoRA not yet supported in V1") diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 975ce11fe8aff..26ebc7edcf03e 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -11,15 +11,16 @@ import zmq import zmq.asyncio from msgspec import msgpack -from vllm.config import CacheConfig, VllmConfig +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.utils import get_exception_traceback, zmq_socket_ctx +from vllm.v1.core.kv_cache_utils import get_kv_cache_config from vllm.v1.core.scheduler import Scheduler -from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, - EngineCoreProfile, EngineCoreRequest, - EngineCoreRequestType, EngineCoreRequestUnion) +from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, + EngineCoreRequest, EngineCoreRequestType, + EngineCoreRequestUnion) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus @@ -28,9 +29,7 @@ from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) -POLLING_TIMEOUT_MS = 5000 -POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 -LOGGING_TIME_S = 5 +POLLING_TIMEOUT_S 
= 2.5


 class EngineCore:
@@ -40,10 +39,8 @@ class EngineCore:
         self,
         vllm_config: VllmConfig,
         executor_class: Type[Executor],
-        log_stats: bool = False,
     ):
         assert vllm_config.model_config.runner_type != "pooling"
-        self.log_stats = log_stats

         logger.info("Initializing an LLM engine (v%s) with config: %s",
                     VLLM_VERSION, vllm_config)
@@ -53,36 +50,41 @@ class EngineCore:

         # Setup KV Caches and update CacheConfig after profiling.
         num_gpu_blocks, num_cpu_blocks = self._initialize_kv_caches(
-            vllm_config.cache_config)
+            vllm_config)
         vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks
         vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks

         # Setup scheduler.
-        self.scheduler = Scheduler(vllm_config.scheduler_config,
-                                   vllm_config.cache_config,
-                                   vllm_config.lora_config)
-
-        self._last_logging_time = time.time()
+        self.scheduler = Scheduler(
+            scheduler_config=vllm_config.scheduler_config,
+            model_config=vllm_config.model_config,
+            cache_config=vllm_config.cache_config,
+            lora_config=vllm_config.lora_config,
+        )

         self.mm_input_mapper_server = MMInputMapperServer(
             vllm_config.model_config)

     def _initialize_kv_caches(self,
-                              cache_config: CacheConfig) -> Tuple[int, int]:
+                              vllm_config: VllmConfig) -> Tuple[int, int]:
         start = time.time()
-        num_gpu_blocks, _ = self.model_executor.determine_num_available_blocks(
-        )

-        if cache_config.num_gpu_blocks_override is not None:
-            num_gpu_blocks_override = cache_config.num_gpu_blocks_override
-            logger.info(
-                "Overriding num_gpu_blocks=%d with "
-                "num_gpu_blocks_override=%d", num_gpu_blocks,
-                num_gpu_blocks_override)
-            num_gpu_blocks = num_gpu_blocks_override
+        # Get all kv cache needed by the model
+        kv_cache_spec = self.model_executor.get_kv_cache_spec()

+        # Profiles the peak memory usage of the model to determine how much
+        # memory can be allocated for kv cache.
+        available_gpu_memory = self.model_executor.determine_available_memory()
+
+        # Get the kv cache tensor size
+        kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
+                                              available_gpu_memory)
+        num_gpu_blocks = kv_cache_config.num_blocks
         num_cpu_blocks = 0
-        self.model_executor.initialize(num_gpu_blocks)
+
+        # Initialize kv cache and warmup the execution
+        self.model_executor.initialize(kv_cache_config)
+
         elapsed = time.time() - start
         logger.info(("init engine (profile, create kv cache, "
                      "warmup model) took %.2f seconds"), elapsed)
@@ -114,11 +116,12 @@ class EngineCore:
         self.scheduler.finish_requests(request_ids,
                                        RequestStatus.FINISHED_ABORTED)

-    def step(self) -> List[EngineCoreOutput]:
+    def step(self) -> EngineCoreOutputs:
         """Schedule, execute, and make output."""

         if not self.scheduler.has_unfinished_requests():
-            return []
+            return EngineCoreOutputs(
+                outputs=[], scheduler_stats=self.scheduler.make_stats())

         scheduler_output = self.scheduler.schedule()
         output = self.model_executor.execute_model(scheduler_output)
@@ -145,7 +148,9 @@ class EngineCoreProc(EngineCore):
         executor_class: Type[Executor],
         log_stats: bool = False,
     ):
-        super().__init__(vllm_config, executor_class, log_stats)
+        super().__init__(vllm_config, executor_class)
+
+        self.log_stats = log_stats

         # Background Threads and Queues for IO. These enable us to
         # overlap ZMQ socket IO with GPU since they release the GIL,
         # model forward pass.
         # Threads handle Socket <-> Queues and core_busy_loop uses Queue.
self.input_queue: queue.Queue[EngineCoreRequestUnion] = queue.Queue() - self.output_queue: queue.Queue[List[EngineCoreOutput]] = queue.Queue() + self.output_queue: queue.Queue[EngineCoreOutputs] = queue.Queue() threading.Thread(target=self.process_input_socket, args=(input_path, ), daemon=True).start() @@ -217,8 +222,10 @@ class EngineCoreProc(EngineCore): self._handle_client_request(req) break except queue.Empty: - self._log_stats() logger.debug("EngineCore busy loop waiting.") + # Break out the loop so we can log_stats in step(). + if self.log_stats: + break except BaseException: raise @@ -230,28 +237,9 @@ class EngineCoreProc(EngineCore): # 3) Step the engine core. outputs = self.step() - # 4) Put EngineCoreOutputs into the output queue. + # 5) Put EngineCoreOutputs into the output queue. self.output_queue.put_nowait(outputs) - self._log_stats() - - def _log_stats(self): - """Log basic stats every LOGGING_TIME_S""" - - if not self.log_stats: - return - - now = time.time() - - if now - self._last_logging_time > LOGGING_TIME_S: - logger.info( - "RUNNING: %s | WAITING: %s", - len(self.scheduler.running), - len(self.scheduler.waiting), - ) - - self._last_logging_time = now - def _handle_client_request(self, request: EngineCoreRequestUnion) -> None: """Handle EngineCoreRequest or EngineCoreABORT from Client.""" @@ -301,7 +289,6 @@ class EngineCoreProc(EngineCore): with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: while True: - engine_core_outputs = self.output_queue.get() - outputs = EngineCoreOutputs(outputs=engine_core_outputs) + outputs = self.output_queue.get() encoder.encode_into(outputs, buffer) socket.send_multipart((buffer, ), copy=False) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index a4a45ae05ff9e..ac0f0f14bf1ab 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -12,9 +12,9 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree, make_zmq_socket) -from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, - EngineCoreProfile, EngineCoreRequest, - EngineCoreRequestType, EngineCoreRequestUnion) +from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, + EngineCoreRequest, EngineCoreRequestType, + EngineCoreRequestUnion) from vllm.v1.engine.core import EngineCore, EngineCoreProc from vllm.v1.executor.abstract import Executor from vllm.v1.serial_utils import PickleEncoder @@ -40,7 +40,6 @@ class EngineCoreClient(ABC): asyncio_mode: bool, vllm_config: VllmConfig, executor_class: Type[Executor], - log_stats: bool = False, ) -> "EngineCoreClient": # TODO: support this for debugging purposes. @@ -50,18 +49,18 @@ class EngineCoreClient(ABC): "is not currently supported.") if multiprocess_mode and asyncio_mode: - return AsyncMPClient(vllm_config, executor_class, log_stats) + return AsyncMPClient(vllm_config, executor_class) if multiprocess_mode and not asyncio_mode: - return SyncMPClient(vllm_config, executor_class, log_stats) + return SyncMPClient(vllm_config, executor_class) - return InprocClient(vllm_config, executor_class, log_stats) + return InprocClient(vllm_config, executor_class) @abstractmethod def shutdown(self): ... 
- def get_output(self) -> List[EngineCoreOutput]: + def get_output(self) -> EngineCoreOutputs: raise NotImplementedError def add_request(self, request: EngineCoreRequest) -> None: @@ -73,7 +72,7 @@ class EngineCoreClient(ABC): def abort_requests(self, request_ids: List[str]) -> None: raise NotImplementedError - async def get_output_async(self) -> List[EngineCoreOutput]: + async def get_output_async(self) -> EngineCoreOutputs: raise NotImplementedError async def add_request_async(self, request: EngineCoreRequest) -> None: @@ -99,14 +98,15 @@ class InprocClient(EngineCoreClient): def __init__(self, *args, **kwargs): self.engine_core = EngineCore(*args, **kwargs) - def get_output(self) -> List[EngineCoreOutput]: + def get_output(self) -> EngineCoreOutputs: return self.engine_core.step() def add_request(self, request: EngineCoreRequest) -> None: self.engine_core.add_request(request) def abort_requests(self, request_ids: List[str]) -> None: - self.engine_core.abort_requests(request_ids) + if len(request_ids) > 0: + self.engine_core.abort_requests(request_ids) def shutdown(self): self.engine_core.shutdown() @@ -133,7 +133,7 @@ class MPClient(EngineCoreClient): asyncio_mode: bool, vllm_config: VllmConfig, executor_class: Type[Executor], - log_stats: bool = False, + log_stats: bool, ): # The child processes will send SIGUSR1 when unrecoverable # errors happen. We kill the process tree here so that the @@ -194,22 +194,19 @@ class MPClient(EngineCoreClient): class SyncMPClient(MPClient): """Synchronous client for multi-proc EngineCore.""" - def __init__(self, - vllm_config: VllmConfig, - executor_class: Type[Executor], - log_stats: bool = False): + def __init__(self, vllm_config: VllmConfig, + executor_class: Type[Executor]): super().__init__( asyncio_mode=False, vllm_config=vllm_config, executor_class=executor_class, - log_stats=log_stats, + log_stats=False, ) - def get_output(self) -> List[EngineCoreOutput]: + def get_output(self) -> EngineCoreOutputs: (frame, ) = self.output_socket.recv_multipart(copy=False) - engine_core_outputs = self.decoder.decode(frame.buffer).outputs - return engine_core_outputs + return self.decoder.decode(frame.buffer) def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: @@ -219,10 +216,14 @@ class SyncMPClient(MPClient): self.input_socket.send_multipart(msg, copy=False) def add_request(self, request: EngineCoreRequest) -> None: + # NOTE: text prompt is not needed in the core engine as it has been + # tokenized. 
+ request.prompt = None self._send_input(EngineCoreRequestType.ADD, request) def abort_requests(self, request_ids: List[str]) -> None: - self._send_input(EngineCoreRequestType.ABORT, request_ids) + if len(request_ids) > 0: + self._send_input(EngineCoreRequestType.ABORT, request_ids) def profile(self, is_start: bool = True) -> None: self._send_input(EngineCoreRequestType.PROFILE, @@ -232,23 +233,19 @@ class SyncMPClient(MPClient): class AsyncMPClient(MPClient): """Asyncio-compatible client for multi-proc EngineCore.""" - def __init__(self, - vllm_config: VllmConfig, - executor_class: Type[Executor], - log_stats: bool = False): + def __init__(self, vllm_config: VllmConfig, + executor_class: Type[Executor]): super().__init__( asyncio_mode=True, vllm_config=vllm_config, executor_class=executor_class, - log_stats=log_stats, + log_stats=True, ) - async def get_output_async(self) -> List[EngineCoreOutput]: + async def get_output_async(self) -> EngineCoreOutputs: frames = await self.output_socket.recv_multipart(copy=False) - engine_core_outputs = self.decoder.decode(frames[0].buffer).outputs - - return engine_core_outputs + return self.decoder.decode(frames[0].buffer) async def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: @@ -257,6 +254,9 @@ class AsyncMPClient(MPClient): await self.input_socket.send_multipart(msg, copy=False) async def add_request_async(self, request: EngineCoreRequest) -> None: + # NOTE: text prompt is not needed in the core engine as it has been + # tokenized. + request.prompt = None await self._send_input(EngineCoreRequestType.ADD, request) async def abort_requests_async(self, request_ids: List[str]) -> None: diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 65be9e58e03c8..4a8b61beec037 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,18 +1,25 @@ from dataclasses import dataclass -from typing import Dict, Iterable, List, Optional, Tuple, Union +from typing import List, Optional, Union from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger -from vllm.outputs import RequestOutput from vllm.sampling_params import RequestOutputKind from vllm.transformers_utils.detokenizer_utils import ( AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) -from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest logger = init_logger(__name__) +@dataclass +class DetokenizerOutput: + output_text: str + token_ids: List[int] + finished: bool + finish_reason: Optional[str] = None + stop_reason: Union[int, str, None] = None + + @dataclass class IncrementalDetokenizer: @@ -20,6 +27,7 @@ class IncrementalDetokenizer: output_text: str tokens: List[str] token_ids: List[int] + prompt_len: int # Stop strings stop: List[str] @@ -34,11 +42,6 @@ class IncrementalDetokenizer: spaces_between_special_tokens: bool output_kind: RequestOutputKind - # TODO: Probably decouple these - request_id: str - prompt: Optional[str] - prompt_token_ids: List[int] - # Tokenizer for this request tokenizer: AnyTokenizer @@ -48,8 +51,7 @@ class IncrementalDetokenizer: @property def output_token_ids(self) -> List[int]: - assert len(self.token_ids) >= len(self.prompt_token_ids) - return self.token_ids[len(self.prompt_token_ids):] + return self.token_ids[self.prompt_len:] @classmethod def from_new_request( @@ -87,25 +89,25 @@ class IncrementalDetokenizer: 
spaces_between_special_tokens=request.sampling_params. spaces_between_special_tokens, output_kind=request.sampling_params.output_kind, - request_id=request.request_id, - prompt=request.prompt, - prompt_token_ids=request.prompt_token_ids, + prompt_len=len(request.prompt_token_ids), tokenizer=tokenizer, stop_buffer_length=stop_buffer_length, ) - def add_tokens( + def update_from_output( self, - new_token_ids: List[int], - finish_reason: Optional[str], - stop_reason: Optional[Union[int, str, None]], - ) -> Optional[RequestOutput]: + output: EngineCoreOutput, + ) -> Optional[DetokenizerOutput]: """ Update RequestState for the request_id by: 1) Detokenize the new token ids incrementally. 2) Update the RequestOutput with the new text. """ + new_token_ids = output.new_token_ids + finish_reason = output.finish_reason + stop_reason = output.stop_reason + # 1) Detokenize the new token ids incrementally. # TODO(woosuk): This method becomes very inefficient when the number of # new_token_ids is more than 1. We need to optimize this. @@ -158,21 +160,8 @@ class IncrementalDetokenizer: output_text = self._get_next_output_text(finished, delta) token_ids = new_token_ids if delta else self.output_token_ids - request_output = RequestOutput.new( - self.request_id, - self.prompt, - self.prompt_token_ids, - output_text, - token_ids, - finished, - ) - - if finished: - completion_output = request_output.outputs[0] - completion_output.finish_reason = finish_reason - completion_output.stop_reason = stop_reason - - return request_output + return DetokenizerOutput(output_text, token_ids, finished, + finish_reason, stop_reason) def _get_next_output_text(self, finished: bool, delta: bool) -> str: """If delta is True, only new text since the last call to @@ -189,85 +178,3 @@ class IncrementalDetokenizer: self._last_output_text_offset = length return self.output_text[last_offset:length] return "" - - -class Detokenizer: - - def __init__(self, - tokenizer_name: str, - tokenizer_mode: str = "auto", - trust_remote_code: bool = False, - revision: Optional[str] = None): - # TODO: once we support LoRA, we should should pass the tokenizer - # here. We currently have two copies (this + in the LLMEngine). 
- self.tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, - tokenizer_mode=tokenizer_mode, - trust_remote_code=trust_remote_code, - revision=revision) - - # Request id -> IncrementalDetokenizer - self.request_states: Dict[str, IncrementalDetokenizer] = {} - - def is_request_active(self, request_id: str): - return request_id in self.request_states - - def get_num_unfinished_requests(self): - return len(self.request_states) - - def has_unfinished_requests(self) -> bool: - return len(self.request_states) > 0 - - def abort_requests( - self, - request_ids: Iterable[str], - ) -> None: - """Remove the request_ids from the Detokenizer.""" - - for request_id in request_ids: - self.request_states.pop(request_id, None) - - def add_request( - self, - request: EngineCoreRequest, - ): - """Add new request to the Detokenizer.""" - - assert (request.request_id not in self.request_states) - - request_state = IncrementalDetokenizer.from_new_request( - self.tokenizer, request) - self.request_states[request.request_id] = request_state - - def step( - self, encore_core_outputs: List[EngineCoreOutput] - ) -> Tuple[List[RequestOutput], List[str]]: - """Update state and request the RequestOutputs to the LLMEngine.""" - - request_outputs: List[RequestOutput] = [] - requests_to_abort: List[str] = [] - for engine_core_output in encore_core_outputs: - request_id = engine_core_output.request_id - detokenizer = self.request_states.get(request_id) - if detokenizer is None: - # Ignore output for already-aborted request. - continue - - # Detokenize and update state. - request_output = detokenizer.add_tokens( - new_token_ids=engine_core_output.new_token_ids, - finish_reason=engine_core_output.finish_reason, - stop_reason=engine_core_output.stop_reason, - ) - - if request_output is not None: - # Add to RequestOutputs list. - request_outputs.append(request_output) - - # Free completed requests. - if request_output.finished: - self.request_states.pop(request_id) - if not engine_core_output.finished: - requests_to_abort.append(request_id) - - # Return to EngineClient. - return request_outputs, requests_to_abort diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 8ced3a34d2da3..f5999ccda6447 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -18,7 +18,7 @@ from vllm.transformers_utils.tokenizer_group import ( BaseTokenizerGroup, init_tokenizer_from_configs) from vllm.usage.usage_lib import UsageContext from vllm.v1.engine.core_client import EngineCoreClient -from vllm.v1.engine.detokenizer import Detokenizer +from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor @@ -60,13 +60,9 @@ class LLMEngine: input_registry=input_registry, mm_registry=mm_registry) - # Detokenizer (converts EngineCoreOutputs --> RequestOutput) - self.detokenizer = Detokenizer( - tokenizer_name=vllm_config.model_config.tokenizer, - tokenizer_mode=vllm_config.model_config.tokenizer_mode, - trust_remote_code=vllm_config.model_config.trust_remote_code, - revision=vllm_config.model_config.tokenizer_revision, - ) + # OutputProcessor (convert EngineCoreOutputs --> RequestOutput). 
+ self.output_processor = OutputProcessor(self.tokenizer, + log_stats=False) # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) self.engine_core = EngineCoreClient.make_client( @@ -74,7 +70,6 @@ class LLMEngine: asyncio_mode=False, vllm_config=vllm_config, executor_class=executor_class, - log_stats=False, ) @classmethod @@ -104,10 +99,10 @@ class LLMEngine: multiprocess_mode=enable_multiprocessing) def get_num_unfinished_requests(self) -> int: - return self.detokenizer.get_num_unfinished_requests() + return self.output_processor.get_num_unfinished_requests() def has_unfinished_requests(self) -> bool: - return self.detokenizer.has_unfinished_requests() + return self.output_processor.has_unfinished_requests() @classmethod def validate_outputs(cls, outputs, output_type): @@ -117,7 +112,7 @@ class LLMEngine: """Remove request_ids from EngineCore and Detokenizer.""" self.engine_core.abort_requests(request_ids) - self.detokenizer.abort_requests(request_ids) + self.output_processor.abort_requests(request_ids) def add_request( self, @@ -138,8 +133,8 @@ class LLMEngine: prompt_adapter_request, priority) - # 2) Add the request to Detokenizer. - self.detokenizer.add_request(request) + # 2) Make a new RequestState and queue. + self.output_processor.add_request(request) # 3) Add the request to EngineCore. self.engine_core.add_request(request) @@ -147,17 +142,16 @@ class LLMEngine: def step(self) -> List[RequestOutput]: # 1) Get EngineCoreOutput from the EngineCore. - engine_core_outputs = self.engine_core.get_output() + outputs = self.engine_core.get_output() - # 2) Detokenizer the EngineCoreOutput. - request_outputs, requests_to_abort = self.detokenizer.step( - engine_core_outputs) + # 2) Process EngineCoreOutputs. + processed_outputs = self.output_processor.process_outputs( + outputs.outputs) - # 3) Abort requests that finished due to stopping criteria. - if requests_to_abort: - self.abort_request(requests_to_abort) + # 3) Abort any reqs that finished due to stop strings. 
+ self.engine_core.abort_requests(processed_outputs.reqs_to_abort) - return request_outputs + return processed_outputs.request_outputs def get_model_config(self): return self.model_config diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py new file mode 100644 index 0000000000000..749f4f5043c97 --- /dev/null +++ b/vllm/v1/engine/output_processor.py @@ -0,0 +1,200 @@ +import asyncio +from dataclasses import dataclass +from typing import Dict, List, Optional + +from vllm.outputs import RequestOutput +from vllm.transformers_utils.detokenizer_utils import AnyTokenizer +from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup +from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest +from vllm.v1.engine.detokenizer import (DetokenizerOutput, + IncrementalDetokenizer) +from vllm.v1.metrics.stats import IterationStats + + +@dataclass +class OutputProcessorOutput: + + request_outputs: List[RequestOutput] + reqs_to_abort: List[str] + iteration_stats: IterationStats + + +class RequestState: + + def __init__( + self, + request_id: str, + prompt: Optional[str], + prompt_token_ids: List[int], + detokenizer: IncrementalDetokenizer, + queue: Optional[asyncio.Queue[RequestOutput]], + ): + self.request_id = request_id + self.prompt = prompt + self.prompt_token_ids = prompt_token_ids + self.prompt_len = len(prompt_token_ids) + self.detokenizer = detokenizer + self.is_prefilling = True + self.queue = queue + + @classmethod + def from_new_request( + cls, + tokenizer: AnyTokenizer, + request: EngineCoreRequest, + queue: Optional[asyncio.Queue[RequestOutput]] = None, + ) -> "RequestState": + return cls( + request_id=request.request_id, + prompt=request.prompt, + prompt_token_ids=request.prompt_token_ids, + detokenizer=IncrementalDetokenizer.from_new_request( + tokenizer=tokenizer, + request=request, + ), + queue=queue, + ) + + +class OutputProcessor: + """Process EngineCoreOutputs into RequestOutputs.""" + + def __init__( + self, + tokenizer: BaseTokenizerGroup, + log_stats: bool, + ): + self.log_stats = log_stats + self.tokenizer = tokenizer + self.request_states: Dict[str, RequestState] = {} + + def is_request_active(self, request_id: str) -> bool: + return request_id in self.request_states + + def get_num_unfinished_requests(self): + return len(self.request_states) + + def has_unfinished_requests(self) -> bool: + return len(self.request_states) > 0 + + def abort_requests( + self, + request_ids: List[str], + ) -> None: + for request_id in request_ids: + self.request_states.pop(request_id, None) + + def add_request( + self, + request: EngineCoreRequest, + queue: Optional[asyncio.Queue[RequestOutput]] = None, + ) -> None: + request_id = request.request_id + if request_id in self.request_states: + raise ValueError(f"Request id {request_id} already running.") + + self.request_states[request_id] = RequestState.from_new_request( + tokenizer=self.tokenizer.get_lora_tokenizer(request.lora_request), + request=request, + queue=queue) + + def process_outputs( + self, + engine_core_outputs: List[EngineCoreOutput], + ) -> OutputProcessorOutput: + """ + Process the EngineCoreOutputs: + 1) Compute stats for logging + 2) Detokenize + 3) Create and handle RequestOutput objects: + * If there is a queue (for usage with AsyncLLM), + put the RequestOutput objects into the queue for + handling by the per-request generate() tasks. + + * If there is no queue (for usage with LLMEngine), + return a list of RequestOutput objects. 
+ + ****************** NOTE FOR DEVELOPERS ****************** + + vLLM V1 minimizes the number of Python loops over the full + batch to keep system overheads low. This is the + only function that should loop over EngineCoreOutputs. + + If you need to touch every element of the batch, implement a + method called XXXClass.update_from_output() to be called + within the loop below. For examples, see: + * IterationStats.update_from_output() + * Detokenizer.update_from_output() + + TODO(rob): add a Protocol to make update_from_output explicit. + + ********************************************************** + """ + + request_outputs: List[RequestOutput] = [] + reqs_to_abort: List[str] = [] + iteration_stats = IterationStats(self.log_stats) + for engine_core_output in engine_core_outputs: + req_id = engine_core_output.request_id + req_state = self.request_states.get(req_id) + if req_state is None: + # Ignore output for already-aborted request. + continue + + # 1) Compute stats for this iteration. + iteration_stats.update_from_output(engine_core_output, + req_state.is_prefilling, + req_state.prompt_len) + req_state.is_prefilling = False + + # 2) Detokenize the token ids into text. + detokenizer_output = req_state.detokenizer.update_from_output( + engine_core_output) + + # 3) Create and handle RequestOutput objects. + if request_output := self._make_request_output( + req_state, detokenizer_output): + if req_state.queue is not None: + # AsyncLLM: put into queue for handling by generate(). + req_state.queue.put_nowait(request_output) + else: + # LLMEngine: return list of RequestOutputs. + request_outputs.append(request_output) + + # Free completed requests. + if request_output.finished: + self.request_states.pop(req_id) + if not engine_core_output.finished: + # If the request is not finished in EngineCore, but the + # Detokenizer detected a stop string, it must be aborted + # in EngineCore.
+ reqs_to_abort.append(req_id) + + return OutputProcessorOutput( + request_outputs=request_outputs, + reqs_to_abort=reqs_to_abort, + iteration_stats=iteration_stats, + ) + + def _make_request_output( + self, + request_state: RequestState, + detokenizer_output: Optional[DetokenizerOutput], + ) -> Optional[RequestOutput]: + + if detokenizer_output is None: + return None + + request_output = RequestOutput.new( + request_state.request_id, + request_state.prompt, + request_state.prompt_token_ids, + detokenizer_output.output_text, + detokenizer_output.token_ids, + detokenizer_output.finished, + ) + if detokenizer_output.finished: + completion_output = request_output.outputs[0] + completion_output.finish_reason = detokenizer_output.finish_reason + completion_output.stop_reason = detokenizer_output.stop_reason + + return request_output diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index 5d74d4b01f500..131be759842c7 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -1,57 +1,92 @@ -from abc import ABC, abstractmethod -from typing import Tuple, Type +from typing import Type from vllm.config import VllmConfig +from vllm.executor.executor_base import ExecutorBase +from vllm.executor.ray_distributed_executor import ( # noqa + RayDistributedExecutor as RayDistributedExecutorV0) +from vllm.executor.uniproc_executor import ( # noqa + ExecutorWithExternalLauncher as ExecutorWithExternalLauncherV0) +from vllm.executor.uniproc_executor import ( # noqa + UniProcExecutor as UniProcExecutorV0) +from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ModelRunnerOutput -class Executor(ABC): - """Abstract class for executors.""" +class Executor(ExecutorBase): + """ + Abstract class for v1 executors; mainly defines v1-specific methods. + Methods shared by v0 and v1 should be defined in ExecutorBase.""" @staticmethod def get_class(vllm_config: VllmConfig) -> Type["Executor"]: executor_class: Type[Executor] + parallel_config = vllm_config.parallel_config distributed_executor_backend = ( - vllm_config.parallel_config.distributed_executor_backend) + parallel_config.distributed_executor_backend) + if distributed_executor_backend is None: + # If the user does not specify the distributed executor backend, + # we will choose the backend based on the world size.
+ if parallel_config.world_size > 1: + distributed_executor_backend = "mp" + else: + distributed_executor_backend = "uni" + if distributed_executor_backend == "ray": - from vllm.v1.executor.ray_executor import RayExecutor - executor_class = RayExecutor + executor_class = RayDistributedExecutor elif distributed_executor_backend == "mp": from vllm.v1.executor.multiproc_executor import MultiprocExecutor executor_class = MultiprocExecutor + elif distributed_executor_backend == "uni": + executor_class = UniProcExecutor + elif distributed_executor_backend == "external_launcher": + # TODO: make v1 scheduling deterministic + # to support external launcher + executor_class = ExecutorWithExternalLauncher else: - assert (distributed_executor_backend is None) - from vllm.v1.executor.uniproc_executor import UniprocExecutor - executor_class = UniprocExecutor + raise ValueError("Unknown distributed executor backend: " + f"{distributed_executor_backend}") return executor_class - @abstractmethod - def __init__(self, vllm_config: VllmConfig) -> None: - raise NotImplementedError + def initialize(self, kv_cache_config: KVCacheConfig) -> None: + """ + Initialize the KV caches and begin the model execution loop of the + underlying workers. + """ + self.collective_rpc("initialize_cache", args=(kv_cache_config, )) + self.collective_rpc("compile_or_warm_up_model") - @abstractmethod - def initialize(self, num_gpu_blocks: int) -> None: - raise NotImplementedError + def determine_available_memory(self) -> int: # in bytes + output = self.collective_rpc("determine_available_memory") + # Since we use a shared centralized controller, we take the minimum + # memory size across all workers to make sure all the memory + # operators can be applied to all workers. + return min(output) - @abstractmethod - def determine_num_available_blocks(self) -> Tuple[int, int]: - raise NotImplementedError + def get_kv_cache_spec(self) -> KVCacheSpec: + output = self.collective_rpc("get_kv_cache_spec") + for x in output: + assert x == output[0] + return output[0] - @abstractmethod def execute_model( self, scheduler_output, ) -> ModelRunnerOutput: - raise NotImplementedError + output = self.collective_rpc("execute_model", + args=(scheduler_output, )) + return output[0] - @abstractmethod def profile(self, is_start: bool = True): - raise NotImplementedError + self.collective_rpc("profile", args=(is_start, )) - @abstractmethod - def shutdown(self): - pass - @abstractmethod - def check_health(self) -> None: - raise NotImplementedError +class UniProcExecutor(UniProcExecutorV0, Executor): + pass + + +class ExecutorWithExternalLauncher(ExecutorWithExternalLauncherV0, Executor): + pass + + +class RayDistributedExecutor(RayDistributedExecutorV0, Executor): + pass diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 41e6abbd67956..f6cf35da0106b 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -6,9 +6,11 @@ import time import weakref from dataclasses import dataclass from enum import Enum, auto +from functools import partial from multiprocessing.process import BaseProcess -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import cloudpickle import psutil import zmq @@ -23,7 +25,6 @@ from vllm.logger import init_logger from vllm.utils import (get_distributed_init_method, get_mp_context, get_open_port, get_open_zmq_ipc_path, zmq_socket_ctx) from vllm.v1.executor.abstract import Executor 
-from vllm.v1.outputs import ModelRunnerOutput from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -34,7 +35,7 @@ POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 class MultiprocExecutor(Executor): - def __init__(self, vllm_config: VllmConfig) -> None: + def _init_executor(self) -> None: # Call self.shutdown at exit to clean up # and ensure workers will be terminated. self._finalizer = weakref.finalize(self, self.shutdown) @@ -52,9 +53,6 @@ class MultiprocExecutor(Executor): signal.signal(signal.SIGUSR1, sigusr1_handler) - self.vllm_config = vllm_config - self.parallel_config = vllm_config.parallel_config - self.world_size = self.parallel_config.world_size tensor_parallel_size = self.parallel_config.tensor_parallel_size assert self.world_size == tensor_parallel_size, ( @@ -79,7 +77,8 @@ class MultiprocExecutor(Executor): # Create workers self.workers: List[WorkerProcHandle] = [] for rank in range(self.world_size): - worker = WorkerProc.make_worker_process(vllm_config, rank, rank, + worker = WorkerProc.make_worker_process(self.vllm_config, rank, + rank, distributed_init_method, scheduler_output_handle) self.workers.append(worker) @@ -90,53 +89,24 @@ class MultiprocExecutor(Executor): for w in self.workers: w.worker_response_mq.wait_until_ready() - def initialize(self, num_gpu_blocks: int) -> None: - """ - Initialize the KV caches and begin the model execution loop of the - underlying workers. - """ - logger.info("# GPU blocks: %d", num_gpu_blocks) - self.collective_rpc("initialize_cache", args=(num_gpu_blocks, )) - self.collective_rpc("compile_or_warm_up_model") - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """ - Determine the number of available KV blocks by invoking the - underlying worker. - """ - num_blocks = self.collective_rpc("determine_num_available_blocks") - - # Since we use a shared centralized controller, we take the minimum - # number of blocks across all workers to make sure all the memory - # operators can be applied to all workers. - - num_gpu_blocks = min(b[0] for b in num_blocks) - num_cpu_blocks = min(b[1] for b in num_blocks) - - return num_gpu_blocks, num_cpu_blocks - def collective_rpc(self, - method: str, + method: Union[str, Callable], timeout: Optional[float] = None, args: Tuple = (), kwargs: Optional[Dict] = None) -> List[Any]: - """ - Execute an RPC call on workers. - - Args: - method: Name of the worker method to execute - timeout: Maximum time in seconds to wait for execution. Rases a - TimeoutError on timeout. None means wait indefinitely. - args: Positional arguments to pass to the worker method - kwargs: Keyword arguments to pass to the worker method - - Returns: - List of results from each worker - """ start_time = time.monotonic() kwargs = kwargs or {} + # NOTE: If the args are heterogeneous, then we pack them into a list, + # and unpack them in the method of every worker, because every worker + # knows its own rank.
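The `collective_rpc` signature change above (with the matching `worker_busy_loop` change further down) lets callers pass either a worker method name or an arbitrary callable; callables are cloudpickled on the client side and rebound to the worker on the receiving end. A minimal self-contained sketch of that dispatch, using a hypothetical `Worker` class:

```python
import pickle
from functools import partial

import cloudpickle


class Worker:
    # Hypothetical worker; the real one is vLLM's GPU worker.
    def __init__(self, rank: int) -> None:
        self.rank = rank

    def ping(self) -> str:
        return f"pong from rank {self.rank}"


def dispatch(worker: Worker, method, *args, **kwargs):
    if isinstance(method, str):
        # Plain string: look the method up on the worker.
        func = getattr(worker, method)
    else:
        # Bytes from cloudpickle.dumps: deserialize the callable and
        # bind the worker as its first argument.
        func = partial(cloudpickle.loads(method), worker)
    return func(*args, **kwargs)


worker = Worker(rank=0)
print(dispatch(worker, "ping"))  # -> pong from rank 0

payload = cloudpickle.dumps(lambda w: w.rank * 2,
                            protocol=pickle.HIGHEST_PROTOCOL)
print(dispatch(worker, payload))  # -> 0
```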
try: - self.rpc_broadcast_mq.enqueue((method, args, kwargs)) + if isinstance(method, str): + send_method = method + else: + send_method = cloudpickle.dumps( + method, protocol=pickle.HIGHEST_PROTOCOL) + self.rpc_broadcast_mq.enqueue((send_method, args, kwargs)) responses = [None] * self.world_size for w in self.workers: @@ -160,18 +130,6 @@ class MultiprocExecutor(Executor): # Re-raise any other exceptions raise e - def execute_model( - self, - scheduler_output, - ) -> ModelRunnerOutput: - model_output = self.collective_rpc("execute_model", - args=(scheduler_output, ))[0] - return model_output - - def profile(self, is_start: bool = True): - self.collective_rpc("profile", args=(is_start, )) - return - def _ensure_worker_termination(self): """Ensure that all worker processes are terminated. Assumes workers have received termination requests. Waits for processing, then sends @@ -246,9 +204,18 @@ class WorkerProc: ready_path: str, ): self.rank = rank - wrapper = WorkerWrapperBase(vllm_config=vllm_config) - wrapper.init_worker(vllm_config, local_rank, rank, - distributed_init_method) + wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank) + # TODO: move `init_worker` to executor level as a collective rpc call + all_kwargs: List[Dict] = [ + {} for _ in range(vllm_config.parallel_config.world_size) + ] + all_kwargs[rank] = { + "vllm_config": vllm_config, + "local_rank": local_rank, + "rank": rank, + "distributed_init_method": distributed_init_method, + } + wrapper.init_worker(all_kwargs) self.worker = wrapper.worker pid = os.getpid() @@ -270,7 +237,7 @@ class WorkerProc: ready_socket.send_string(WorkerProc.READY_STR) ready_socket.send(payload) - self.worker.initialize() + self.worker.init_device() self.worker.load_model() @staticmethod @@ -394,7 +361,11 @@ class WorkerProc: method, args, kwargs = self.rpc_broadcast_mq.dequeue() try: - output = getattr(self.worker, method)(*args, **kwargs) + if isinstance(method, str): + func = getattr(self.worker, method) + elif isinstance(method, bytes): + func = partial(cloudpickle.loads(method), self.worker) + output = func(*args, **kwargs) except Exception as e: self.worker_response_mq.enqueue( (WorkerProc.ResponseStatus.FAILURE, e)) diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py deleted file mode 100644 index 79acc60001c99..0000000000000 --- a/vllm/v1/executor/ray_executor.py +++ /dev/null @@ -1,342 +0,0 @@ -import os -from collections import defaultdict -from itertools import islice, repeat -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple - -import vllm.envs as envs -from vllm.config import VllmConfig -from vllm.logger import init_logger -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.v1.executor.abstract import Executor -from vllm.v1.executor.ray_utils import (RayWorkerWrapper, - initialize_ray_cluster, ray) -from vllm.v1.outputs import ModelRunnerOutput - -if ray is not None: - from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy - -if TYPE_CHECKING: - from ray.util.placement_group import PlacementGroup - -logger = init_logger(__name__) - - -class RayExecutor(Executor): - - def __init__(self, vllm_config: VllmConfig) -> None: - self.vllm_config = vllm_config - self.parallel_config = vllm_config.parallel_config - self.model_config = vllm_config.model_config - self.forward_dag: Optional[ray.dag.CompiledDAG] = None - - # Disable Ray usage stats collection. 
- ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") - if ray_usage != "1": - os.environ["RAY_USAGE_STATS_ENABLED"] = "0" - - initialize_ray_cluster(self.parallel_config) - placement_group = self.parallel_config.placement_group - - # Create the parallel GPU workers. - self._init_workers_ray(placement_group) - - def _init_workers_ray(self, placement_group: "PlacementGroup", - **ray_remote_kwargs): - # A list of workers to run a model. - self.workers: List[RayWorkerWrapper] = [] - if self.parallel_config.ray_workers_use_nsight: - ray_remote_kwargs = self._configure_ray_workers_use_nsight( - ray_remote_kwargs) - - # Create the workers. - driver_ip = get_ip() - for bundle_id, bundle in enumerate(placement_group.bundle_specs): - if not bundle.get("GPU", 0): - # Skip bundles that don't have GPUs, - # as each worker needs one GPU. - continue - scheduling_strategy = PlacementGroupSchedulingStrategy( - placement_group=placement_group, - placement_group_capture_child_tasks=True, - placement_group_bundle_index=bundle_id, - ) - - worker = ray.remote( - num_cpus=0, - num_gpus=1, - scheduling_strategy=scheduling_strategy, - **ray_remote_kwargs, - )(RayWorkerWrapper).remote(vllm_config=self.vllm_config) - self.workers.append(worker) - - logger.debug("workers: %s", self.workers) - worker_ips = [ - ray.get(worker.get_node_ip.remote()) # type: ignore[attr-defined] - for worker in self.workers - ] - ip_counts: Dict[str, int] = {} - for ip in worker_ips: - ip_counts[ip] = ip_counts.get(ip, 0) + 1 - - worker_to_ip = dict(zip(self.workers, worker_ips)) - - def sort_by_driver_then_worker_ip(worker): - """ - Sort the workers based on 3 properties: - 1. If the worker is on the same node as the driver (vllm engine), - it should be placed first. - 2. Then, if the worker is on a node with fewer workers, it should - be placed first. - 3. Finally, if the work is on a node with smaller IP address, it - should be placed first. This is simply a tiebreaker to make - sure the workers are sorted in a deterministic way. - """ - ip = worker_to_ip[worker] - return (ip != driver_ip, ip_counts[ip], ip) - - # After sorting, the workers on the same node will be - # close to each other, and the workers on the driver - # node will be placed first. - self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip) - - # Get the set of GPU IDs used on each node. - worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids") - - node_workers = defaultdict(list) # node id -> list of worker ranks - node_gpus = defaultdict(list) # node id -> list of gpu ids - - for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): - node_workers[node_id].append(i) - # `gpu_ids` can be a list of strings or integers. - # convert them to integers for consistency. - # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs), - # string sorting is not sufficient. - # see https://github.com/vllm-project/vllm/issues/5590 - gpu_ids = [int(x) for x in gpu_ids] - node_gpus[node_id].extend(gpu_ids) - - for node_id, gpu_ids in node_gpus.items(): - node_gpus[node_id] = sorted(gpu_ids) - - all_ips = set(worker_ips) - n_ips = len(all_ips) - n_nodes = len(node_workers) - - if n_nodes != n_ips: - raise RuntimeError( - f"Every node should have a unique IP address. Got {n_nodes}" - f" nodes with node ids {list(node_workers.keys())} and " - f"{n_ips} unique IP addresses {all_ips}. Please check your" - " network configuration. 
If you set `VLLM_HOST_IP` or " - "`HOST_IP` environment variable, make sure it is unique for" - " each node.") - - # Set environment variables for the driver and workers. - all_args_to_update_environment_variables = [({ - "CUDA_VISIBLE_DEVICES": - ",".join(map(str, node_gpus[node_id])), - "VLLM_TRACE_FUNCTION": - str(envs.VLLM_TRACE_FUNCTION), - "VLLM_USE_V1": - str(int(envs.VLLM_USE_V1)), - **({ - "VLLM_ATTENTION_BACKEND": envs.VLLM_ATTENTION_BACKEND - } if envs.VLLM_ATTENTION_BACKEND is not None else {}) - }, ) for (node_id, _) in worker_node_and_gpu_ids] - - self._env_vars_for_all_workers = ( - all_args_to_update_environment_variables) - - self._run_workers("update_environment_variables", - all_args=self._get_env_vars_to_be_updated()) - - if len(node_gpus) == 1: - # in single node case, we don't need to get the IP address. - # the loopback address is sufficient - # NOTE: a node may have several IP addresses, one for each - # network interface. `get_ip()` might return any of them, - # while they might not work for communication inside the node - # if the network setup is complicated. Using the loopback address - # solves this issue, as it always works for communication inside - # the node. - driver_ip = "127.0.0.1" - distributed_init_method = get_distributed_init_method( - driver_ip, get_open_port()) - - # Initialize the actual workers inside worker wrapper. - init_worker_all_kwargs = [ - self._get_worker_kwargs( - local_rank=node_workers[node_id].index(rank), - rank=rank, - distributed_init_method=distributed_init_method, - ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids) - ] - self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) - self._run_workers("initialize") - self._run_workers("load_model") - - def _configure_ray_workers_use_nsight(self, - ray_remote_kwargs) -> Dict[str, Any]: - # If nsight profiling is enabled, we need to set the profiling - # configuration for the ray workers as runtime env. - runtime_env = ray_remote_kwargs.setdefault("runtime_env", {}) - runtime_env.update({ - "nsight": { - "t": "cuda,cudnn,cublas", - "o": "'worker_process_%p'", - "cuda-graph-trace": "node", - } - }) - - return ray_remote_kwargs - - def _get_env_vars_to_be_updated(self): - return self._env_vars_for_all_workers - - def _get_worker_kwargs( - self, - local_rank: int = 0, - rank: int = 0, - distributed_init_method: Optional[str] = None) -> Dict[str, Any]: - """ - Return worker init args for a given rank. - """ - if distributed_init_method is None: - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - return dict( - vllm_config=self.vllm_config, - local_rank=local_rank, - rank=rank, - distributed_init_method=distributed_init_method, - ) - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """ - Determine the number of available KV blocks. - - This invokes `determine_num_available_blocks` on each worker and takes - the min of the results, guaranteeing that the selected cache sizes are - compatible with all workers. - - Returns: - - tuple[num_gpu_blocks, num_cpu_blocks] - """ - # Get the maximum number of blocks that can be allocated on GPU and CPU. - num_blocks = self._run_workers("determine_num_available_blocks") - - # Since we use a shared centralized controller, we take the minimum - # number of blocks across all workers to make sure all the memory - # operators can be applied to all workers. 
- num_gpu_blocks = min(b[0] for b in num_blocks) - num_cpu_blocks = min(b[1] for b in num_blocks) - - return num_gpu_blocks, num_cpu_blocks - - def initialize(self, num_gpu_blocks: int) -> None: - """ - Initialize the KV cache in all workers. - """ - # NOTE: This is logged in the executor because there can be >1 worker - # with other executors. We could log in the engine level, but work - # remains to abstract away the device for non-GPU configurations. - logger.info("# GPU blocks: %d", num_gpu_blocks) - self._run_workers("initialize_cache", num_gpu_blocks) - self._run_workers("compile_or_warm_up_model") - - def _run_workers( - self, - method: str, - *args, - all_args: Optional[List[Tuple[Any, ...]]] = None, - all_kwargs: Optional[List[Dict[str, Any]]] = None, - **kwargs, - ) -> Any: - """ - Runs the given method on all workers. Can be used in the following - ways: - - Args: - - args/kwargs: All workers share the same args/kwargs - - all_args/all_kwargs: args/kwargs for each worker are specified - individually - """ - count = len(self.workers) - all_worker_args = repeat(args, count) if all_args is None \ - else islice(all_args, 0, None) - all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ - else islice(all_kwargs, 0, None) - - ray_worker_refs = [ - worker.execute_method.remote( # type: ignore[attr-defined] - method, *worker_args, **worker_kwargs) - for (worker, worker_args, worker_kwargs - ) in zip(self.workers, all_worker_args, all_worker_kwargs) - ] - return ray.get(ray_worker_refs) - - def execute_model( - self, - scheduler_output, - ) -> ModelRunnerOutput: - if self.forward_dag is None: - self.forward_dag = self._compiled_ray_dag() - # Only the first worker (with rank 0) returns the execution result. - # Others return None. - output = ray.get(self.forward_dag.execute(scheduler_output))[0] - return output - - def profile(self, is_start=True): - raise NotImplementedError - - def shutdown(self): - if hasattr(self, "forward_dag") and self.forward_dag is not None: - self.forward_dag.teardown() - import ray - for worker in self.workers: - ray.kill(worker) - self.forward_dag = None - - def check_health(self) -> None: - logger.debug("Called check_health.") - - def _check_ray_compiled_graph_installation(self): - import pkg_resources - from packaging import version - - required_version = version.parse("2.39") - current_version = version.parse( - pkg_resources.get_distribution("ray").version) - if current_version < required_version: - raise ValueError(f"Ray version {required_version} is " - f"required, but found {current_version}") - - import importlib.util - raycg = importlib.util.find_spec("ray.experimental.compiled_dag_ref") - if raycg is None: - raise ValueError("Ray Compiled Graph is not installed. " - "Run `pip install ray[adag]` to install it.") - - cupy_spec = importlib.util.find_spec("cupy") - if cupy_spec is None and envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: - raise ValueError( - "cupy is not installed but required since " - "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL is set." 
- "Run `pip install ray[adag]` and check cupy installation.") - - def _compiled_ray_dag(self): - assert self.parallel_config.use_ray - self._check_ray_compiled_graph_installation() - from ray.dag import InputNode, MultiOutputNode - - with InputNode() as input_batches: - outputs = [ - worker.execute_model.bind( # type: ignore[attr-defined] - input_batches) for worker in self.workers - ] - forward_dag = MultiOutputNode(outputs) - - return forward_dag.experimental_compile() - - def __del__(self): - self.shutdown() diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py deleted file mode 100644 index 7733610e59c7f..0000000000000 --- a/vllm/v1/executor/ray_utils.py +++ /dev/null @@ -1,271 +0,0 @@ -import time -from collections import defaultdict -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple - -from vllm.config import ParallelConfig -from vllm.logger import init_logger -from vllm.platforms import current_platform -from vllm.utils import get_ip -from vllm.v1.outputs import ModelRunnerOutput -from vllm.worker.worker_base import WorkerWrapperBase - -if TYPE_CHECKING: - from vllm.v1.core.scheduler import SchedulerOutput - -logger = init_logger(__name__) -PG_WAIT_TIMEOUT = 60 - -try: - import ray - from ray.util import placement_group_table - from ray.util.placement_group import PlacementGroup - try: - from ray._private.state import available_resources_per_node - except ImportError: - # Ray 2.9.x doesn't expose `available_resources_per_node` - from ray._private.state import state as _state - available_resources_per_node = _state._available_resources_per_node - - class RayWorkerWrapper(WorkerWrapperBase): - - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - # Since the compiled DAG runs a main execution - # in a different thread that calls cuda.set_device. - # The flag indicates is set_device is called on - # that thread. It will be removed soon. - self.compiled_dag_cuda_device_set = False - - def get_node_ip(self) -> str: - return get_ip() - - def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]: - node_id = ray.get_runtime_context().get_node_id() - gpu_ids = ray.get_gpu_ids() - return node_id, gpu_ids - - def setup_device_if_necessary(self): - # TODO(swang): This is needed right now because Ray CG executes - # on a background thread, so we need to reset torch's current - # device. - # We can remove this API after it is fixed in compiled graph. - import torch - assert self.worker is not None, "Worker is not initialized" - if not self.compiled_dag_cuda_device_set: - torch.cuda.set_device(self.worker.device) - self.compiled_dag_cuda_device_set = True - - def execute_model( - self, - scheduler_output: "SchedulerOutput", - ) -> ModelRunnerOutput: - self.setup_device_if_necessary() - assert self.worker is not None, "Worker is not initialized" - output = self.worker.model_runner.execute_model(scheduler_output) - return output - - ray_import_err = None - -except ImportError as e: - ray = None # type: ignore - ray_import_err = e - RayWorkerWrapper = None # type: ignore - - -def ray_is_available() -> bool: - """Returns True if Ray is available.""" - return ray is not None - - -def assert_ray_available(): - """ - Raise an exception if Ray is not available. 
- """ - if ray is None: - raise ValueError("Failed to import Ray, please install Ray with " - "`pip install ray`.") from ray_import_err - - -def _verify_bundles(placement_group: "PlacementGroup", - parallel_config: ParallelConfig, device_str: str): - """ - Verify a given placement group has bundles located in the right place. - - There are 2 rules. - - Warn if all tensor parallel workers cannot fit in a single node. - - Fail if driver node is not included in a placement group. - - Args: - placement_group: The placement group to verify. - parallel_config: The parallel configuration. - device_str: The required device. - """ - assert ray.is_initialized(), ( - "Ray is not initialized although distributed-executor-backend is ray.") - pg_data = placement_group_table(placement_group) - # bundle_idx -> node_id - bundle_to_node_ids = pg_data["bundles_to_node_id"] - # bundle_idx -> bundle (e.g., {"GPU": 1}) - bundles = pg_data["bundles"] - # node_id -> List of bundle (e.g., {"GPU": 1}) - node_id_to_bundle: Dict[str, List[Dict[str, float]]] = defaultdict(list) - - for bundle_idx, node_id in bundle_to_node_ids.items(): - node_id_to_bundle[node_id].append(bundles[bundle_idx]) - driver_node_id = ray.get_runtime_context().get_node_id() - - if driver_node_id not in node_id_to_bundle: - raise RuntimeError( - f"driver node id {driver_node_id} is not included in a placement " - f"group {placement_group.id}. Node id -> bundles " - f"{node_id_to_bundle}. " - "You don't have enough GPUs available in a current node. Check " - "`ray status` to see if you have available GPUs in a node " - f"{driver_node_id} before starting an vLLM engine.") - - for node_id, bundles in node_id_to_bundle.items(): - if len(bundles) < parallel_config.tensor_parallel_size: - logger.warning( - "tensor_parallel_size=%d " - "is bigger than a reserved number of %ss (%d " - "%ss) in a node %s. Tensor parallel workers can be " - "spread out to 2+ nodes which can degrade the performance " - "unless you have fast interconnect across nodes, like " - "Infiniband. To resolve this issue, make sure you have more " - "than %d GPUs available at each node.", - parallel_config.tensor_parallel_size, device_str, len(bundles), - device_str, node_id, parallel_config.tensor_parallel_size) - - -def _wait_until_pg_ready(current_placement_group: "PlacementGroup"): - """Wait until a placement group is ready. - - It prints the informative log messages if the placement group is - not created within time. - - """ - # Wait until PG is ready - this will block until all - # requested resources are available, and will timeout - # if they cannot be provisioned. - placement_group_specs = current_placement_group.bundle_specs - - s = time.time() - pg_ready_ref = current_placement_group.ready() - wait_interval = 10 - while time.time() - s < PG_WAIT_TIMEOUT: - ready, _ = ray.wait([pg_ready_ref], timeout=wait_interval) - if len(ready) > 0: - break - - # Exponential backoff for warning print. - wait_interval *= 2 - logger.info( - "Waiting for creating a placement group of specs for " - "%d seconds. specs=%s. Check " - "`ray status` to see if you have enough resources.", - int(time.time() - s), placement_group_specs) - - try: - ray.get(pg_ready_ref, timeout=0) - except ray.exceptions.GetTimeoutError: - raise ValueError( - "Cannot provide a placement group of " - f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See " - "`ray status` to make sure the cluster has enough resources." 
- ) from None - - -def initialize_ray_cluster( - parallel_config: ParallelConfig, - ray_address: Optional[str] = None, -): - """Initialize the distributed cluster with Ray. - - it will connect to the Ray cluster and create a placement group - for the workers, which includes the specification of the resources - for each distributed worker. - - Args: - parallel_config: The configurations for parallel execution. - ray_address: The address of the Ray cluster. If None, uses - the default Ray cluster address. - """ - assert_ray_available() - - # Connect to a ray cluster. - if current_platform.is_rocm() or current_platform.is_xpu(): - # Try to connect existing ray instance and create a new one if not found - try: - ray.init("auto") - except ConnectionError: - logger.warning( - "No existing RAY instance detected. " - "A new instance will be launched with current node resources.") - ray.init(address=ray_address, - ignore_reinit_error=True, - num_gpus=parallel_config.world_size) - else: - ray.init(address=ray_address, ignore_reinit_error=True) - - if parallel_config.placement_group: - # Placement group is already set. - return - - device_str = "GPU" if not current_platform.is_tpu() else "TPU" - # Create placement group for worker processes - current_placement_group = ray.util.get_current_placement_group() - if current_placement_group: - # We are in a placement group - bundles = current_placement_group.bundle_specs - # Verify that we can use the placement group. - device_bundles = 0 - for bundle in bundles: - bundle_devices = bundle.get(device_str, 0) - if bundle_devices > 1: - raise ValueError( - "Placement group bundle cannot have more than 1 " - f"{device_str}.") - if bundle_devices: - device_bundles += 1 - if parallel_config.world_size > device_bundles: - raise ValueError( - f"The number of required {device_str}s exceeds the total " - f"number of available {device_str}s in the placement group." - f"Required number of devices: {parallel_config.world_size}. " - f"Total number of devices: {device_bundles}.") - else: - num_devices_in_cluster = ray.cluster_resources().get(device_str, 0) - if parallel_config.world_size > num_devices_in_cluster: - raise ValueError( - f"The number of required {device_str}s exceeds the total " - f"number of available {device_str}s in the placement group.") - # Create a new placement group - placement_group_specs: List[Dict[str, float]] = ([{ - device_str: 1.0 - } for _ in range(parallel_config.world_size)]) - - # vLLM engine is also a worker to execute model with an accelerator, - # so it requires to have the device in a current node. Check if - # the current node has at least one device. - current_ip = get_ip() - current_node_id = ray.get_runtime_context().get_node_id() - current_node_resource = available_resources_per_node()[current_node_id] - if current_node_resource.get(device_str, 0) < 1: - raise ValueError( - f"Current node has no {device_str} available. " - f"{current_node_resource=}. vLLM engine cannot start without " - f"{device_str}. Make sure you have at least 1 {device_str} " - f"available in a node {current_node_id=} {current_ip=}.") - # This way, at least bundle is required to be created in a current - # node. - placement_group_specs[0][f"node:{current_ip}"] = 0.001 - - # By default, Ray packs resources as much as possible. 
- current_placement_group = ray.util.placement_group( - placement_group_specs, strategy="PACK") - _wait_until_pg_ready(current_placement_group) - - assert current_placement_group is not None - _verify_bundles(current_placement_group, parallel_config, device_str) - # Set the placement group in the parallel config - parallel_config.placement_group = current_placement_group diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py deleted file mode 100644 index be058318de58b..0000000000000 --- a/vllm/v1/executor/uniproc_executor.py +++ /dev/null @@ -1,84 +0,0 @@ -import os -from typing import Optional, Tuple - -from vllm.config import VllmConfig -from vllm.logger import init_logger -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.v1.executor.abstract import Executor -from vllm.v1.outputs import ModelRunnerOutput -from vllm.v1.worker.gpu_worker import Worker - -logger = init_logger(__name__) - - -class UniprocExecutor(Executor): - - def __init__(self, vllm_config: VllmConfig) -> None: - self.vllm_config = vllm_config - self.model_config = vllm_config.model_config - self.cache_config = vllm_config.cache_config - self.lora_config = vllm_config.lora_config - self.load_config = vllm_config.load_config - self.parallel_config = vllm_config.parallel_config - self.scheduler_config = vllm_config.scheduler_config - self.device_config = vllm_config.device_config - self.speculative_config = vllm_config.speculative_config - self.prompt_adapter_config = vllm_config.prompt_adapter_config - self.observability_config = vllm_config.observability_config - - self.worker: Worker = self._create_worker() - self.worker.initialize() - self.worker.load_model() - - def _create_worker( - self, - local_rank: int = 0, - rank: int = 0, - distributed_init_method: Optional[str] = None) -> Worker: - """Return worker init args for a given rank.""" - # see https://github.com/NVIDIA/nccl/issues/1234 - os.environ['NCCL_CUMEM_ENABLE'] = '0' - - if distributed_init_method is None: - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - return Worker( - vllm_config=self.vllm_config, - local_rank=local_rank, - rank=rank, - distributed_init_method=distributed_init_method, - ) - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Determine the number of available KV blocks by invoking the - underlying worker. - """ - return self.worker.determine_num_available_blocks() - - def initialize(self, num_gpu_blocks: int) -> None: - """Initialize the KV cache by invoking the underlying worker. - """ - # NOTE: This is logged in the executor because there can be >1 worker - # with other executors. We could log in the engine level, but work - # remains to abstract away the device for non-GPU configurations. - logger.info("# GPU blocks: %d", num_gpu_blocks) - self.worker.initialize_cache(num_gpu_blocks) - self.worker.compile_or_warm_up_model() - - def execute_model( - self, - scheduler_output, - ) -> ModelRunnerOutput: - output = self.worker.execute_model(scheduler_output) - return output - - def profile(self, is_start: bool = True): - self.worker.profile(is_start) - - def shutdown(self): - pass - - def check_health(self) -> None: - # UniprocExecutor will always be healthy as long as - # it's running. 
- return diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py new file mode 100644 index 0000000000000..6d5cc32ffc5b8 --- /dev/null +++ b/vllm/v1/kv_cache_interface.py @@ -0,0 +1,111 @@ +from dataclasses import dataclass +from typing import Dict, List + +import torch + +from vllm.logger import init_logger +from vllm.utils import cdiv, get_dtype_size + +logger = init_logger(__name__) + + +@dataclass +class KVCacheSpecBase: + """ + A base class for specifying the KV cache format of one layer. + """ + + # number of tokens in a block + block_size: int + + @property + def type_id(self) -> str: + """ + The type identifier of this KV cache. + Returns different strings for layers with different KV cache types + (e.g., different number of tokens like full attention vs sliding window + attention, different KV cache size per token like layers with different + number of heads) + + Returns: + The type identifier of this KV cache. + """ + raise NotImplementedError + + @property + def page_size_bytes(self) -> int: + """ + The size of a page with `block_size` tokens in bytes. + + Returns: + The page size + """ + raise NotImplementedError + + def bytes_for_tokens(self, num_tokens: int) -> int: + """ + The KV cache size for `num_tokens` tokens in bytes. Returns the real + memory size after padding `num_tokens` to full blocks. + + Returns: + The KV cache size + """ + raise NotImplementedError + + +@dataclass +class FullAttentionSpec(KVCacheSpecBase): + num_kv_heads: int + head_size: int + dtype: torch.dtype + + @property + def type_id(self) -> str: + return f"full_attention_{self.block_size}_{self.page_size_bytes}" + + @property + def page_size_bytes(self) -> int: + return 2 * self.block_size * self.num_kv_heads * self.head_size \ + * get_dtype_size(self.dtype) + + def bytes_for_tokens(self, num_tokens: int) -> int: + return cdiv(num_tokens, self.block_size) * self.page_size_bytes + + +KVCacheSpec = Dict[str, KVCacheSpecBase] + + +@dataclass +class KVCacheTensor: + """ + A dataclass for specifying how the workers should initialize the KV cache + for a layer. Only contains the size of KV cache for that layer for now. Will + be extended to support multiple layers sharing the same memory pool. + """ + size: int # The size of KV cache Tensor in bytes + + +@dataclass +class KVCacheConfig: + """ + The KV cache configuration of a model. + """ + """The number of KV cache blocks""" + num_blocks: int + """layer_name -> how to initialize KV cache for that layer""" + tensors: Dict[str, KVCacheTensor] + """ + A list of kv-cache groups. Each group includes a set of layers with + the same kv-cache spec, and the total page_size of layers inside a group + is the same across all groups (as the KVCacheManager only supports + allocating pages of the same size). For example: + 1. A model that only uses full attention: one group with all layers in + the model. + 2. (not implemented yet) A model with the same number of full attention + layers and sliding window attention layers: two groups, one for full + attention layers and one for sliding window attention layers. + 3. (not implemented yet) A model with 2 full attention layers and 4 sliding + window attention layers: three groups, (full * 2), (sw * 2), (sw * 2).
+ """ + groups: List[List[str]] + """the KVCacheSpec of the model""" + kv_cache_spec: KVCacheSpec diff --git a/vllm/v1/metrics/__init__.py b/vllm/v1/metrics/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py new file mode 100644 index 0000000000000..8feeef17542e6 --- /dev/null +++ b/vllm/v1/metrics/loggers.py @@ -0,0 +1,38 @@ +import time +from abc import ABC, abstractmethod + +from vllm.logger import init_logger +from vllm.v1.metrics.stats import SchedulerStats + +logger = init_logger(__name__) + +_LOCAL_LOGGING_INTERVAL_SEC = 5.0 + + +class StatLoggerBase(ABC): + + @abstractmethod + def log(self, scheduler_stats: SchedulerStats): + ... + + +class LoggingStatLogger(StatLoggerBase): + + def __init__(self): + self.last_log_time = time.monotonic() + + def log(self, scheduler_stats: SchedulerStats): + """Log Stats to standard output.""" + + # Log every _LOCAL_LOGGING_INTERVAL_SEC. + now = time.monotonic() + if now - self.last_log_time < _LOCAL_LOGGING_INTERVAL_SEC: + return + self.last_log_time = now + + # Format and print output. + logger.info( + "Running: %d reqs, Waiting: %d reqs ", + scheduler_stats.num_running_reqs, + scheduler_stats.num_waiting_reqs, + ) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py new file mode 100644 index 0000000000000..60cb986f8bbce --- /dev/null +++ b/vllm/v1/metrics/stats.py @@ -0,0 +1,39 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from vllm.v1.engine import EngineCoreOutput + + +@dataclass +class SchedulerStats: + """Stats associated with the scheduler.""" + + num_running_reqs: int = 0 + num_waiting_reqs: int = 0 + + # gpu_cache_usage: float = 0.0 + # gpu_prefix_cache_hit_rate: float = 0.0 + + +class IterationStats: + """Stats associated with a single set of EngineCoreOutputs.""" + + def __init__(self, log_stats: bool): + self.log_stats = log_stats + self.num_generation_tokens = 0 + self.num_prompt_tokens = 0 + + def update_from_output(self, output: "EngineCoreOutput", + is_prefilling: bool, prompt_len: int): + if not self.log_stats: + return + + self.num_generation_tokens += len(output.new_token_ids) + if is_prefilling: + # This relies on the invariant that EngineCore does + # not stream outputs for partially completed prefills + # (scheduler.update_from_output makes EngineCoreOutput + # iff num_computed_tokens == num_tokens). 
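
The logger above rate-limits itself with a monotonic-clock check instead of a timer thread: calls inside the interval return early, and only the first call after the interval emits. A minimal self-contained sketch of the same pattern (the class name and print target are illustrative, not vLLM's API):

    import time

    _INTERVAL_SEC = 5.0  # mirrors _LOCAL_LOGGING_INTERVAL_SEC above

    class ThrottledLogger:
        """Emit at most one log line per interval; extra calls are no-ops."""

        def __init__(self) -> None:
            self.last_log_time = time.monotonic()

        def log(self, num_running: int, num_waiting: int) -> None:
            now = time.monotonic()
            if now - self.last_log_time < _INTERVAL_SEC:
                return  # still inside the quiet window
            self.last_log_time = now
            print(f"Running: {num_running} reqs, Waiting: {num_waiting} reqs")
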
+ assert (len(output.new_token_ids) > 0) + self.num_prompt_tokens += prompt_len diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index b0a7affbebb7e..8dfcf2dd78606 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -1,13 +1,20 @@ import multiprocessing import os import weakref +from collections import defaultdict from collections.abc import Sequence -from typing import (Any, Callable, Dict, Generic, List, Optional, TypeVar, - Union, overload) +from typing import (TYPE_CHECKING, Any, Callable, Dict, Generic, List, + Optional, TypeVar, Union, overload) + +import torch from vllm.logger import init_logger +from vllm.model_executor.models.utils import extract_layer_index from vllm.utils import get_mp_context, kill_process_tree +if TYPE_CHECKING: + from vllm.attention.layer import Attention + logger = init_logger(__name__) T = TypeVar("T") @@ -134,3 +141,48 @@ def shutdown(proc: multiprocessing.Process, input_path: str, output_path: str): socket_file = ipc_socket.replace("ipc://", "") if os and os.path.exists(socket_file): os.remove(socket_file) + + +def bind_kv_cache( + kv_caches: Dict[str, torch.Tensor], + forward_context: Dict[str, "Attention"], + runner_kv_caches: List[torch.Tensor], +) -> None: + """ + Bind the allocated KV cache to both ModelRunner and forward context so + that the KV cache can be used in the forward pass. + + This function: + 1) Fills the ModelRunner's kv cache list (`runner_kv_caches`) with + kv_caches. + 2) Associates each attention layer in the `forward_context` with its + corresponding KV cache in kv_caches. + + Args: + kv_caches: The allocated kv_caches with layer names as keys. + forward_context: The global forward context containing all Attention + layers with layer names as keys. + runner_kv_caches: The kv_cache declared by ModelRunner. + """ + # Bind kv_caches to ModelRunner + assert len(runner_kv_caches) == 0 + + # Convert kv_caches dict to a list of tensors in the order of layer_index. + index2name = defaultdict(list) + for layer_name in kv_caches: + index2name[extract_layer_index(layer_name)].append(layer_name) + + for layer_index in sorted(index2name.keys()): + layer_names = index2name[layer_index] + if len(layer_names) > 1: + # One typical case is encoder-decoder model, e.g., bart. + # The cross attention and self attention in the same decoder layer + # has different layer_name but the same layer_index. + raise NotImplementedError + layer_name = layer_names[0] + runner_kv_caches.append(kv_caches[layer_name]) + + # Bind kv_caches to forward context + for layer_name, kv_cache in kv_caches.items(): + # NOTE: Use list because of v0 PP virtual engine. 
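
bind_kv_cache has to turn the name-keyed dict back into the runner's positional list, which it does by sorting on the layer index parsed out of each name. A rough standalone sketch of that ordering step, with a simplified stand-in for extract_layer_index (the real helper is more defensive about name formats):

    from collections import defaultdict
    from typing import Dict, List

    def layer_index(layer_name: str) -> int:
        # Simplified stand-in: assumes names like "model.layers.<i>.attn".
        return int(layer_name.split(".")[2])

    def order_kv_caches(kv_caches: Dict[str, object]) -> List[object]:
        index2name = defaultdict(list)
        for name in kv_caches:
            index2name[layer_index(name)].append(name)
        ordered = []
        for idx in sorted(index2name):
            # One layer per index; multiple names fail loudly, as above.
            (name,) = index2name[idx]
            ordered.append(kv_caches[name])
        return ordered

    caches = {"model.layers.1.attn": "kv1", "model.layers.0.attn": "kv0"}
    assert order_kv_caches(caches) == ["kv0", "kv1"]
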
+        forward_context[layer_name].kv_cache = [kv_cache]
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 40494e64b22f0..28d8e39053874 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -30,6 +30,9 @@ class CachedRequestState:
     num_computed_tokens: int
     output_token_ids: List[int]
 
+    mrope_positions: Optional[torch.Tensor] = None
+    mrope_position_delta: Optional[int] = None
+
     @property
     def num_tokens(self) -> int:
         return len(self.prompt_token_ids) + len(self.output_token_ids)
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index a1d4f9b135789..2350074c23a59 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -7,11 +7,14 @@
 import torch
 import torch.distributed
 import torch.nn as nn
 
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
 from vllm.config import CompilationLevel, VllmConfig
 from vllm.distributed.parallel_state import graph_capture
 from vllm.forward_context import set_forward_context
 from vllm.inputs import INPUT_REGISTRY
 from vllm.logger import init_logger
+from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.model_loader import get_model
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.sampling_params import SamplingType
@@ -19,9 +22,13 @@
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
                         LayerBlockType, cdiv, is_pin_memory_available)
 from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend,
                                                    FlashAttentionMetadata)
+from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
 from vllm.v1.engine.mm_input_mapper import MMInputMapperClient
+from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
+                                        KVCacheSpec)
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
+from vllm.v1.utils import bind_kv_cache
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 
 if TYPE_CHECKING:
@@ -87,8 +94,12 @@ class GPUModelRunner:
         self.mm_input_mapper_profiling = MMInputMapperClient(self.model_config)
         self.mm_input_mapper_profiling.use_cache = False
 
-        self.max_num_encoder_input_tokens = self.scheduler_config.max_num_encoder_input_tokens  # noqa: E501
-        self.encoder_cache_size = self.scheduler_config.encoder_cache_size
+        encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
+            model_config=model_config,
+            scheduler_config=scheduler_config,
+        )
+        self.max_num_encoder_input_tokens = encoder_compute_budget
+        self.encoder_cache_size = encoder_cache_size
 
         # Lazy initialization
         # self.model: nn.Module  # Set after load_model
@@ -129,6 +140,32 @@ class GPUModelRunner:
         self.positions = torch.zeros(self.max_num_tokens,
                                      dtype=torch.int64,
                                      device=self.device)
+
+        # Only relevant for models using M-RoPE (e.g., Qwen2-VL)
+        if self.model_config.uses_mrope:
+            # NOTE: `mrope_positions` is implemented as a permuted tensor to
+            # satisfy the following properties to allow `torch.compile` to work
+            # properly:
+            # - shape: (3, <variable>)
+            # - stride: (1, 3)
+            # See detailed explanation in https://github.com/vllm-project/vllm/pull/12128#discussion_r1921022256
+
+            # NOTE: When M-RoPE is enabled, position ids are 3D regardless of
+            # the modality of inputs. For text-only inputs, each dimension has
+            # identical position IDs, making M-RoPE functionally equivalent to
+            # 1D-RoPE.
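
The permute above is what produces the (3, N) shape with (1, 3) strides that the comment calls out; a quick check, assuming only that torch is available:

    import torch

    buf = torch.zeros((8, 3), dtype=torch.int64)  # allocated as above, N=8
    view = buf.permute((1, 0))
    print(view.shape)     # torch.Size([3, 8])
    print(view.stride())  # (1, 3)
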
+ # See page 5 of https://arxiv.org/abs/2409.12191 + self.mrope_positions = torch.zeros((self.max_num_tokens, 3), + dtype=torch.int64, + device=self.device) + self.mrope_positions_cpu = torch.zeros((self.max_num_tokens, 3), + dtype=torch.int64, + device="cpu", + pin_memory=self.pin_memory) + + self.mrope_positions = self.mrope_positions.permute((1, 0)) + self.mrope_positions_cpu = self.mrope_positions_cpu.permute((1, 0)) + self.inputs_embeds = torch.zeros( (self.max_num_tokens, self.hidden_size), dtype=self.dtype, @@ -236,6 +273,35 @@ class GPUModelRunner: num_computed_tokens=new_req_data.num_computed_tokens, output_token_ids=[], ) + + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.model_config.uses_mrope: + image_grid_thw = [] + video_grid_thw = [] + for mm_input in self.requests[req_id].mm_inputs: + if mm_input.get("image_grid_thw") is not None: + image_grid_thw.extend( + mm_input["image_grid_thw"].tolist()) + if mm_input.get("video_grid_thw") is not None: + video_grid_thw.extend( + mm_input["video_grid_thw"].tolist()) + + hf_config = self.model_config.hf_config + + self.requests[req_id].mrope_positions, \ + self.requests[req_id].mrope_position_delta = \ + MRotaryEmbedding.get_input_positions_tensor( + self.requests[req_id].prompt_token_ids, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + image_token_id=hf_config.image_token_id, + video_token_id=hf_config.video_token_id, + vision_start_token_id=hf_config.vision_start_token_id, + vision_end_token_id=hf_config.vision_end_token_id, + spatial_merge_size=hf_config.vision_config. + spatial_merge_size, + ) + req_ids_to_add.append(req_id) # Update the cached states of the resumed requests. @@ -303,6 +369,11 @@ class GPUModelRunner: arange, out=positions_np) + # Calculate M-RoPE positions. + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.model_config.uses_mrope: + self._calc_mrope_positions(scheduler_output) + # Get token indices. # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2] @@ -349,8 +420,16 @@ class GPUModelRunner: # Copy the tensors to the GPU. 
self.input_ids[:total_num_scheduled_tokens].copy_( self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True) - self.positions[:total_num_scheduled_tokens].copy_( - self.positions_cpu[:total_num_scheduled_tokens], non_blocking=True) + if self.model_config.uses_mrope: + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + self.mrope_positions[:, :total_num_scheduled_tokens].copy_( + self.mrope_positions_cpu[:, :total_num_scheduled_tokens], + non_blocking=True) + else: + # Common case (1D positions) + self.positions[:total_num_scheduled_tokens].copy_( + self.positions_cpu[:total_num_scheduled_tokens], + non_blocking=True) query_start_loc = self.query_start_loc_cpu[:num_reqs + 1].to( self.device, non_blocking=True) seq_start_loc = self.seq_start_loc_cpu[:num_reqs + 1].to( @@ -462,6 +541,61 @@ class GPUModelRunner: logits_indices = query_start_loc[1:] - 1 return attn_metadata, logits_indices + def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"): + mrope_pos_ptr = 0 + num_reqs = self.input_batch.num_reqs + for index, req_id in enumerate(self.input_batch.req_ids[:num_reqs]): + assert req_id is not None + + req = self.requests[req_id] + assert req.mrope_positions is not None + + num_computed_tokens = \ + self.input_batch.num_computed_tokens_cpu[index] + num_scheduled_tokens = \ + scheduler_output.num_scheduled_tokens[req_id] + num_prompt_tokens = len(req.prompt_token_ids) + + if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens: + prompt_part_len = max(0, + num_prompt_tokens - num_computed_tokens) + completion_part_len = max( + 0, num_scheduled_tokens - prompt_part_len) + else: + prompt_part_len = num_scheduled_tokens + completion_part_len = 0 + + assert num_scheduled_tokens == prompt_part_len + completion_part_len + + if prompt_part_len > 0: + # prompt's mrope_positions are pre-computed + dst_start = mrope_pos_ptr + dst_end = mrope_pos_ptr + prompt_part_len + src_start = num_computed_tokens + src_end = num_computed_tokens + prompt_part_len + + self.mrope_positions_cpu[:, dst_start:dst_end] = \ + req.mrope_positions[:,src_start:src_end] + + mrope_pos_ptr += prompt_part_len + + if completion_part_len > 0: + # compute completion's mrope_positions on-the-fly + dst_start = mrope_pos_ptr + dst_end = mrope_pos_ptr + completion_part_len + + self.mrope_positions_cpu[:, dst_start:dst_end] = \ + MRotaryEmbedding.get_next_input_positions_tensor( + req.mrope_position_delta, + context_len=num_computed_tokens + + prompt_part_len, + seq_len=num_computed_tokens + + prompt_part_len + + completion_part_len, + ) + + mrope_pos_ptr += completion_part_len + def _prepare_sampling( self, scheduler_output: "SchedulerOutput", @@ -555,6 +689,9 @@ class GPUModelRunner: encoder_outputs.append(encoder_output[start_idx:end_idx]) return encoder_outputs + def get_model(self) -> nn.Module: + return self.model + @torch.inference_mode() def execute_model( self, @@ -608,9 +745,12 @@ class GPUModelRunner: # Run the decoder. # Use persistent buffers for CUDA graphs. 
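
The prompt/completion split inside _calc_mrope_positions is plain integer bookkeeping over three counters. A small worked sketch of the same arithmetic (the function name is illustrative):

    def split_scheduled_tokens(num_computed: int, num_scheduled: int,
                               num_prompt: int) -> tuple:
        """Return (prompt_part_len, completion_part_len) for one step."""
        if num_computed + num_scheduled > num_prompt:
            prompt_part = max(0, num_prompt - num_computed)
            completion_part = max(0, num_scheduled - prompt_part)
        else:
            prompt_part = num_scheduled
            completion_part = 0
        return prompt_part, completion_part

    # A 10-token prompt with 8 tokens already computed and 5 scheduled this
    # step: 2 prompt tokens remain, then 3 decode tokens follow.
    assert split_scheduled_tokens(8, 5, 10) == (2, 3)
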
with set_forward_context(attn_metadata, self.vllm_config): + positions = self.mrope_positions[:, :num_input_tokens] \ + if self.model_config.uses_mrope \ + else self.positions[:num_input_tokens] hidden_states = self.model( input_ids=input_ids, - positions=self.positions[:num_input_tokens], + positions=positions, kv_caches=self.kv_caches, attn_metadata=None, inputs_embeds=inputs_embeds, @@ -697,9 +837,12 @@ class GPUModelRunner: input_ids = self.input_ids[:num_tokens] inputs_embeds = None with set_forward_context(None, self.vllm_config): + positions = self.mrope_positions[:, :num_tokens] \ + if self.model_config.uses_mrope \ + else self.positions[:num_tokens] hidden_states = model( input_ids=input_ids, - positions=self.positions[:num_tokens], + positions=positions, kv_caches=kv_caches, attn_metadata=None, inputs_embeds=inputs_embeds, @@ -720,44 +863,30 @@ class GPUModelRunner: ] # Profile with multimodal encoder & encoder cache. - if self.is_multimodal_model: - - # Create dummy batch of multimodal inputs. - dummy_request_data = self.input_registry.dummy_data_for_profiling( - model_config=self.model_config, - seq_len=self.max_num_tokens, - mm_registry=self.mm_registry, - ) - dummy_mm_data = dummy_request_data.multi_modal_data + # TODO: handle encoder-decoder models once we support them. + if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0 + and self.encoder_cache_size > 0): # NOTE: Currently model is profiled with a single non-text # modality with the max possible input tokens even when # it supports multiple. - max_tokens_by_modality_dict = self.mm_registry.get_max_tokens_per_item_by_modality( # noqa: E501 + max_tokens_by_modality_dict = MULTIMODAL_REGISTRY.get_max_tokens_per_item_by_nonzero_modality( # noqa: E501 self.model_config) - dummy_data_modality, max_tokens_per_mm_item = max( max_tokens_by_modality_dict.items(), key=lambda item: item[1]) # Check how many items of this modality can be supported by - # the encoder cache budget. - encoder_cache_budget = min(self.max_num_encoder_input_tokens, - self.encoder_cache_size) - max_num_mm_items_encoder_budget = encoder_cache_budget // \ - max_tokens_per_mm_item + # the encoder budget. + encoder_budget = min(self.max_num_encoder_input_tokens, + self.encoder_cache_size) - # TODO: Allow users to set encoder_cache_budget in case this - # happens. - assert max_num_mm_items_encoder_budget > 0, ( - f"Encoder cache budget={encoder_cache_budget} is too small to " - f"support the maximum possible size of multimodal embeddings" - f"={max_tokens_per_mm_item}.") + max_num_mm_items_encoder_budget = cdiv(encoder_budget, + max_tokens_per_mm_item) # Check how many items of this modality can be supported by # the decoder budget. - max_mm_items_per_req = max( - self.mm_registry.get_mm_limits_per_prompt( - self.model_config).values()) + max_mm_items_per_req = self.mm_registry.get_mm_limits_per_prompt( + self.model_config)[dummy_data_modality] # NOTE: We do not consider max_num_batched_tokens on purpose # because the multimodal embeddings can be generated in advance @@ -768,6 +897,19 @@ class GPUModelRunner: max_num_mm_items = min(max_num_mm_items_encoder_budget, max_num_mm_items_decoder_budget) + logger.info( + "Encoder cache will be initialized with a budget of %s tokens," + " and profiled with %s %s items of the maximum feature size.", + encoder_budget, max_num_mm_items, dummy_data_modality) + + # Create dummy batch of multimodal inputs. 
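
The item count used for profiling falls out of two independent budgets: what the encoder cache can hold (rounded up with cdiv, so one oversized item still gets profiled) and what the per-request multimodal limit allows. A worked sketch of that arithmetic with illustrative numbers:

    def cdiv(a: int, b: int) -> int:
        return -(a // -b)  # ceiling division, as in vllm.utils.cdiv

    max_num_encoder_input_tokens = 8192
    encoder_cache_size = 8192
    max_tokens_per_mm_item = 3000  # e.g. one maximum-size image
    max_mm_items_per_req = 4       # per-prompt limit for that modality

    encoder_budget = min(max_num_encoder_input_tokens, encoder_cache_size)
    max_items_encoder = cdiv(encoder_budget, max_tokens_per_mm_item)  # 3
    max_num_mm_items = min(max_items_encoder, max_mm_items_per_req)   # 3
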
+ dummy_request_data = self.input_registry.dummy_data_for_profiling( + model_config=self.model_config, + seq_len=self.max_num_tokens, + mm_registry=self.mm_registry, + ) + dummy_mm_data = dummy_request_data.multi_modal_data + # Dummy data definition in V0 may contain multiple multimodal items # (e.g, multiple images) for a single request, therefore here we # always replicate first item by max_num_mm_items times since in V1 @@ -851,12 +993,71 @@ class GPUModelRunner: logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", elapsed_time, cuda_graph_size / (1 << 30)) - def initialize_kv_cache(self, num_blocks: int) -> None: - assert len(self.kv_caches) == 0 - kv_cache_shape = FlashAttentionBackend.get_kv_cache_shape( - num_blocks, self.block_size, self.num_kv_heads, self.head_size) - for _ in range(self.num_attn_layers): - self.kv_caches.append( - torch.zeros(kv_cache_shape, - dtype=self.kv_cache_dtype, - device=self.device)) + def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: + """ + Initialize KV cache based on `kv_cache_config`. + Args: + kv_cache_config: Configuration for the KV cache, including the KV + cache size of each layer + """ + if len(kv_cache_config.groups) > 1: + raise NotImplementedError( + "Hybrid models with more than one KV cache type are not " + "supported yet.") + + kv_caches: Dict[str, torch.Tensor] = {} + + for layer_name, layer_spec in kv_cache_config.kv_cache_spec.items(): + tensor_config = kv_cache_config.tensors[layer_name] + assert tensor_config.size % layer_spec.page_size_bytes == 0 + num_blocks = tensor_config.size // layer_spec.page_size_bytes + if isinstance(layer_spec, FullAttentionSpec): + kv_cache_shape = FlashAttentionBackend.get_kv_cache_shape( + num_blocks, layer_spec.block_size, layer_spec.num_kv_heads, + layer_spec.head_size) + dtype = layer_spec.dtype + kv_caches[layer_name] = torch.zeros(kv_cache_shape, + dtype=dtype, + device=self.device) + else: + raise NotImplementedError + + bind_kv_cache( + kv_caches, + self.vllm_config.compilation_config.static_forward_context, + self.kv_caches) + + def get_kv_cache_spec(self) -> KVCacheSpec: + """ + Generates the KVCacheSpec by parsing the kv cache format from each + Attention module in the static forward context. + Returns: + KVCacheSpec: A dictionary mapping layer names to their KV cache + format. Layers that do not need KV cache are not included. + """ + + forward_ctx = self.vllm_config.compilation_config.static_forward_context + block_size = self.vllm_config.cache_config.block_size + kv_cache_spec: KVCacheSpec = {} + for layer_name, attn_module in forward_ctx.items(): + # TODO: Support other attention modules, e.g., sliding window, + # cross-attention, MLA. + assert isinstance(attn_module, Attention) + if attn_module.attn_type == AttentionType.DECODER: + kv_cache_spec[layer_name] = FullAttentionSpec( + block_size=block_size, + num_kv_heads=attn_module.num_kv_heads, + head_size=attn_module.head_size, + dtype=attn_module.dtype, + ) + elif attn_module.attn_type in (AttentionType.ENCODER, + AttentionType.ENCODER_ONLY): + # encoder-only attention does not need KV cache. 
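
For a FullAttentionSpec, one block's page holds both the K and V planes, so page_size_bytes is 2 * block_size * num_kv_heads * head_size * dtype_size, and initialize_kv_cache simply divides each layer's tensor size by it. A quick check with illustrative numbers:

    def page_size_bytes(block_size: int, num_kv_heads: int, head_size: int,
                        dtype_size: int) -> int:
        # 2x for the K and V planes of one block.
        return 2 * block_size * num_kv_heads * head_size * dtype_size

    # 16-token blocks, 8 KV heads, head size 128, bfloat16 (2 bytes):
    page = page_size_bytes(16, 8, 128, 2)  # 65536 bytes per block
    tensor_size = 2 * 1024**3              # say 2 GiB carved out for a layer
    assert tensor_size % page == 0
    num_blocks = tensor_size // page       # 32768 blocks
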
+                continue
+            elif attn_module.attn_type == AttentionType.ENCODER_DECODER:
+                raise NotImplementedError
+            else:
+                raise ValueError(
+                    f"Unknown attention type: {attn_module.attn_type}")
+
+        return kv_cache_spec
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index af438f7d5820c..0929e64d58f1e 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -1,10 +1,11 @@
 """A GPU worker class."""
 import gc
 import os
-from typing import TYPE_CHECKING, Optional, Tuple
+from typing import TYPE_CHECKING, Optional
 
 import torch
 import torch.distributed
+import torch.nn as nn
 
 import vllm.envs as envs
 from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig
@@ -16,6 +17,7 @@
 from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, get_dtype_size
 from vllm.v1.core.scheduler import SchedulerOutput
+from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 
@@ -33,6 +35,7 @@ class Worker:
         local_rank: int,
         rank: int,
         distributed_init_method: str,
+        is_driver_worker: bool = False,
     ):
 
         # TODO: use WorkerBase.__init__(self, vllm_config=vllm_config)
@@ -75,7 +78,7 @@ class Worker:
         else:
             self.profiler = None
 
-    def initialize(self):
+    def init_device(self):
         if self.device_config.device.type == "cuda":
             # torch.distributed.all_reduce does not free the input tensor until
             # the synchronization point. This causes the memory usage to grow
@@ -111,20 +114,18 @@ class Worker:
         self.model_runner.load_model()
 
     @torch.inference_mode()
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
-        """Profiles the peak memory usage of the model to determine how many
-        KV blocks may be allocated without OOMs.
+    def determine_available_memory(self) -> int:
+        """Profiles the peak memory usage of the model to determine how much
+        memory can be used for KV cache without OOMs.
 
         The engine will first conduct a profiling of the existing memory usage.
-        Then, it calculate the maximum possible number of GPU and CPU blocks
-        that can be allocated with the remaining free memory.
+        Then, it calculates the free memory that can be used for KV cache, in
+        bytes.
 
         .. tip::
            You may limit the usage of GPU memory
            by adjusting the `gpu_memory_utilization` parameter.
        """
-        # Profile the memory usage of the model and get the maximum number of
-        # cache blocks that can be allocated with the remaining free memory.
         torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats()
 
@@ -132,7 +133,6 @@ class Worker:
         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
         self.model_runner.profile_run()
-        torch.cuda.synchronize()
 
         free_gpu_memory, _ = torch.cuda.mem_get_info()
         # NOTE(woosuk): Here we assume that the other processes using the same
@@ -161,33 +161,14 @@
             total_gpu_memory * self.cache_config.gpu_memory_utilization -
             peak_memory)
 
-        # Calculate the number of blocks that can be allocated with the
-        # profiled peak memory.
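
determine_available_memory reduces to one line of arithmetic over the profiled peak: whatever the utilization cap allows, minus what the forward pass needed. With illustrative numbers:

    GiB = 1 << 30
    total_gpu_memory = 80 * GiB   # device capacity
    peak_memory = 60 * GiB        # weights + activations at profile time
    gpu_memory_utilization = 0.9

    available_kv_cache_memory = (
        total_gpu_memory * gpu_memory_utilization - peak_memory)
    print(int(available_kv_cache_memory) // GiB)  # 12 GiB left for KV cache
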
- cache_block_size = _get_cache_block_size(self.cache_config, - self.model_config, - self.parallel_config) - num_gpu_blocks = int(available_kv_cache_memory // cache_block_size) - num_gpu_blocks = max(num_gpu_blocks, 0) - return num_gpu_blocks, 0 + return int(available_kv_cache_memory) - def initialize_cache(self, num_gpu_blocks: int) -> None: - """Allocate GPU and CPU KV cache with the specified number of blocks.""" - if num_gpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine.") + def get_kv_cache_spec(self) -> KVCacheSpec: + return self.model_runner.get_kv_cache_spec() - max_seq_len = self.cache_config.block_size * num_gpu_blocks - max_model_len = self.model_config.max_model_len - if max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). Try increasing " - "`gpu_memory_utilization` or decreasing `max_model_len` when " - "initializing the engine.") - - self.model_runner.initialize_kv_cache(num_gpu_blocks) + def initialize_cache(self, kv_cache_config: KVCacheConfig) -> None: + """Allocate GPU KV cache with the specified kv_cache_config.""" + self.model_runner.initialize_kv_cache(kv_cache_config) def compile_or_warm_up_model(self) -> None: if not self.model_config.enforce_eager: @@ -196,11 +177,14 @@ class Worker: # the model initialization and profiling. set_random_seed(self.model_config.seed) + def get_model(self) -> nn.Module: + return self.model_runner.get_model() + @torch.inference_mode() def execute_model( self, scheduler_output: "SchedulerOutput", - ) -> ModelRunnerOutput: + ) -> Optional[ModelRunnerOutput]: output = self.model_runner.execute_model(scheduler_output) return output if self.rank == 0 else None diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index cc24cfe04d2ba..fa6775cbd6c66 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -305,7 +305,8 @@ class CPUEncoderDecoderModelRunner( intermediate_tensors, } - with set_forward_context(model_input.attn_metadata, self.vllm_config): + with set_forward_context(model_input.attn_metadata, self.vllm_config, + model_input.virtual_engine): hidden_states = model_executable(**execute_model_kwargs) # Compute the logits. 
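
Several call sites in this patch thread a virtual_engine index into set_forward_context so pipeline-parallel runs can keep per-engine state. A minimal sketch of the general pattern, not vLLM's actual implementation:

    from contextlib import contextmanager
    from dataclasses import dataclass
    from typing import Any, Optional

    @dataclass
    class _ForwardContext:
        attn_metadata: Any
        virtual_engine: int = 0

    _current_ctx: Optional[_ForwardContext] = None

    @contextmanager
    def forward_context(attn_metadata: Any, virtual_engine: int = 0):
        """Install a per-step context that layers can read during forward()."""
        global _current_ctx
        prev = _current_ctx
        _current_ctx = _ForwardContext(attn_metadata, virtual_engine)
        try:
            yield
        finally:
            _current_ctx = prev

    with forward_context(attn_metadata=None, virtual_engine=1):
        assert _current_ctx.virtual_engine == 1
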
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index f1531e0fc0675..abbf6450ab7f6 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -2,8 +2,8 @@ import dataclasses import weakref from collections import defaultdict from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Type, TypeVar, - Union) +from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Set, Type, + TypeVar, Union) import torch from torch import nn @@ -12,10 +12,14 @@ from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import VllmConfig from vllm.forward_context import set_forward_context from vllm.logger import init_logger +from vllm.lora.layers import LoRAMapping +from vllm.lora.request import LoRARequest +from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model +from vllm.model_executor.models import supports_lora, supports_multimodal from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, MultiModalKwargs, MultiModalPlaceholderMap) from vllm.sequence import (IntermediateTensors, SequenceData, @@ -49,6 +53,8 @@ class ModelInputForCPU(ModelRunnerInputBase): virtual_engine: Optional[int] = None seq_lens: Optional[List[int]] = None query_lens: Optional[List[int]] = None + lora_mapping: Optional["LoRAMapping"] = None + lora_requests: Optional[Set[LoRARequest]] = None def as_broadcastable_tensor_dict( self) -> Dict[str, Union[int, torch.Tensor]]: @@ -57,6 +63,8 @@ class ModelInputForCPU(ModelRunnerInputBase): "input_positions": self.input_positions, "token_type_ids": self.token_type_ids, "multi_modal_kwargs": self.multi_modal_kwargs, + "lora_requests": self.lora_requests, + "lora_mapping": self.lora_mapping, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) @@ -143,7 +151,11 @@ class ModelInputForCPUBuilder(ModelRunnerInputBuilderBase[ModelInputForCPU]): or runner.cache_config.enable_prefix_caching) self.model_input_cls = self.runner._model_input_cls self.attn_backend = self.runner.attn_backend + self.sliding_window = self.runner.sliding_window + self.block_size = self.runner.block_size + self.device = self.runner.device self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper + self.enable_lora = self.runner.lora_config is not None self.input_data = ModelInputForCPUBuilder.ModelInputData( self.runner.model_config.uses_mrope) self.att_metadata_builder = self.runner.attn_backend.get_builder_cls()( @@ -183,15 +195,28 @@ class ModelInputForCPUBuilder(ModelRunnerInputBuilderBase[ModelInputForCPU]): attn_metadata = self.att_metadata_builder.build( input_data.seq_lens, input_data.query_lens, -1, -1) - return self.model_input_cls( - input_tokens=input_tokens, - input_positions=input_positions, - token_type_ids=token_type_ids, - seq_lens=input_data.seq_lens, - query_lens=input_data.query_lens, - attn_metadata=attn_metadata, - multi_modal_kwargs=multi_modal_kwargs, - ) + is_prompt = (self.seq_group_metadata_list[0].is_prompt + if self.seq_group_metadata_list else None) + # LoRA data. 
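
The LoRA bookkeeping added below follows the GPU path: the index mapping carries one adapter id per token, while the prompt mapping carries one entry per sequence, or one per token when prompt logprobs are requested (see _prepare_lora_input further down). A condensed sketch of that construction:

    from typing import List, Tuple

    def build_lora_mappings(
            seqs: List[dict]) -> Tuple[Tuple[int, ...], Tuple[int, ...]]:
        """Each seq dict carries lora_id, query_len, wants_prompt_logprobs."""
        index_mapping: List[int] = []
        prompt_mapping: List[int] = []
        for seq in seqs:
            index_mapping += [seq["lora_id"]] * seq["query_len"]
            per_seq = seq["query_len"] if seq["wants_prompt_logprobs"] else 1
            prompt_mapping += [seq["lora_id"]] * per_seq
        return tuple(index_mapping), tuple(prompt_mapping)

    seqs = [{"lora_id": 3, "query_len": 2, "wants_prompt_logprobs": False}]
    assert build_lora_mappings(seqs) == ((3, 3), (3,))
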
+ lora_requests = set() + lora_mapping = None + if self.enable_lora: + lora_requests = set(seq.lora_request + for seq in self.seq_group_metadata_list + if seq.lora_request is not None) + + lora_mapping = self._prepare_lora_input( + self.seq_group_metadata_list, is_prompt) + + return self.model_input_cls(input_tokens=input_tokens, + input_positions=input_positions, + token_type_ids=token_type_ids, + seq_lens=input_data.seq_lens, + query_lens=input_data.query_lens, + attn_metadata=attn_metadata, + multi_modal_kwargs=multi_modal_kwargs, + lora_mapping=lora_mapping, + lora_requests=lora_requests) def _build_input_data(self): for seq_group_metadata in self.seq_group_metadata_list: @@ -381,6 +406,24 @@ class ModelInputForCPUBuilder(ModelRunnerInputBuilderBase[ModelInputForCPU]): self.input_data.multi_modal_placeholder_maps[modality].extend( placeholder_map) + def _prepare_lora_input( + self, seq_group_metadata_list: List[SequenceGroupMetadata], + is_prefill: bool) -> LoRAMapping: + index_mapping = [] + prompt_mapping = [] + for seq in seq_group_metadata_list: + lora_id = seq.lora_int_id + query_len = seq.token_chunk_size + + index_mapping += [lora_id] * query_len + prompt_mapping += [lora_id] * ( + query_len if seq.sampling_params + and seq.sampling_params.prompt_logprobs is not None else 1) + + return LoRAMapping(index_mapping=tuple(index_mapping), + prompt_mapping=tuple(prompt_mapping), + is_prefill=is_prefill) + class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]): """ @@ -431,10 +474,44 @@ class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]): # Lazy initialization. self.model: nn.Module # Set after init_Model + # Set after load_model. + self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None def load_model(self) -> None: self.model = get_model(vllm_config=self.vllm_config) + if self.lora_config: + assert supports_lora( + self.model + ), f"{self.model.__class__.__name__} does not support LoRA yet." + + if supports_multimodal(self.model): + logger.warning("Regarding multimodal models, vLLM currently " + "only supports adding LoRA to language model.") + + # It's necessary to distinguish between the max_position_embeddings + # of VLMs and LLMs. 
+ if hasattr(self.model.config, "max_position_embeddings"): + max_pos_embeddings = self.model.config.max_position_embeddings + else: + max_pos_embeddings = ( + self.model.config.text_config.max_position_embeddings) + + self.lora_manager = LRUCacheWorkerLoRAManager( + self.scheduler_config.max_num_seqs, + self.scheduler_config.max_num_batched_tokens, + self.vocab_size, + self.lora_config, + self.device, + self.model.embedding_modules, + self.model.embedding_padding_modules, + max_position_embeddings=max_pos_embeddings, + ) + self.model = self.lora_manager.create_lora_manager(self.model) + + def get_model(self) -> nn.Module: + return self.model + def _prepare_model_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -459,6 +536,37 @@ class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]): def vocab_size(self) -> int: return self.model_config.get_vocab_size() + def remove_all_loras(self): + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + self.lora_manager.remove_all_adapters() + + def set_active_loras(self, lora_requests: Set[LoRARequest], + lora_mapping: LoRAMapping) -> None: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + self.lora_manager.set_active_adapters(lora_requests, lora_mapping) + + def add_lora(self, lora_request: LoRARequest) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.add_adapter(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.remove_adapter(lora_id) + + def pin_lora(self, lora_id: int) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.pin_adapter(lora_id) + + def list_loras(self) -> Set[int]: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.list_adapters() + class CPUModelRunner(CPUModelRunnerBase[ModelInputForCPUWithSamplingMetadata]): _model_input_cls: Type[ModelInputForCPUWithSamplingMetadata] = ( @@ -515,6 +623,12 @@ class CPUModelRunner(CPUModelRunnerBase[ModelInputForCPUWithSamplingMetadata]): raise ValueError( "CPU worker does not support multi-step execution.") + if self.lora_config: + assert model_input.lora_requests is not None + assert model_input.lora_mapping is not None + self.set_active_loras(model_input.lora_requests, + model_input.lora_mapping) + model_executable = self.model multimodal_kwargs = {} @@ -526,7 +640,8 @@ class CPUModelRunner(CPUModelRunnerBase[ModelInputForCPUWithSamplingMetadata]): execute_model_kwargs.update( {"previous_hidden_states": previous_hidden_states}) - with set_forward_context(model_input.attn_metadata, self.vllm_config): + with set_forward_context(model_input.attn_metadata, self.vllm_config, + model_input.virtual_engine): hidden_states = model_executable( input_ids=model_input.input_tokens, positions=model_input.input_positions, diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py index 17b2fd2564a04..d31ba89e12375 100644 --- a/vllm/worker/cpu_pooling_model_runner.py +++ b/vllm/worker/cpu_pooling_model_runner.py @@ -69,7 +69,8 @@ class CPUPoolingModelRunner( intermediate_tensors, } - with set_forward_context(model_input.attn_metadata, self.vllm_config): + with set_forward_context(model_input.attn_metadata, self.vllm_config, + model_input.virtual_engine): hidden_states = model_executable(**execute_model_kwargs) # Only perform 
pooling in the driver worker. diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index b5dfebfce6f75..3e5fcf11b9e16 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -1,5 +1,5 @@ """A CPU worker class.""" -from typing import Dict, List, Optional, Tuple, Type +from typing import Dict, List, Optional, Set, Tuple, Type import torch import torch.distributed @@ -11,14 +11,14 @@ from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.logger import init_logger +from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed from vllm.sequence import ExecuteModelRequest -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, bind_kv_cache from vllm.worker.cpu_enc_dec_model_runner import CPUEncoderDecoderModelRunner from vllm.worker.cpu_model_runner import CPUModelRunner, CPUModelRunnerBase from vllm.worker.cpu_pooling_model_runner import CPUPoolingModelRunner -from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, - LoraNotSupportedWorkerBase, WorkerBase, +from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, WorkerInput) logger = init_logger(__name__) @@ -111,7 +111,7 @@ class CPUCacheEngine: return dtype_size * total -class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): +class CPUWorker(LocalOrDistributedWorkerBase): """A worker class that executes (a partition of) the model on a CPU socket. Each worker is associated with a single CPU socket. The worker is @@ -266,6 +266,18 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): # Initialize the cache. self._init_cache_engine() + def add_lora(self, lora_request: LoRARequest) -> bool: + return self.model_runner.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + return self.model_runner.remove_lora(lora_id) + + def pin_lora(self, lora_id: int) -> bool: + return self.model_runner.pin_lora(lora_id) + + def list_loras(self) -> Set[int]: + return self.model_runner.list_loras() + def _validate_num_cpu_blocks(self, num_cpu_blocks: int) -> None: """Raise errors if the num_cpu_blocks is invalid. 
""" @@ -293,6 +305,8 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): self.cache_engine[ve].cpu_cache for ve in range(self.parallel_config.pipeline_parallel_size) ] + bind_kv_cache(self.compilation_config.static_forward_context, + self.cpu_cache) self.model_runner.block_size = self.cache_engine[0].block_size assert all( diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 4d5d918087be8..8a161b740042d 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -175,7 +175,8 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): } if self.has_inner_state else {} multi_modal_kwargs = model_input.multi_modal_kwargs or {} - with set_forward_context(model_input.attn_metadata, self.vllm_config): + with set_forward_context(model_input.attn_metadata, self.vllm_config, + model_input.virtual_engine): hidden_or_intermediate_states = model_executable( input_ids=model_input.input_tokens, positions=model_input.input_positions, diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 9d479f412af46..4c8f69e449393 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -21,6 +21,7 @@ from typing import (TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, import habana_frameworks.torch as htorch import habana_frameworks.torch.internal.bridge_config as bc import torch +import torch.nn as nn from vllm_hpu_extension.ops import LoraMask as LoraMask from vllm_hpu_extension.profiler import (HabanaHighLevelProfiler, HabanaMemoryProfiler, format_bytes) @@ -28,6 +29,7 @@ from vllm_hpu_extension.profiler import (HabanaHighLevelProfiler, from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import DeviceConfig, VllmConfig from vllm.distributed.parallel_state import get_world_group +from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest @@ -40,7 +42,8 @@ from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, from vllm.sampling_params import SamplingParams from vllm.sequence import (IntermediateTensors, SequenceData, SequenceGroupMetadata) -from vllm.utils import is_pin_memory_available, make_tensor_with_pad +from vllm.utils import (bind_kv_cache, is_pin_memory_available, + make_tensor_with_pad) from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, _add_attn_metadata_broadcastable_dict, @@ -287,12 +290,14 @@ def modify_decoder_layer(module: torch.nn.Module, suffix="DecoderLayer"): class HpuModelAdapter: - def __init__(self, model, block_size, dtype, enforce_eager): + def __init__(self, model, vllm_config): self.model = model self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', '0').lower() in ['1', 'true'] - self.block_size = block_size - self.dtype = dtype + self.vllm_config = vllm_config + self.block_size = vllm_config.cache_config.block_size + self.dtype = vllm_config.model_config.dtype + enforce_eager = vllm_config.model_config.enforce_eager if not htorch.utils.internal.is_lazy() and not enforce_eager: self.model = torch.compile(self.model, backend='hpu_backend', @@ -351,14 +356,20 @@ class HpuModelAdapter: selected_token_indices = kwargs.pop('selected_token_indices') if 'warmup_mode' in kwargs: kwargs.pop('warmup_mode') + virtual_engine = 0 + if 'virtual_engine' in kwargs: + virtual_engine = kwargs.pop('virtual_engine') input_ids 
= kwargs['input_ids'] kwargs['attn_metadata'] = self._update_metadata( kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), input_ids.device, self.dtype) LoraMask.setLoraMask(kwargs.pop('lora_mask')) - hidden_states = self.model(*args, **kwargs) - hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) - hidden_states = hidden_states.index_select(0, selected_token_indices) + with set_forward_context(kwargs['attn_metadata'], self.vllm_config, + virtual_engine): + hidden_states = self.model(*args, **kwargs) + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + hidden_states = hidden_states.index_select(0, + selected_token_indices) return hidden_states def compute_logits(self, *args, **kwargs): @@ -658,10 +669,7 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]): with HabanaMemoryProfiler() as m_wrap: self.model = _maybe_wrap_in_hpu_graph( - self.model, - self.block_size, - dtype=self.model_config.dtype, - enforce_eager=self.enforce_eager) + self.model, vllm_config=self.vllm_config) msg = f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}" logger.info(msg) @@ -669,6 +677,9 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]): msg = f"Loading model weights took in total {m.get_summary_string()}" logger.info(msg) + def get_model(self) -> nn.Module: + return self.model + def _use_graphs(self, batch_size, seq_len, is_prompt): if self.enforce_eager: return False @@ -1286,6 +1297,9 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]): def profile_run(self) -> None: num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers + bind_kv_cache( + self.vllm_config.compilation_config.static_forward_context, + [kv_caches]) max_seq_len = self.bucketing_global_state.prompt_seq_bucket_cfg[-1] max_batch_size = min(self.max_num_batched_tokens // max_seq_len, self.scheduler_config.max_num_seqs) @@ -1929,6 +1943,7 @@ class HPUModelRunner(HPUModelRunnerBase[ModelInputForHPUWithSamplingMetadata]): "attn_metadata": self.trim_attn_metadata(attn_metadata), "intermediate_tensors": intermediate_tensors, "lora_mask": lora_mask, + "virtual_engine": model_input.virtual_engine, **(model_input.multi_modal_kwargs or {}), } if htorch.utils.internal.is_lazy(): diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index cca7cd50bfc7b..9401241073c7d 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -2,6 +2,7 @@ # Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company ############################################################################### +import contextlib import gc import os from typing import List, Optional, Set, Tuple, Type @@ -18,8 +19,10 @@ from vllm.distributed import (ensure_model_parallel_initialized, from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest +from vllm.utils import bind_kv_cache from vllm.worker.cache_engine import CacheEngine from vllm.worker.hpu_model_runner import HPUModelRunner from vllm.worker.model_runner_base import ModelRunnerBase @@ -123,6 +126,70 @@ class HPUWorker(LocalOrDistributedWorkerBase): def load_model(self): self.model_runner.load_model() + def execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None, + ) -> Optional[List[SamplerOutput]]: + assert execute_model_req is not None + # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # noqa:E501 + # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501 + # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # noqa:E501 + # VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL - will log cpu fallbacks per engine step, always, even if there were none # noqa:E501 + log_graph_compilation_all = os.environ.get( + 'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0' + log_graph_compilation = os.environ.get( + 'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION', + '0') != '0' or log_graph_compilation_all + log_cpu_fallbacks_all = os.environ.get( + 'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0' + log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS', + '0') != '0' or log_cpu_fallbacks_all + if log_graph_compilation or log_cpu_fallbacks: + from habana_frameworks.torch.hpu.metrics import metric_localcontext + seq_group_metadata_list = execute_model_req.seq_group_metadata_list + is_prompt = any([ + seq_group_metadata.is_prompt + for seq_group_metadata in seq_group_metadata_list + ]) + max_context_len = max([ + max([ + len(v.prompt_token_ids) + len(v.output_token_ids) + for v in seq_group_metadata.seq_data.values() + ]) for seq_group_metadata in seq_group_metadata_list + ]) # whoa, that's some spicy stuff right here + max_num_blocks = ( + (max_context_len - 1) // self.cache_config.block_size) + 1 + input_stats = (f'is_prompt: {is_prompt}, ' + f'num_seqs: {len(seq_group_metadata_list)}, ' + f'max_context_len: {max_context_len}, ' + f'max_num_blocks {max_num_blocks}') + gc_ctx = metric_localcontext( + "graph_compilation" + ) if log_graph_compilation else contextlib.nullcontext() + cpu_fallback_ctx = metric_localcontext( + "cpu_fallback" + ) if log_cpu_fallbacks else contextlib.nullcontext() + with gc_ctx as gc_local_metric, \ + cpu_fallback_ctx as cpu_fallback_local_metric: + output = LocalOrDistributedWorkerBase.execute_model( + self, execute_model_req) + if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0 + ) or log_graph_compilation_all: + msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: " + f"{gc_local_metric.stats()}, {input_stats}") + logger.warning(msg) + if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] > + 0) or log_cpu_fallbacks_all: + 
msg = ("VLLM_HPU_STEP_CPU_FALLBACK: " + f"{cpu_fallback_local_metric.stats()}, {input_stats}") + logger.warning(msg) + + return output + + output = LocalOrDistributedWorkerBase.execute_model( + self, execute_model_req) + return output + @torch.inference_mode() def determine_num_available_blocks(self) -> Tuple[int, int]: """Profiles the peak memory usage of the model to determine how many @@ -215,6 +282,8 @@ class HPUWorker(LocalOrDistributedWorkerBase): self.cache_engine[ve].gpu_cache for ve in range(self.parallel_config.pipeline_parallel_size) ] + bind_kv_cache(self.compilation_config.static_forward_context, + self.hpu_cache) def _warm_up_model(self) -> None: # NOTE(kzawora): We should use virtual engine index here diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 1c6d1bbee78ee..cb2ff0c934da3 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -5,6 +5,7 @@ import itertools import time import warnings import weakref +from contextlib import contextmanager from dataclasses import dataclass from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, Tuple, Type, TypeVar, Union) @@ -1028,6 +1029,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): self.has_inner_state = model_config.has_inner_state + self.in_profile_run = False + # When using CUDA graph, the input block tables must be padded to # max_seq_len_to_capture. However, creating the block table in # Python can be expensive. To optimize this, we cache the block table @@ -1173,6 +1176,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, backend=backend) + def get_model(self) -> nn.Module: + return self.model + def save_sharded_state( self, path: str, @@ -1228,110 +1234,123 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): return builder.build() # type: ignore + @contextmanager + def set_in_profile_run(self): + self.in_profile_run = True + try: + yield + finally: + self.in_profile_run = False + @torch.inference_mode() def profile_run(self) -> None: - # Enable top-k sampling to reflect the accurate memory usage. - sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) - max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens - max_num_seqs = self.scheduler_config.max_num_seqs - # This represents the maximum number of different requests - # that will have unique loras, an therefore the max amount of memory - # consumption create dummy lora request copies from the lora request - # passed in, which contains a lora from the lora warmup path. - dummy_lora_requests: List[LoRARequest] = [] - dummy_lora_requests_per_seq: List[LoRARequest] = [] - if self.lora_config: - assert self.lora_manager is not None - with self.lora_manager.dummy_lora_cache(): - for idx in range(self.lora_config.max_loras): - lora_id = idx + 1 - dummy_lora_request = LoRARequest( - lora_name=f"warmup_{lora_id}", - lora_int_id=lora_id, - lora_path="/not/a/real/path", - ) - self.lora_manager.add_dummy_lora(dummy_lora_request, - rank=LORA_WARMUP_RANK) - dummy_lora_requests.append(dummy_lora_request) - dummy_lora_requests_per_seq = [ - dummy_lora_requests[idx % len(dummy_lora_requests)] - for idx in range(max_num_seqs) - ] + with self.set_in_profile_run(): + # Enable top-k sampling to reflect the accurate memory usage. 
+            sampling_params = \
+                SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
+            max_num_batched_tokens = \
+                self.scheduler_config.max_num_batched_tokens
+            max_num_seqs = self.scheduler_config.max_num_seqs
+            # This represents the maximum number of different requests
+            # that will have unique loras, and therefore the max amount of
+            # memory consumption. Create dummy lora request copies from the
+            # lora request passed in, which contains a lora from the lora
+            # warmup path.
+            dummy_lora_requests: List[LoRARequest] = []
+            dummy_lora_requests_per_seq: List[LoRARequest] = []
+            if self.lora_config:
+                assert self.lora_manager is not None
+                with self.lora_manager.dummy_lora_cache():
+                    for idx in range(self.lora_config.max_loras):
+                        lora_id = idx + 1
+                        dummy_lora_request = LoRARequest(
+                            lora_name=f"warmup_{lora_id}",
+                            lora_int_id=lora_id,
+                            lora_path="/not/a/real/path",
+                        )
+                        self.lora_manager.add_dummy_lora(dummy_lora_request,
+                                                         rank=LORA_WARMUP_RANK)
+                        dummy_lora_requests.append(dummy_lora_request)
+                dummy_lora_requests_per_seq = [
+                    dummy_lora_requests[idx % len(dummy_lora_requests)]
+                    for idx in range(max_num_seqs)
+                ]
 
-        # Profile memory usage with max_num_sequences sequences and the total
-        # number of tokens equal to max_num_batched_tokens.
-        seqs: List[SequenceGroupMetadata] = []
-        # Additional GPU memory may be needed for multi-modal encoding, which
-        # needs to be accounted for when calculating the GPU blocks for
-        # vLLM blocker manager.
-        # To exercise the worst scenario for GPU memory consumption,
-        # the number of seqs (batch_size) is chosen to maximize the number
-        # of images processed.
+            # Profile memory usage with max_num_sequences sequences and the
+            # total number of tokens equal to max_num_batched_tokens.
+            seqs: List[SequenceGroupMetadata] = []
+            # Additional GPU memory may be needed for multi-modal encoding,
+            # which needs to be accounted for when calculating the GPU blocks
+            # for the vLLM block manager.
+            # To exercise the worst scenario for GPU memory consumption,
+            # the number of seqs (batch_size) is chosen to maximize the number
+            # of images processed.
 
-        max_mm_tokens = self.mm_registry.get_max_multimodal_tokens(
-            self.model_config)
-        if max_mm_tokens > 0:
-            max_num_seqs_orig = max_num_seqs
-            max_num_seqs = min(max_num_seqs,
-                               max_num_batched_tokens // max_mm_tokens)
-            if max_num_seqs < 1:
-                expr = (f"min({max_num_seqs_orig}, "
-                        f"{max_num_batched_tokens} // {max_mm_tokens})")
-                logger.warning(
-                    "Computed max_num_seqs (%s) to be less than 1. "
-                    "Setting it to the minimum value of 1.", expr)
-                max_num_seqs = 1
+            max_mm_tokens = self.mm_registry.get_max_multimodal_tokens(
+                self.model_config)
+            if max_mm_tokens > 0:
+                max_num_seqs_orig = max_num_seqs
+                max_num_seqs = min(max_num_seqs,
+                                   max_num_batched_tokens // max_mm_tokens)
+                if max_num_seqs < 1:
+                    expr = (f"min({max_num_seqs_orig}, "
+                            f"{max_num_batched_tokens} // {max_mm_tokens})")
+                    logger.warning(
+                        "Computed max_num_seqs (%s) to be less than 1. "
+                        "Setting it to the minimum value of 1.", expr)
+                    max_num_seqs = 1
 
-        batch_size = 0
-        for group_id in range(max_num_seqs):
-            seq_len = (max_num_batched_tokens // max_num_seqs +
-                       (group_id < max_num_batched_tokens % max_num_seqs))
-            batch_size += seq_len
+            batch_size = 0
+            for group_id in range(max_num_seqs):
+                seq_len = (max_num_batched_tokens // max_num_seqs +
+                           (group_id < max_num_batched_tokens % max_num_seqs))
+                batch_size += seq_len
 
-            dummy_data = self.input_registry \
-                .dummy_data_for_profiling(self.model_config,
-                                          seq_len,
-                                          self.mm_registry)
+                dummy_data = self.input_registry \
+                    .dummy_data_for_profiling(self.model_config,
+                                              seq_len,
+                                              self.mm_registry)
 
-            seq = SequenceGroupMetadata(
-                request_id=str(group_id),
-                is_prompt=True,
-                seq_data={group_id: dummy_data.seq_data},
-                sampling_params=sampling_params,
-                block_tables=None,
-                lora_request=dummy_lora_requests_per_seq[group_id]
-                if dummy_lora_requests_per_seq else None,
-                multi_modal_data=dummy_data.multi_modal_data,
-                multi_modal_placeholders=dummy_data.multi_modal_placeholders,
-            )
-            seqs.append(seq)
+                seq = SequenceGroupMetadata(
+                    request_id=str(group_id),
+                    is_prompt=True,
+                    seq_data={group_id: dummy_data.seq_data},
+                    sampling_params=sampling_params,
+                    block_tables=None,
+                    lora_request=dummy_lora_requests_per_seq[group_id]
+                    if dummy_lora_requests_per_seq else None,
+                    multi_modal_data=dummy_data.multi_modal_data,
+                    multi_modal_placeholders=dummy_data.
+                    multi_modal_placeholders,
+                )
+                seqs.append(seq)
 
-        # Run the model with the dummy inputs.
-        num_layers = self.model_config.get_num_layers(self.parallel_config)
-        # use an empty tensor instead of `None`` to force Dynamo to pass
-        # it by reference, rather by specializing on the value ``None``.
-        # the `dtype` argument does not matter, and we use `float32` as
-        # a placeholder (it has wide hardware support).
-        # it is important to create tensors inside the loop, rather than
-        # multiplying the list, to avoid Dynamo from treating them as
-        # tensor aliasing.
+            # Run the model with the dummy inputs.
+            num_layers = self.model_config.get_num_layers(self.parallel_config)
+            # Use an empty tensor instead of `None` to force Dynamo to pass
+            # it by reference, rather than specializing on the value `None`.
+            # The `dtype` argument does not matter, and we use `float32` as
+            # a placeholder (it has wide hardware support).
+            # It is important to create tensors inside the loop, rather than
+            # multiplying the list, to keep Dynamo from treating them as
+            # aliases of the same tensor.
+            kv_caches = [
+                torch.tensor([], dtype=torch.float32, device=self.device)
+                for _ in range(num_layers)
+            ]
+            finished_requests_ids = [seq.request_id for seq in seqs]
+            model_input = self.prepare_model_input(
+                seqs, finished_requests_ids=finished_requests_ids)
+            intermediate_tensors = None
+            if not get_pp_group().is_first_rank:
+                intermediate_tensors = \
+                    self.model.make_empty_intermediate_tensors(
+                        batch_size=batch_size,
+                        dtype=self.model_config.dtype,
+                        device=self.device)

-        self.execute_model(model_input, kv_caches, intermediate_tensors)
-        torch.cuda.synchronize()
-        return
+            self.execute_model(model_input, kv_caches, intermediate_tensors)
+            torch.cuda.synchronize()
+            return

     def remove_all_loras(self):
         if not self.lora_manager:
@@ -1527,7 +1546,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                     self._update_inputs_to_capture_for_enc_dec_model(
                         capture_inputs)

-                with set_forward_context(attn_metadata, self.vllm_config):
+                with set_forward_context(attn_metadata, self.vllm_config,
+                                         virtual_engine):
                     graph_runner.capture(**capture_inputs)
                 self.graph_memory_pool = graph_runner.graph.pool()
                 self.graph_runners[virtual_engine][batch_size] = (
@@ -1695,7 +1715,7 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):

         if not bypass_model_exec:
             with set_forward_context(model_input.attn_metadata,
-                                     self.vllm_config):
+                                     self.vllm_config, virtual_engine):
                 hidden_or_intermediate_states = model_executable(
                     input_ids=model_input.input_tokens,
                     positions=model_input.input_positions,
diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py
index c7abad7e0258d..acfd6d0b03f62 100644
--- a/vllm/worker/model_runner_base.py
+++ b/vllm/worker/model_runner_base.py
@@ -7,6 +7,7 @@ from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List,
                     Optional, Type, TypeVar)

 import torch
+import torch.nn as nn
 from torch import is_tensor

 from vllm.config import VllmConfig
@@ -264,6 +265,10 @@ class ModelRunnerBase(ABC, Generic[T]):
         """
         raise NotImplementedError

+    @abstractmethod
+    def get_model(self) -> nn.Module:
+        raise NotImplementedError
+
     def execute_model(
         self,
         model_input: T,
@@ -297,9 +302,9 @@ class ModelRunnerWrapperBase:

     def __init__(
         self,
-        moderl_runner: ModelRunnerBase,
+        model_runner: ModelRunnerBase,
     ) -> None:
-        self.model_runner: ModelRunnerBase = moderl_runner
+        self.model_runner: ModelRunnerBase = model_runner

     def __getattr__(self, attr):
         return getattr(self.model_runner, attr)
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index acce923498d7e..4aab09c80826b 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -32,7 +32,7 @@ logger = init_logger(__name__)
 MULTI_STEP_ATTENTION_BACKENDS = [
     "FLASH_ATTN", "ROCM_FLASH", "FLASHINFER", "NO_ATTENTION"
 ]
-MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["FLASH_ATTN"]
+MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["FLASH_ATTN", "FLASHINFER"]

 def _get_supported_attention_backends(chunked_prefill_enabled: bool) \
     -> List[str]:
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index ae4eb6ba6eaec..596c26eac28bd 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -8,6 +8,7 @@ from torch import nn
 from transformers_neuronx.config import GenerationConfig

 from vllm.config import VllmConfig
+from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.sampler import SamplerOutput
@@ -112,6 +113,9 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
             raise NotImplementedError(
                 "Supports only Transformer-NeuronX based models.")

+    def get_model(self) -> nn.Module:
+        return self.model
+
     def _prepare_prompt(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
@@ -314,13 +318,15 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
             raise ValueError(
                 "NeuronModelRunner does not support multi-step execution.")

-        hidden_states = self.model(
-            input_ids=model_input.input_tokens,
-            positions=model_input.input_positions,
-            input_block_ids=model_input.input_block_ids,
-            **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {},
-                                         device=self.device),
-        )
+        with set_forward_context(None, self.vllm_config, 0):
+            hidden_states = self.model(
+                input_ids=model_input.input_tokens,
+                positions=model_input.input_positions,
+                input_block_ids=model_input.input_block_ids,
+                **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs
+                                             or {},
+                                             device=self.device),
+            )

         # Compute the logits only if the on-device sampling is turned off as
         # on-device sampling outputs the token ids.
diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py
index 3f6269684ac93..e02c72faace70 100644
--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
@@ -8,6 +8,7 @@ from vllm.config import VllmConfig
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
 from vllm.model_executor import set_random_seed
+from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.sequence import ExecuteModelRequest
 from vllm.worker.neuron_model_runner import NeuronModelRunner
 from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
@@ -25,6 +26,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
         local_rank: int,
         rank: int,
         distributed_init_method: str,
+        is_driver_worker: bool = True,
     ) -> None:
         WorkerBase.__init__(self, vllm_config=vllm_config)
         self.local_rank = local_rank
@@ -37,7 +39,22 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
         self.model_runner: NeuronModelRunner = NeuronModelRunner(
             vllm_config=vllm_config)
-        self.is_driver_worker = True
+        self.is_driver_worker = is_driver_worker
+
+    def execute_model(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None,
+    ) -> Optional[List[SamplerOutput]]:
+        assert execute_model_req is not None
+        assert (not execute_model_req.blocks_to_swap_in
+                and not execute_model_req.blocks_to_swap_out
+                and not execute_model_req.blocks_to_copy), (
+                    "Cache operations are not supported for Neuron backend.")
+        assert execute_model_req.num_lookahead_slots == 0, (
+            "lookahead not supported for Neuron backend.")
+        output = LocalOrDistributedWorkerBase.execute_model(
+            self, execute_model_req)
+        return output

     def init_device(self) -> None:
         self.init_distributed_environment()
@@ -103,13 +120,14 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):

     def init_distributed_environment(self):
         """Neuron uses transformers-neuronx for tensor parallelism.
-
-        vLLM still needs the environment inited when TP/PP > 1
+        It has only one process to control multiple devices.
+        vLLM still needs the environment initialized when TP/PP > 1,
+        so we initialize a distributed environment with one process.
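In plain torch.distributed terms, the single-process setup described here amounts to something like the following sketch; the rendezvous address is a placeholder, and vLLM's init_distributed_environment adds its own bookkeeping on top:

import torch.distributed as dist

if not dist.is_initialized():
    dist.init_process_group(
        backend="gloo",  # CPU-friendly; Neuron TP does not go through torch
        init_method="tcp://127.0.0.1:29500",  # invented endpoint
        world_size=1,
        rank=0,
    )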
""" init_distributed_environment( world_size=1, - rank=self.rank, - local_rank=self.local_rank, + rank=0, + local_rank=0, distributed_init_method=self.distributed_init_method, backend="gloo", ) diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index 6000e5dfe4e30..9d0a759ca2f21 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -8,6 +8,7 @@ from torch import nn from vllm.attention import get_attn_backend from vllm.attention.backends.openvino import OpenVINOAttentionMetadata from vllm.config import VllmConfig +from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput @@ -83,6 +84,9 @@ class OpenVINOModelRunner(ModelRunnerBase): kv_cache_dtype=self.kv_cache_dtype, ov_core=self.ov_core) + def get_model(self) -> nn.Module: + return self.model + def _prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -350,7 +354,8 @@ class OpenVINOModelRunner(ModelRunnerBase): device=self.device), } - hidden_states = model_executable(**execute_model_kwargs) + with set_forward_context(attn_metadata, self.vllm_config, 0): + hidden_states = model_executable(**execute_model_kwargs) # Compute the logits. logits = self.model.compute_logits(hidden_states, sampling_metadata) diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py index 0bf522d5333ed..f5b46cde3969c 100644 --- a/vllm/worker/openvino_worker.py +++ b/vllm/worker/openvino_worker.py @@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional, Tuple import openvino as ov import torch import torch.distributed +import torch.nn as nn import vllm.envs as envs from vllm.attention import get_attn_backend @@ -20,6 +21,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata +from vllm.utils import bind_kv_cache from vllm.worker.openvino_model_runner import OpenVINOModelRunner from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase @@ -210,16 +212,14 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase): def __init__( self, - ov_core: ov.Core, vllm_config: VllmConfig, local_rank: int, rank: int, distributed_init_method: str, - kv_cache_dtype: Optional[ov.Type] = ov.Type.undefined, is_driver_worker: bool = False, ) -> None: - self.ov_core = ov_core WorkerBase.__init__(self, vllm_config) + self.ov_core = ov.Core() self.parallel_config.rank = rank self.local_rank = local_rank self.rank = rank @@ -236,7 +236,7 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase): self.model_runner = OpenVINOModelRunner( self.ov_core, vllm_config=self.vllm_config, - kv_cache_dtype=kv_cache_dtype, + kv_cache_dtype=self.vllm_config.cache_config.cache_dtype, is_driver_worker=is_driver_worker, ) # Uninitialized cache engine. 
Will be initialized by @@ -339,6 +339,8 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase): ov_device, ) self.kv_cache = self.cache_engine.kv_cache + bind_kv_cache(self.compilation_config.static_forward_context, + [self.kv_cache]) self.model_runner.block_size = self.cache_engine.block_size assert self.kv_cache is not None @@ -361,6 +363,9 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase): ) -> None: self.cache_engine.copy(blocks_to_copy) # type: ignore + def get_model(self) -> nn.Module: + return self.model_runner.get_model() + @torch.inference_mode() def execute_model( self, @@ -507,12 +512,18 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase): self.model_runner.block_size = tmp_cache_config.block_size + bind_kv_cache(self.compilation_config.static_forward_context, + profiling_cache_engine.kv_cache) # Run the model with the dummy inputs. self.model_runner.execute_model(seqs, profiling_cache_engine.kv_cache) - # explicitly delete temporary KV cache manager to free KV cache - # when real inputs will be passed to OV + # Explicitly revert bind_kv_cache and delete temporary KV cache + # manager to free KV cache when real inputs will be passed to OV + bind_kv_cache(self.compilation_config.static_forward_context, [[ + torch.tensor([]) + for _ in range(len(profiling_cache_engine.kv_cache)) + ]]) del profiling_cache_engine logger.info( diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py index f79b3773bcbd2..6de227f3cb2b9 100644 --- a/vllm/worker/pooling_model_runner.py +++ b/vllm/worker/pooling_model_runner.py @@ -105,7 +105,8 @@ class PoolingModelRunner( if model_input.token_types is not None: cross_enc_kwargs["token_type_ids"] = model_input.token_types - with set_forward_context(model_input.attn_metadata, self.vllm_config): + with set_forward_context(model_input.attn_metadata, self.vllm_config, + virtual_engine): hidden_or_intermediate_states = model_executable( input_ids=model_input.input_tokens, positions=model_input.input_positions, diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 7bdb7f0e2d6a9..f5c7bc955a673 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -13,6 +13,7 @@ import torch_xla.runtime as xr from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import VllmConfig +from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model @@ -157,6 +158,9 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]): fullgraph=True, dynamic=False) + def get_model(self) -> nn.Module: + return self.model.model + def _dummy_run( self, batch_size: int, @@ -265,8 +269,9 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]): torch._dynamo.mark_dynamic(t, 0) torch._dynamo.mark_dynamic(p, 0) # Dummy run. 
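The bind/rebind sequence in the OpenVINO profiling path above follows a simple pattern. A hedged sketch, assuming only that bind_kv_cache(static_forward_context, caches) records the cache tensors where the attention layers can find them; the helper and cache shapes below are stand-ins, not vLLM's implementation:

import torch

def bind_kv_cache(ctx: dict, kv_caches: list) -> None:
    # Stand-in for vllm.utils.bind_kv_cache: publish the caches so the
    # forward pass can look them up instead of threading them as arguments.
    ctx["kv_caches"] = kv_caches

ctx: dict = {}
profiling_kv_cache = [torch.zeros(1024) for _ in range(4)]  # dummy caches

bind_kv_cache(ctx, [profiling_kv_cache])
# ... run the dummy forward pass for memory profiling here ...

# Rebind empty placeholders so the context no longer holds references,
# then drop the last reference; the profiling memory becomes reclaimable.
bind_kv_cache(ctx, [[torch.tensor([]) for _ in profiling_kv_cache]])
del profiling_kv_cache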
-        self.model(token_ids, position_ids, attn_metadata, input_lens, t, p,
-                   num_samples, kv_caches)
+        with set_forward_context(attn_metadata, self.vllm_config, 0):
+            self.model(token_ids, position_ids, attn_metadata, input_lens, t,
+                       p, num_samples, kv_caches)

     def warmup_model(
         self,
@@ -663,10 +668,13 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]):
                 input_lens = model_input.input_lens[i:i + 1].to(self.device)
                 t = model_input.t[i:i + 1].to(self.device)
                 p = model_input.p[i:i + 1].to(self.device)
-                output_token_ids = self.model(token_ids, position_ids,
-                                              attn_metadata, input_lens, t, p,
-                                              model_input.num_samples,
-                                              kv_caches)
+                with set_forward_context(model_input.attn_metadata,
+                                         self.vllm_config,
+                                         model_input.virtual_engine):
+                    output_token_ids = self.model(token_ids, position_ids,
+                                                  attn_metadata, input_lens, t,
+                                                  p, model_input.num_samples,
+                                                  kv_caches)
                 next_token_ids.append(output_token_ids[0])
                 start_idx = end_idx

@@ -711,10 +719,13 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]):
             input_lens = model_input.input_lens.to(self.device)
             for i in range(num_steps):
                 slot_mapping = attn_metadata.slot_mapping
-                output_token_ids = self.model(token_ids, position_ids,
-                                              attn_metadata, input_lens, t, p,
-                                              model_input.num_samples,
-                                              kv_caches)
+                with set_forward_context(model_input.attn_metadata,
+                                         self.vllm_config,
+                                         model_input.virtual_engine):
+                    output_token_ids = self.model(token_ids, position_ids,
+                                                  attn_metadata, input_lens, t,
+                                                  p, model_input.num_samples,
+                                                  kv_caches)
                 self.cached_step_outputs.append(output_token_ids)

                 if i < num_steps - 1:
diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py
index 8754f7538f251..ea0e700545b16 100644
--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -12,7 +12,7 @@ from vllm.distributed import (ensure_model_parallel_initialized,
 from vllm.logger import init_logger
 from vllm.model_executor import set_random_seed
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, bind_kv_cache, get_dtype_size
 from vllm.worker.tpu_model_runner import ExecutionMode, TPUModelRunner
 from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
                                      LoraNotSupportedWorkerBase, WorkerBase,
@@ -108,6 +108,8 @@ class TPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
                       torch.tensor([], dtype=torch.float32, device=self.device))
                      for _ in range(num_layers)]
+        bind_kv_cache(self.compilation_config.static_forward_context,
+                      [kv_caches])
         self.model_runner._dummy_run(
             batch_size=1,
             seq_len=self.scheduler_config.max_num_batched_tokens,
@@ -170,6 +172,8 @@ class TPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
                                           device="cpu")
                 cpu_v_cache = torch.zeros_like(cpu_k_cache)
                 self.cpu_cache.append((cpu_k_cache, cpu_v_cache))
+        bind_kv_cache(self.compilation_config.static_forward_context,
+                      [self.tpu_cache])
         self._warmup_model()

     def _warmup_model(self) -> None:
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index f51b51d433d3d..29d62ddda3dc0 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -21,7 +21,8 @@ from vllm.platforms import current_platform
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import (ExecuteModelRequest, IntermediateTensors,
                            SequenceGroupMetadata, SequenceGroupMetadataDelta)
-from vllm.utils import GiB_bytes, memory_profiling
+from vllm.utils import (GiB_bytes, MemorySnapshot, bind_kv_cache,
+                        memory_profiling)
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner
 from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner
@@ -55,9 +56,6 @@ class Worker(LocalOrDistributedWorkerBase):
         self.rank = rank
         self.distributed_init_method = distributed_init_method
         self.is_driver_worker = is_driver_worker
-        if is_driver_worker:
-            assert rank % self.parallel_config.tensor_parallel_size == 0, \
-                "Driver worker should be rank 0 of tensor parallel group."
         if self.model_config.trust_remote_code:
             # note: lazy import to avoid importing torch before initializing
             from vllm.utils import init_cached_hf_modules
@@ -140,7 +138,8 @@ class Worker(LocalOrDistributedWorkerBase):
             _check_if_gpu_supports_dtype(self.model_config.dtype)
             gc.collect()
             torch.cuda.empty_cache()
-            self.init_gpu_memory = torch.cuda.mem_get_info()[0]
+            torch.cuda.reset_peak_memory_stats()
+            self.baseline_snapshot = MemorySnapshot()
         else:
             raise RuntimeError(
                 f"Not support device type: {self.device_config.device}")
@@ -195,19 +194,17 @@ class Worker(LocalOrDistributedWorkerBase):

         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
-        with memory_profiling(baseline_memory_in_bytes=total_gpu_memory -
-                              self.init_gpu_memory,
-                              weights_memory_in_bytes=self.model_runner.
-                              model_memory_usage) as result:
+        with memory_profiling(
+                self.baseline_snapshot,
+                weights_memory=self.model_runner.model_memory_usage) as result:
             self.model_runner.profile_run()
-            torch.cuda.synchronize()

         self._assert_memory_footprint_increased_during_profiling()

         memory_for_current_instance = total_gpu_memory * \
             self.cache_config.gpu_memory_utilization
         available_kv_cache_memory = (memory_for_current_instance -
-                                     result.non_kv_cache_memory_in_bytes)
+                                     result.non_kv_cache_memory)

         # Calculate the number of blocks that can be allocated with the
         # profiled peak memory.
@@ -230,11 +227,11 @@ class Worker(LocalOrDistributedWorkerBase):
            f"({self.cache_config.gpu_memory_utilization:.2f})"
            f" = {(memory_for_current_instance / GiB_bytes):.2f}GiB\n"
            "model weights take "
-           f"{(result.weights_memory_in_bytes / GiB_bytes):.2f}GiB;"
+           f"{(result.weights_memory / GiB_bytes):.2f}GiB;"
            " non_torch_memory takes "
-           f"{(result.non_torch_increase_in_bytes / GiB_bytes):.2f}GiB;"
+           f"{(result.non_torch_increase / GiB_bytes):.2f}GiB;"
            " PyTorch activation peak memory takes "
-           f"{(result.torch_peak_increase_in_bytes / GiB_bytes):.2f}GiB;"
+           f"{(result.torch_peak_increase / GiB_bytes):.2f}GiB;"
            " the rest of the memory reserved for KV Cache is "
            f"{(available_kv_cache_memory / GiB_bytes):.2f}GiB.")

@@ -250,11 +247,13 @@ class Worker(LocalOrDistributedWorkerBase):
     def _assert_memory_footprint_increased_during_profiling(self):
         # NOTE(woosuk): Here we assume that the other processes using the same
         # GPU did not change their memory usage during the profiling.
-        free_gpu_memory, _ = torch.cuda.mem_get_info()
-        assert self.init_gpu_memory - free_gpu_memory > 0, (
+        free_gpu_memory, total = torch.cuda.mem_get_info()
+        cuda_memory = total - free_gpu_memory
+        assert self.baseline_snapshot.cuda_memory < cuda_memory, (
             "Error in memory profiling. "
-            f"Initial free memory {self.init_gpu_memory}, current free memory"
-            f" {free_gpu_memory}. This happens when the GPU memory was "
+            f"Initial used memory {self.baseline_snapshot.cuda_memory}, "
+            f"currently used memory {cuda_memory}. "
+            f"This happens when the GPU memory was "
             "not properly cleaned up before initializing the vLLM instance.")

     def initialize_cache(self, num_gpu_blocks: int,
@@ -285,6 +284,8 @@ class Worker(LocalOrDistributedWorkerBase):
             self.cache_engine[ve].gpu_cache
             for ve in range(self.parallel_config.pipeline_parallel_size)
         ]
+        bind_kv_cache(self.compilation_config.static_forward_context,
+                      self.gpu_cache)

     def _warm_up_model(self) -> None:
         if not self.model_config.enforce_eager:
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 249b3ed2dfd37..1104eceef72a3 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -4,7 +4,9 @@ import time
 from abc import ABC, abstractmethod
 from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union

+import cloudpickle
 import torch
+import torch.nn as nn

 from vllm.config import ObservabilityConfig, VllmConfig
 from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group
@@ -13,7 +15,8 @@ from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.sequence import ExecuteModelRequest, IntermediateTensors
 from vllm.utils import (enable_trace_function_call_for_thread,
-                        resolve_obj_by_qualname, update_environment_variables)
+                        resolve_obj_by_qualname, run_method,
+                        update_environment_variables)
 from vllm.worker.model_runner_base import (BroadcastableModelInput,
                                            ModelRunnerBase,
                                            ModelRunnerInputBase)
@@ -43,6 +46,7 @@ class WorkerBase(ABC):
         self.prompt_adapter_config = vllm_config.prompt_adapter_config
         self.observability_config = vllm_config.observability_config
         self.kv_transfer_config = vllm_config.kv_transfer_config
+        self.compilation_config = vllm_config.compilation_config

         from vllm.platforms import current_platform
         self.current_platform = current_platform
@@ -87,6 +91,10 @@ class WorkerBase(ABC):
         if output is None:
             return None

+    @abstractmethod
+    def get_model(self) -> nn.Module:
+        raise NotImplementedError
+
     @abstractmethod
     def execute_model(
         self,
@@ -118,6 +126,61 @@ class WorkerBase(ABC):
         raise NotImplementedError


+class DelegateWorkerBase(WorkerBase):
+    """
+    A class that delegates all methods to another WorkerBase instance. This is
+    useful for creating a WorkerBase that wraps another WorkerBase instance,
+    e.g. speculative decoding.
+ """ + worker: WorkerBase + + def __init__( + self, + *args, + **kwargs, + ) -> None: + vllm_config: VllmConfig = kwargs.get("vllm_config") + cls = resolve_obj_by_qualname(vllm_config.parallel_config.worker_cls) + self.worker = cls(*args, **kwargs) + + def init_device(self) -> None: + self.worker.init_device() + + def determine_num_available_blocks(self) -> Tuple[int, int]: + return self.worker.determine_num_available_blocks() + + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + self.worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + + def get_model(self) -> nn.Module: + return self.worker.get_model() + + def execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> Optional[List[SamplerOutput]]: + return self.worker.execute_model(execute_model_req) + + def get_cache_block_size_bytes(self) -> int: + return self.worker.get_cache_block_size_bytes() + + def add_lora(self, lora_request: LoRARequest) -> bool: + return self.worker.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + return self.worker.remove_lora(lora_id) + + def pin_lora(self, lora_id: int) -> bool: + return self.worker.pin_lora(lora_id) + + def list_loras(self) -> Set[int]: + return self.worker.list_loras() + + def __getattr__(self, attr): + return getattr(self.worker, attr) + + class LoraNotSupportedWorkerBase(WorkerBase): """Partial implementation of WorkerBase that raises exceptions when LoRA methods are invoked. @@ -309,6 +372,9 @@ class LocalOrDistributedWorkerBase(WorkerBase): else: return self._get_worker_input_from_broadcast() + def get_model(self) -> nn.Module: + return self.model_runner.get_model() + def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None, @@ -409,7 +475,8 @@ class LocalOrDistributedWorkerBase(WorkerBase): class WorkerWrapperBase: """ - The whole point of this class is to lazily initialize the worker. + This class represents one process in an executor/engine. It is responsible + for lazily initializing the worker and handling the worker's lifecycle. We first instantiate the WorkerWrapper, which remembers the worker module and class name. Then, when we call `update_environment_variables`, and the real initialization happens in `init_worker`. @@ -418,17 +485,41 @@ class WorkerWrapperBase: def __init__( self, vllm_config: VllmConfig, + rpc_rank: int = 0, ) -> None: + """ + Initialize the worker wrapper with the given vllm_config and rpc_rank. + Note: rpc_rank is the rank of the worker in the executor. In most cases, + it is also the rank of the worker in the distributed group. However, + when multiple executors work together, they can be different. + e.g. in the case of SPMD-style offline inference with TP=2, + users can launch 2 engines/executors, each with only 1 worker. + All workers have rpc_rank=0, but they have different ranks in the TP + group. 
+ """ + self.rpc_rank = rpc_rank self.vllm_config = vllm_config - trust_remote_code = vllm_config.model_config.trust_remote_code self.worker: Optional[WorkerBase] = None - if trust_remote_code: - # note: lazy import to avoid importing torch before initializing - from vllm.utils import init_cached_hf_modules - init_cached_hf_modules() + if vllm_config.model_config is not None: + # it can be None in tests + trust_remote_code = vllm_config.model_config.trust_remote_code + if trust_remote_code: + # note: lazy import to avoid importing torch before initializing + from vllm.utils import init_cached_hf_modules + init_cached_hf_modules() - @staticmethod - def update_environment_variables(envs: Dict[str, str]) -> None: + def adjust_rank(self, rank_mapping: Dict[int, int]) -> None: + """ + Adjust the rpc_rank based on the given mapping. + It is only used during the initialization of the executor, + to adjust the rpc_rank of workers after we create all workers. + """ + if self.rpc_rank in rank_mapping: + self.rpc_rank = rank_mapping[self.rpc_rank] + + def update_environment_variables(self, envs_list: List[Dict[str, + str]]) -> None: + envs = envs_list[self.rpc_rank] key = 'CUDA_VISIBLE_DEVICES' if key in envs and key in os.environ: # overwriting CUDA_VISIBLE_DEVICES is desired behavior @@ -436,35 +527,41 @@ class WorkerWrapperBase: del os.environ[key] update_environment_variables(envs) - def init_worker(self, *args, **kwargs): + def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None: """ Here we inject some common logic before initializing the worker. Arguments are passed to the worker class constructor. """ + kwargs = all_kwargs[self.rpc_rank] enable_trace_function_call_for_thread(self.vllm_config) - # see https://github.com/NVIDIA/nccl/issues/1234 - os.environ['NCCL_CUMEM_ENABLE'] = '0' + from vllm import configure_as_vllm_process + configure_as_vllm_process() from vllm.plugins import load_general_plugins load_general_plugins() - worker_class = resolve_obj_by_qualname( - self.vllm_config.parallel_config.worker_cls) - self.worker = worker_class(*args, **kwargs) + if isinstance(self.vllm_config.parallel_config.worker_cls, str): + worker_class = resolve_obj_by_qualname( + self.vllm_config.parallel_config.worker_cls) + else: + assert isinstance(self.vllm_config.parallel_config.worker_cls, + bytes) + worker_class = cloudpickle.loads( + self.vllm_config.parallel_config.worker_cls) + self.worker = worker_class(**kwargs) assert self.worker is not None - def execute_method(self, method: str, *args, **kwargs): + def execute_method(self, method: Union[str, bytes], *args, **kwargs): try: target = self if self.worker is None else self.worker - executor = getattr(target, method) - return executor(*args, **kwargs) + return run_method(target, method, args, kwargs) except Exception as e: # if the driver worker also execute methods, # exceptions in the rest worker may cause deadlock in rpc like ray # see https://github.com/vllm-project/vllm/issues/3455 # print the error and inform the user to solve the error - msg = (f"Error executing method {method}. " + msg = (f"Error executing method {method!r}. 
" "This might cause deadlock in distributed execution.") logger.exception(msg) raise e diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 9cf25387560da..25a2fea1e8eac 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -12,6 +12,7 @@ import torch.nn as nn from vllm.attention import get_attn_backend from vllm.config import VllmConfig from vllm.distributed import get_pp_group +from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger from vllm.model_executor import SamplingMetadataCache @@ -415,6 +416,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): logger.info("Loading model weights took %.4f GB", self.model_memory_usage / float(2**30)) + def get_model(self) -> nn.Module: + return self.model + @property def vocab_size(self) -> int: return self.model_config.get_vocab_size() @@ -562,15 +566,17 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): if (self.observability_config is not None and self.observability_config.collect_model_forward_time): model_forward_start_time = time.time() - - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - kv_caches=kv_caches, - attn_metadata=model_input.attn_metadata, - intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, - device=self.device)) + with set_forward_context(model_input.attn_metadata, self.vllm_config, + model_input.virtual_engine): + hidden_or_intermediate_states = model_executable( + input_ids=model_input.input_tokens, + positions=model_input.input_positions, + kv_caches=kv_caches, + attn_metadata=model_input.attn_metadata, + intermediate_tensors=intermediate_tensors, + **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs + or {}, + device=self.device)) # Compute the logits in the last pipeline stage. if not get_pp_group().is_last_rank: return hidden_or_intermediate_states