diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index d3f5fc5cd4cee..72c52d5bb5e9b 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -113,7 +113,7 @@ WARNING: The benchmarking script will save json results by itself, so please do ### Visualizing the results -The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results. +The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results. You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. If you do not see the table, please wait till the benchmark finish running. The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file. diff --git a/.buildkite/pyproject.toml b/.buildkite/pyproject.toml index 083bb795caf5a..d5cad1c73c6f8 100644 --- a/.buildkite/pyproject.toml +++ b/.buildkite/pyproject.toml @@ -6,11 +6,6 @@ [tool.ruff] line-length = 88 -exclude = [ - # External file, leaving license intact - "examples/other/fp8/quantizer/quantize.py", - "vllm/vllm_flash_attn/flash_attn_interface.pyi" -] [tool.ruff.lint.per-file-ignores] "vllm/third_party/**" = ["ALL"] diff --git a/.buildkite/scripts/hardware_ci/run-hpu-test.sh b/.buildkite/scripts/hardware_ci/run-hpu-test.sh index 95b6ac37f1857..5efac3ddf469f 100644 --- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh @@ -10,15 +10,17 @@ docker build -t hpu-test-env -f docker/Dockerfile.hpu . 
# Setup cleanup # certain versions of HPU software stack have a bug that can # override the exit code of the script, so we need to use -# separate remove_docker_container and remove_docker_container_and_exit +# separate remove_docker_containers and remove_docker_containers_and_exit # functions, while other platforms only need one remove_docker_container # function. EXITCODE=1 -remove_docker_container() { docker rm -f hpu-test || true; } -remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; } -trap remove_docker_container_and_exit EXIT -remove_docker_container +remove_docker_containers() { docker rm -f hpu-test || true; docker rm -f hpu-test-tp2 || true; } +remove_docker_containers_and_exit() { remove_docker_containers; exit $EXITCODE; } +trap remove_docker_containers_and_exit EXIT +remove_docker_containers # Run the image and launch offline inference docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m +docker run --runtime=habana --name=hpu-test-tp2 --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --tensor-parallel-size 2 + EXITCODE=$? 
diff --git a/.buildkite/scripts/hardware_ci/run-neuron-test.sh b/.buildkite/scripts/hardware_ci/run-neuron-test.sh index c0b9dd8dadba9..3d294ea5f8a75 100644 --- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh +++ b/.buildkite/scripts/hardware_ci/run-neuron-test.sh @@ -53,4 +53,11 @@ docker run --rm -it --device=/dev/neuron0 --network bridge \ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ --name "${container_name}" \ ${image_name} \ - /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys" + /bin/bash -c " + python3 /workspace/vllm/examples/offline_inference/neuron.py; + python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys; + for f in /workspace/vllm/tests/neuron/2_core/*.py; do + echo 'Running test file: '\$f; # f is escaped so it expands inside the container, not on the host + python3 -m pytest \$f -v --capture=tee-sys; + done + " \ No newline at end of file diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 2d375d7e9d871..6102431456210 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -2,102 +2,180 @@ set -xu + +remove_docker_container() { + docker rm -f tpu-test || true; + docker rm -f vllm-tpu || true; +} + +trap remove_docker_container EXIT + +# Remove the container that might not be cleaned up in the previous run. +remove_docker_container + # Build the docker image. docker build -f docker/Dockerfile.tpu -t vllm-tpu . # Set up cleanup. -remove_docker_container() { docker rm -f tpu-test || true; } -trap remove_docker_container EXIT -# Remove the container that might not be cleaned up in the previous run. 
-remove_docker_container +cleanup_docker() { + # Get Docker's root directory + docker_root=$(docker info -f '{{.DockerRootDir}}') + if [ -z "$docker_root" ]; then + echo "Failed to determine Docker root directory." + exit 1 + fi + echo "Docker root directory: $docker_root" + # Check disk usage of the filesystem where Docker's root directory is located + disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') + # Define the threshold + threshold=70 + if [ "$disk_usage" -gt "$threshold" ]; then + echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." + # Remove dangling images (those that are not tagged and not used by any container) + docker image prune -f + # Remove unused volumes / force the system prune for old images as well. + docker volume prune -f && docker system prune --force --filter "until=72h" --all + echo "Docker images and volumes cleanup completed." + else + echo "Disk usage is below $threshold%. No cleanup needed." + fi +} +cleanup_docker # For HF_TOKEN. source /etc/environment -# Run a simple end-to-end example. 
+ docker run --privileged --net host --shm-size=16G -it \ -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ - vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \ - && python3 -m pip install pytest pytest-asyncio tpu-info \ - && python3 -m pip install lm_eval[api]==0.4.4 \ - && export VLLM_XLA_CACHE_PATH= \ - && export VLLM_USE_V1=1 \ - && export VLLM_XLA_CHECK_RECOMPILATION=1 \ - && echo HARDWARE \ - && tpu-info \ - && { \ - echo TEST_0: Running test_perf.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \ - echo TEST_0_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_1: Running test_compilation.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \ - echo TEST_1_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_2: Running test_basic.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \ - echo TEST_2_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \ - python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \ - echo TEST_3_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_4: Running test_quantization_accuracy.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \ - echo TEST_4_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_5: Running examples/offline_inference/tpu.py; \ - python3 /workspace/vllm/examples/offline_inference/tpu.py; \ - echo TEST_5_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_6: Running test_tpu_model_runner.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \ - echo TEST_6_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_7: Running test_sampler.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \ - echo TEST_7_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_8: Running test_topk_topp_sampler.py; \ - python3 -m pytest -s -v 
/workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \ - echo TEST_8_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_9: Running test_multimodal.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \ - echo TEST_9_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_10: Running test_pallas.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \ - echo TEST_10_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_11: Running test_struct_output_generate.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \ - echo TEST_11_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_12: Running test_moe_pallas.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \ - echo TEST_12_EXIT_CODE: \$?; \ - } & \ - # Disable the TPU LoRA tests until the feature is activated - # & { \ - # echo TEST_13: Running test_moe_pallas.py; \ - # python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \ - # echo TEST_13_EXIT_CODE: \$?; \ - # } & \ - wait \ - && echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \ -" + vllm-tpu /bin/bash -c ' +set -e # Exit immediately if a command exits with a non-zero status. +set -u # Treat unset variables as an error. +echo "--- Starting script inside Docker container ---" + +# Create results directory +RESULTS_DIR=$(mktemp -d) +# If mktemp fails, set -e will cause the script to exit. 
+echo "Results will be stored in: $RESULTS_DIR" + +# Install dependencies +echo "--- Installing Python dependencies ---" +python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ + && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ + && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 +echo "--- Python dependencies installed ---" +export VLLM_USE_V1=1 +export VLLM_XLA_CHECK_RECOMPILATION=1 +export VLLM_XLA_CACHE_PATH= +echo "Using VLLM V1" + +echo "--- Hardware Information ---" +tpu-info +echo "--- Starting Tests ---" +set +e +overall_script_exit_code=0 + +# --- Test Definitions --- +# If a test fails, this function will print logs and will not cause the main script to exit. +run_test() { + local test_num=$1 + local test_name=$2 + local test_command=$3 + local log_file="$RESULTS_DIR/test_${test_num}.log" + local actual_exit_code + + echo "--- TEST_$test_num: Running $test_name ---" + + # Execute the test command. + eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2) + actual_exit_code=$? + + echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log + echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log + + if [ "$actual_exit_code" -ne 0 ]; then + echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2 + echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2 + if [ -f "$log_file" ]; then + cat "$log_file" >&2 + else + echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2 + fi + echo "--- End of log for TEST_$test_num ($test_name) ---" >&2 + return "$actual_exit_code" # Return the failure code + else + echo "TEST_$test_num ($test_name) PASSED." 
+ return 0 # Return success + fi +} + +# Helper function to call run_test and update the overall script exit code +run_and_track_test() { + local test_num_arg="$1" + local test_name_arg="$2" + local test_command_arg="$3" + + # Run the test + run_test "$test_num_arg" "$test_name_arg" "$test_command_arg" + local test_specific_exit_code=$? + + # If the test failed, set the overall script exit code to 1 + if [ "$test_specific_exit_code" -ne 0 ]; then + # No need for extra echo here, run_test already logged the failure. + overall_script_exit_code=1 + fi +} + +# --- Actual Test Execution --- +run_and_track_test 0 "test_perf.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py" +run_and_track_test 1 "test_compilation.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py" +run_and_track_test 2 "test_basic.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py" +run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \ + "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine" +run_and_track_test 4 "test_quantization_accuracy.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py" +run_and_track_test 5 "examples/offline_inference/tpu.py" \ + "python3 /workspace/vllm/examples/offline_inference/tpu.py" +run_and_track_test 6 "test_tpu_model_runner.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py" +run_and_track_test 7 "test_sampler.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" +run_and_track_test 8 "test_topk_topp_sampler.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py" +run_and_track_test 9 "test_multimodal.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py" +run_and_track_test 10 "test_pallas.py" \ + "python3 -m pytest -s -v 
/workspace/vllm/tests/v1/tpu/test_pallas.py" +run_and_track_test 11 "test_struct_output_generate.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" +run_and_track_test 12 "test_moe_pallas.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" +run_and_track_test 13 "test_lora.py" \ + "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py" + +# After all tests have been attempted, exit with the overall status. +if [ "$overall_script_exit_code" -ne 0 ]; then + echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---" +else + echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---" +fi +exit "$overall_script_exit_code" +' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct. + +# Capture the exit code of the docker run command +DOCKER_RUN_EXIT_CODE=$? + +# The trap will run for cleanup. +# Exit the main script with the Docker run command's exit code. +if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then + echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE." + exit "$DOCKER_RUN_EXIT_CODE" +else + echo "Docker run command completed successfully." 
+ exit 0 +fi # TODO: This test fails because it uses RANDOM_SEED sampling -# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ +# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 461fb6d30c45e..bff2f69c17ba7 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -33,14 +33,13 @@ steps: - label: Documentation Build # 2min mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/test_docs/docs" + working_dir: "/vllm-workspace/test_docs" fast_check: true no_gpu: True commands: - - pip install -r ../../requirements/docs.txt - - SPHINXOPTS=\"-W\" make html - # Check API reference (if it fails, you may have missing mock imports) - - grep \"sig sig-object py\" build/html/api/vllm/vllm.sampling_params.html + - pip install -r ../requirements/docs.txt + # TODO: add `--strict` once warnings in docstrings are fixed + - mkdocs build - label: Async Engine, Inputs, Utils, Worker Test # 24min mirror_hardwares: [amdexperimental] @@ -59,6 +58,7 @@ steps: - pytest -v -s async_engine # AsyncLLMEngine - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py - pytest -v -s test_inputs.py + - pytest -v -s test_outputs.py - pytest -v -s multimodal - pytest -v -s test_utils.py # Utils - pytest -v -s worker # Worker @@ -125,7 +125,7 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py + - pytest -v -s entrypoints/openai 
--ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ - pytest -v -s entrypoints/test_chat_utils.py - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests @@ -138,6 +138,7 @@ steps: - vllm/core/ - tests/distributed/test_utils - tests/distributed/test_pynccl + - tests/distributed/test_events - tests/spec_decode/e2e/test_integration_dist_tp4 - tests/compile/test_basic_correctness - examples/offline_inference/rlhf.py @@ -156,6 +157,7 @@ steps: - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py # TODO: create a dedicated test section for multi-GPU example tests # when we have multiple distributed example tests @@ -197,8 +199,9 @@ steps: - tests/test_sequence - tests/test_config - tests/test_logger + - tests/test_vllm_port commands: - - pytest -v -s engine test_sequence.py test_config.py test_logger.py + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py # OOM in the CI unless we run this separately - pytest -v -s tokenization @@ -220,6 +223,7 @@ steps: - pytest -v -s v1/test_serial_utils.py - pytest -v -s v1/test_utils.py - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_metrics_reader.py # TODO: accuracy does not match, whether setting # VLLM_USE_FLASHINFER_SAMPLER or not on H100. 
- pytest -v -s v1/e2e @@ -244,7 +248,7 @@ steps: - python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language_embedding.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0 - - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/encoder_decoder.py - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/basic/classify.py @@ -271,17 +275,6 @@ steps: - pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers -- label: LogitsProcessor Test # 5min - mirror_hardwares: [amdexperimental, amdproduction] - source_file_dependencies: - - vllm/model_executor/layers - - vllm/model_executor/guided_decoding - - tests/test_logits_processor - - tests/model_executor/test_guided_processors - commands: - - pytest -v -s test_logits_processor.py - - pytest -v -s model_executor/test_guided_processors.py - - label: Speculative decoding tests # 40min mirror_hardwares: [amdexperimental] source_file_dependencies: @@ -312,6 +305,7 @@ steps: - pytest -v -s compile/test_fusion.py - pytest -v -s compile/test_silu_mul_quant_fusion.py - pytest -v -s compile/test_sequence_parallelism.py + - pytest -v -s compile/test_async_tp.py - label: PyTorch Fullgraph Smoke Test # 9min mirror_hardwares: [amdexperimental, amdproduction] @@ -386,10 +380,23 @@ steps: source_file_dependencies: - vllm/model_executor/model_loader - tests/tensorizer_loader + - 
tests/entrypoints/openai/test_tensorizer_entrypoint.py commands: - apt-get update && apt-get install -y curl libsodium23 - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s tensorizer_loader + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py + +- label: Model Executor Test + mirror_hardwares: [amdexperimental, amdproduction] + soft_fail: true + source_file_dependencies: + - vllm/model_executor + - tests/model_executor + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor - label: Benchmarks # 9min mirror_hardwares: [amdexperimental, amdproduction] @@ -467,10 +474,7 @@ steps: - pytest -v -s models/test_registry.py - pytest -v -s models/test_utils.py - pytest -v -s models/test_vision.py - # V1 Test: https://github.com/vllm-project/vllm/issues/14531 - - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2' - - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4' - - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2' + - pytest -v -s models/test_initialization.py - label: Language Models Test (Standard) mirror_hardwares: [amdexperimental] @@ -484,16 +488,25 @@ steps: - pip freeze | grep -E 'torch' - pytest -v -s models/language -m core_model -- label: Language Models Test (Extended) +- label: Language Models Test (Extended Generation) # 1hr20min mirror_hardwares: [amdexperimental] optional: true source_file_dependencies: - vllm/ - - tests/models/language + - tests/models/language/generation commands: # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. 
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' - - pytest -v -s models/language -m 'not core_model' + - pytest -v -s models/language/generation -m 'not core_model' + +- label: Language Models Test (Extended Pooling) # 36min + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling + commands: + - pytest -v -s models/language/pooling -m 'not core_model' - label: Multi-Modal Models Test (Standard) mirror_hardwares: [amdexperimental] @@ -605,9 +618,11 @@ steps: - vllm/worker/model_runner.py - entrypoints/llm/test_collective_rpc.py - tests/v1/test_async_llm_dp.py + - tests/v1/entrypoints/openai/test_multi_api_servers.py - vllm/v1/engine/ commands: - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index a37bdb0f4d9ef..4452ce22d504e 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -42,3 +42,7 @@ CMakeLists.txt @tlrmchlsmth /tests/v1/structured_output @mgoin @russellb /tests/weight_loading @mgoin @youkaichao /tests/lora @jeejeelee + +# Docs +/docs @hmellor +mkdocs.yaml @hmellor \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/400-bug-report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml index 00b0f024c0da5..f05be2ba8707a 100644 --- a/.github/ISSUE_TEMPLATE/400-bug-report.yml +++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml @@ -81,14 +81,14 @@ body: required: true - type: markdown attributes: - value: > - โš ๏ธ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output: + value: | + โš ๏ธ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. 
If you think anything is wrong with the model's output: - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc). - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect. - Thanks for contributing 🎉! + Thanks for reporting 🙏! - type: checkboxes id: askllm attributes: diff --git a/.github/ISSUE_TEMPLATE/450-ci-failure.yml b/.github/ISSUE_TEMPLATE/450-ci-failure.yml new file mode 100644 index 0000000000000..7af0e0673a2f3 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/450-ci-failure.yml @@ -0,0 +1,69 @@ +name: 🧪 CI failure report +description: Report a failing test. +title: "[CI Failure]: " +labels: ["ci-failure"] + +body: +- type: markdown + attributes: + value: > + #### Include the name of the failing Buildkite step and test file in the title. +- type: input + attributes: + label: Name of failing test + description: | + Paste in the fully-qualified name of the failing test from the logs. + placeholder: | + `path/to/test_file.py::test_name[params]` + validations: + required: true +- type: checkboxes + attributes: + label: Basic information + description: Select all items that apply to the failing test. + options: + - label: Flaky test + - label: Can reproduce locally + - label: Caused by external libraries (e.g. bug in `transformers`) +- type: textarea + attributes: + label: 🧪 Describe the failing test + description: | + Please provide a clear and concise description of the failing test. + placeholder: | + A clear and concise description of the failing test. + + ``` + The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present. 
+ ``` + validations: + required: true +- type: textarea + attributes: + label: 📝 History of failing test + description: | + Since when did the test start to fail? + You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main). + + If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods: + + - Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally. + + - Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally. + + - Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only) + placeholder: | + Approximate timeline and/or problematic PRs + + A link to the Buildkite analytics of the failing test (if available) + validations: + required: true +- type: textarea + attributes: + label: CC List. + description: > + The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test. +- type: markdown + attributes: + value: > + Thanks for reporting 🙏! 
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 7042e81a84daa..65be771b94fb9 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -3,4 +3,4 @@ FILL IN THE PR DESCRIPTION HERE FIX #xxxx (*link existing issues this PR will resolve*) -**BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions) +**BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions) diff --git a/.github/mergify.yml b/.github/mergify.yml index ccfd571625b54..e595060c325a5 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -58,7 +58,7 @@ pull_request_rules: - files~=^benchmarks/structured_schemas/ - files=benchmarks/benchmark_serving_structured_output.py - files=benchmarks/run_structured_output_benchmark.sh - - files=docs/source/features/structured_outputs.md + - files=docs/features/structured_outputs.md - files=examples/offline_inference/structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py @@ -135,9 +135,7 @@ pull_request_rules: - files~=^tests/entrypoints/openai/tool_parsers/ - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py - files~=^vllm/entrypoints/openai/tool_parsers/ - - files=docs/source/features/tool_calling.md - - files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md - - files=docs/source/getting_started/examples/chat_with_tools.md + - files=docs/features/tool_calling.md - files~=^examples/tool_chat_* - files=examples/offline_inference/chat_with_tools.py - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py diff --git a/.github/scripts/cleanup_pr_body.sh b/.github/scripts/cleanup_pr_body.sh index 3246c6f9bc4b7..8d65936fba1d8 100755 --- a/.github/scripts/cleanup_pr_body.sh +++ 
b/.github/scripts/cleanup_pr_body.sh @@ -26,7 +26,7 @@ sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}" # Remove HTML
section that includes text of "PR Checklist (Click to Expand)" python3 - < - - vLLM + + vLLM

@@ -58,7 +58,7 @@ vLLM is fast with: - Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) - Continuous batching of incoming requests - Fast model execution with CUDA/HIP graph -- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8. +- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516),INT4, INT8, and FP8. - Optimized CUDA kernels, including integration with FlashAttention and FlashInfer. - Speculative decoding - Chunked prefill @@ -100,14 +100,14 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more. ## Contributing We welcome and value any contributions and collaborations. -Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved. +Please check out [Contributing to vLLM](https://docs.vllm.ai/en/latest/contributing/index.html) for how to get involved. ## Sponsors vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support! - + Cash Donations: - a16z - Dropbox diff --git a/SECURITY.md b/SECURITY.md index 47196a1f1221e..6053cfb41f35b 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -8,4 +8,6 @@ Please report security issues privately using [the vulnerability submission form --- +Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations. + Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models. 
diff --git a/benchmarks/README.md b/benchmarks/README.md index 4a8ab895e18e9..6f9fbb91cbd91 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -64,6 +64,12 @@ become available. โœ… lmms-lab/LLaVA-OneVision-Data, Aeala/ShareGPT_Vicuna_unfiltered + + Custom + โœ… + โœ… + Local file: data.jsonl + @@ -124,6 +130,38 @@ P99 ITL (ms): 8.39 ================================================== ``` +### Custom Dataset +If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl + +``` +{"prompt": "What is the capital of India?"} +{"prompt": "What is the capital of Iran?"} +{"prompt": "What is the capital of China?"} +``` + +```bash +# start server +VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests +``` + +```bash +# run benchmarking script +python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \ + --backend vllm \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --endpoint /v1/completions \ + --dataset-name custom \ + --dataset-path \ + --custom-skip-chat-template \ + --num-prompts 80 \ + --max-concurrency 1 \ + --temperature=0.3 \ + --top-p=0.75 \ + --result-dir "./log/" +``` + +You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`. 
+ ### VisionArena Benchmark for Vision Language Models ```bash @@ -146,10 +184,9 @@ python3 vllm/benchmarks/benchmark_serving.py \ ``` bash VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ - --speculative-model "[ngram]" \ - --ngram_prompt_lookup_min 2 \ - --ngram-prompt-lookup-max 5 \ - --num_speculative_tokens 5 + --speculative-config $'{"method": "ngram", + "num_speculative_tokens": 5, "prompt_lookup_max": 5, + "prompt_lookup_min": 2}' ``` ``` bash @@ -204,6 +241,16 @@ python3 vllm/benchmarks/benchmark_serving.py \ --seed 42 ``` +**`philschmid/mt-bench`** + +``` bash +python3 vllm/benchmarks/benchmark_serving.py \ + --model Qwen/QwQ-32B \ + --dataset-name hf \ + --dataset-path philschmid/mt-bench \ + --num-prompts 80 +``` + ### Running With Sampling Parameters When using OpenAI-compatible backends such as `vllm`, optional sampling @@ -274,10 +321,9 @@ python3 vllm/benchmarks/benchmark_throughput.py \ --output-len=100 \ --num-prompts=2048 \ --async-engine \ - --speculative-model="[ngram]" \ - --ngram_prompt_lookup_min=2 \ - --ngram-prompt-lookup-max=5 \ - --num_speculative_tokens=5 + --speculative-config $'{"method": "ngram", + "num_speculative_tokens": 5, "prompt_lookup_max": 5, + "prompt_lookup_min": 2}' ``` ``` diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 800d426c6d118..85e6eda7f36fd 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -194,6 +194,11 @@ async def async_request_deepspeed_mii( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, ) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith(("completions", "profile")), ( + "OpenAI Completions API URL must end with 'completions' or 'profile'." + ) + async with aiohttp.ClientSession( trust_env=True, timeout=AIOHTTP_TIMEOUT ) as session: @@ -204,6 +209,8 @@ async def async_request_deepspeed_mii( "temperature": 0.01, # deepspeed-mii does not accept 0.0 temp. 
"top_p": 1.0, } + headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len @@ -215,7 +222,7 @@ async def async_request_deepspeed_mii( st = time.perf_counter() try: async with session.post( - url=request_func_input.api_url, json=payload + url=api_url, json=payload, headers=headers ) as response: if response.status == 200: parsed_resp = await response.json() @@ -317,7 +324,7 @@ async def async_request_openai_completions( most_recent_timestamp = timestamp generated_text += text or "" - elif usage := data.get("usage"): + if usage := data.get("usage"): output.output_tokens = usage.get("completion_tokens") if first_chunk_received: output.success = True @@ -604,6 +611,7 @@ ASYNC_REQUEST_FUNCS = { "tensorrt-llm": async_request_trt_llm, "scalellm": async_request_openai_completions, "sglang": async_request_openai_completions, + "llama.cpp": async_request_openai_completions, } OPENAI_COMPATIBLE_BACKENDS = [ diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index d8f48644cc005..d86bf045ea47e 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -9,9 +9,6 @@ generation. Supported dataset types include: - BurstGPT - HuggingFace - VisionArena - -TODO: Implement CustomDataset to parse a JSON file and convert its contents into -SampleRequest instances, similar to the approach used in ShareGPT. 
""" import base64 @@ -35,6 +32,7 @@ from transformers import PreTrainedTokenizerBase from vllm.lora.request import LoRARequest from vllm.lora.utils import get_adapter_absolute_path from vllm.multimodal import MultiModalDataDict +from vllm.multimodal.image import convert_image_mode from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer logger = logging.getLogger(__name__) @@ -257,7 +255,7 @@ def process_image(image: Any) -> Mapping[str, Any]: if isinstance(image, dict) and "bytes" in image: image = Image.open(BytesIO(image["bytes"])) if isinstance(image, Image.Image): - image = image.convert("RGB") + image = convert_image_mode(image, "RGB") with io.BytesIO() as image_data: image.save(image_data, format="JPEG") image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") @@ -441,6 +439,97 @@ class ShareGPTDataset(BenchmarkDataset): return samples +# ----------------------------------------------------------------------------- +# Custom Dataset Implementation +# ----------------------------------------------------------------------------- + + +class CustomDataset(BenchmarkDataset): + """ + Implements the Custom dataset. Loads data from a JSONL file and generates + sample requests based on conversation turns. E.g., + ``` + {"prompt": "What is the capital of India?"} + {"prompt": "What is the capital of Iran?"} + {"prompt": "What is the capital of China?"} + ``` + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + # self.data will be a list of dictionaries + # e.g., [{"prompt": "What is the capital of India?"}, ...] + # This will be the standardized format which load_data() + # has to convert into depending on the filetype of dataset_path. 
+ # sample() will assume this standardized format of self.data + self.data = [] + + # Load the JSONL file + if self.dataset_path.endswith(".jsonl"): + jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True) + + # check if the JSONL file has a 'prompt' column + if "prompt" not in jsonl_data.columns: + raise ValueError("JSONL file must contain a 'prompt' column.") + + # Convert each row to a dictionary and append to self.data + # This will convert the DataFrame to a list of dictionaries + # where each dictionary corresponds to a row in the DataFrame. + # This is the standardized format we want for self.data + for _, row in jsonl_data.iterrows(): + self.data.append(row.to_dict()) + else: + raise NotImplementedError( + "Only JSONL format is supported for CustomDataset." + ) + + random.seed(self.random_seed) + random.shuffle(self.data) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + lora_path: Optional[str] = None, + max_loras: Optional[int] = None, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + skip_chat_template: bool = False, + **kwargs, + ) -> list: + sampled_requests = [] + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt = item["prompt"] + + # apply template + if not skip_chat_template: + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + ) + ) + self.maybe_oversample_requests(sampled_requests, num_requests) + + return sampled_requests + + # ----------------------------------------------------------------------------- # Sonnet Dataset Implementation # ----------------------------------------------------------------------------- diff --git a/benchmarks/benchmark_latency.py 
b/benchmarks/benchmark_latency.py index d5aaceeb8c9c3..de62bf5c63c76 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -6,13 +6,12 @@ import dataclasses import json import os import time -from pathlib import Path from typing import Any, Optional import numpy as np -import torch from tqdm import tqdm +import vllm.envs as envs from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs @@ -80,17 +79,9 @@ def main(args: argparse.Namespace): def run_to_completion(profile_dir: Optional[str] = None): if profile_dir: - with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - on_trace_ready=torch.profiler.tensorboard_trace_handler( - str(profile_dir) - ), - ) as p: - llm_generate() - print(p.key_averages().table(sort_by="self_cuda_time_total")) + llm.start_profile() + llm_generate() + llm.stop_profile() else: start_time = time.perf_counter() llm_generate() @@ -103,11 +94,7 @@ def main(args: argparse.Namespace): run_to_completion(profile_dir=None) if args.profile: - profile_dir = args.profile_result_dir - if not profile_dir: - profile_dir = ( - Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}" - ) + profile_dir = envs.VLLM_TORCH_PROFILER_DIR print(f"Profiling (results will be saved to '{profile_dir}')...") run_to_completion(profile_dir=profile_dir) return @@ -164,15 +151,6 @@ if __name__ == "__main__": action="store_true", help="profile the generation process of a single batch", ) - parser.add_argument( - "--profile-result-dir", - type=str, - default=None, - help=( - "path to save the pytorch profiler output. Can be visualized " - "with ui.perfetto.dev or Tensorboard." 
- ), - ) parser.add_argument( "--output-json", type=str, @@ -189,5 +167,13 @@ if __name__ == "__main__": ) parser = EngineArgs.add_cli_args(parser) + # V1 enables prefix caching by default which skews the latency + # numbers. We need to disable prefix caching by default. + parser.set_defaults(enable_prefix_caching=False) args = parser.parse_args() + if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: + raise OSError( + "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " + "Please set it to a valid path to use torch profiler." + ) main(args) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index a887e7150dc78..6bd9f1b49c2ec 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -60,6 +60,7 @@ from benchmark_dataset import ( ASRDataset, BurstGPTDataset, ConversationDataset, + CustomDataset, HuggingFaceDataset, InstructCoderDataset, MTBenchDataset, @@ -627,7 +628,16 @@ def main(args: argparse.Namespace): "'--dataset-path' if required." ) - if args.dataset_name == "sonnet": + if args.dataset_name == "custom": + dataset = CustomDataset(dataset_path=args.dataset_path) + input_requests = dataset.sample( + num_requests=args.num_prompts, + tokenizer=tokenizer, + output_len=args.custom_output_len, + skip_chat_template=args.custom_skip_chat_template, + ) + + elif args.dataset_name == "sonnet": dataset = SonnetDataset(dataset_path=args.dataset_path) # For the "sonnet" dataset, formatting depends on the backend. if args.backend == "openai-chat": @@ -762,6 +772,10 @@ def main(args: argparse.Namespace): if "temperature" not in sampling_params: sampling_params["temperature"] = 0.0 # Default to greedy decoding. + if args.backend == "llama.cpp": + # Disable prompt caching in llama.cpp backend + sampling_params["cache_prompt"] = False + # Avoid GC processing "static" data - reduce pause times. 
gc.collect() gc.freeze() @@ -834,6 +848,8 @@ def main(args: argparse.Namespace): ]: if field in result_json: del result_json[field] + if field in benchmark_result: + del benchmark_result[field] # Save to file base_model_id = model_id.split("/")[-1] @@ -846,6 +862,7 @@ def main(args: argparse.Namespace): if args.result_filename: file_name = args.result_filename if args.result_dir: + os.makedirs(args.result_dir, exist_ok=True) file_name = os.path.join(args.result_dir, file_name) with open( file_name, mode="a+" if args.append_result else "w", encoding="utf-8" @@ -886,7 +903,7 @@ if __name__ == "__main__": "--dataset-name", type=str, default="sharegpt", - choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"], + choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"], help="Name of the dataset to benchmark on.", ) parser.add_argument( @@ -1056,6 +1073,19 @@ if __name__ == "__main__": ) # group for dataset specific arguments + custom_group = parser.add_argument_group("custom dataset options") + custom_group.add_argument( + "--custom-output-len", + type=int, + default=256, + help="Number of output tokens per request, used only for custom dataset.", + ) + custom_group.add_argument( + "--custom-skip-chat-template", + action="store_true", + help="Skip applying chat template to prompt, used only for custom dataset.", + ) + sonnet_group = parser.add_argument_group("sonnet dataset options") sonnet_group.add_argument( "--sonnet-input-len", diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index 5088c805f53ef..6a50f47d3951c 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -672,7 +672,7 @@ async def benchmark( def evaluate(ret, args): def _eval_correctness_json(expected, actual): # extract json string from string using regex - import re + import regex as re actual = actual.replace("\n", "").replace(" ", "").strip() try: @@ 
-687,7 +687,7 @@ def evaluate(ret, args): return actual in args.choice def _eval_correctness_regex(expected, actual): - import re + import regex as re return re.match(args.regex, actual) is not None diff --git a/benchmarks/kernels/bench_fp8_gemm.py b/benchmarks/kernels/bench_fp8_gemm.py new file mode 100644 index 0000000000000..36d03e40ef9a1 --- /dev/null +++ b/benchmarks/kernels/bench_fp8_gemm.py @@ -0,0 +1,222 @@ +# SPDX-License-Identifier: Apache-2.0 +import argparse +import copy +import itertools + +import torch +import triton +from weight_shapes import WEIGHT_SHAPES + +from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm +from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], + x_log=False, + line_arg="provider", + line_vals=[ + "torch-bf16", + # "fp8-tensor-w-token-a", + "fp8-tensor-w-tensor-a", + "fp8-channel-w-token-a", + # "fp8-channel-w-tensor-a", + # "fp8-tensor-w-token-a-noquant", + "fp8-tensor-w-tensor-a-noquant", + "fp8-channel-w-token-a-noquant", + # "fp8-channel-w-tensor-a-noquant", + ], + line_names=[ + "torch-bf16", + # "fp8-tensor-w-token-a", + "fp8-tensor-w-tensor-a", + "fp8-channel-w-token-a", + # "fp8-channel-w-tensor-a", + # "fp8-tensor-w-token-a-noquant", + "fp8-tensor-w-tensor-a-noquant", + "fp8-channel-w-token-a-noquant", + # "fp8-channel-w-tensor-a-noquant", + ], + ylabel="TFLOP/s (larger is better)", + plot_name="BF16 vs FP8 GEMMs", + args={}, + ) +) +def benchmark(batch_size, provider, N, K): + M = batch_size + device = "cuda" + dtype = torch.bfloat16 + + # Create input tensors + a = torch.randn((M, K), device=device, dtype=dtype) + b = torch.randn((N, K), device=device, dtype=dtype) + + quantiles = [0.5, 0.2, 0.8] + + if "torch-bf16" in provider: + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: torch.nn.functional.linear(a, b), 
quantiles=quantiles + ) + + elif "fp8" in provider: + # Weights are always quantized ahead of time + if "noquant" in provider: + # For no quantization, we just measure the GEMM + if "tensor-w-token-a" in provider: + # Dynamic per-token quant for A, per-tensor quant for B + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b) + assert scale_b_fp8.numel() == 1 + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( + a, use_per_token_if_dynamic=True + ) + + def run_quant(): + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "tensor-w-tensor-a" in provider: + # Static per-tensor quantization with fixed scales + # for both A and B + scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) + scale_b = torch.tensor([1.0], device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + assert scale_b_fp8.numel() == 1 + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) + + def run_quant(): + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "channel-w-token-a" in provider: + # Static per-channel quantization for weights, per-token + # quant for A + scale_b = torch.tensor((N,), device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + scale_b_fp8 = scale_b_fp8.expand(N).contiguous() + assert scale_b_fp8.numel() == N + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( + a, use_per_token_if_dynamic=True + ) + + def run_quant(): + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "channel-w-tensor-a" in provider: + # Static per-channel quantization for weights, per-tensor + # quant for A + scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) + scale_b = torch.tensor((N,), device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + scale_b_fp8 = scale_b_fp8.expand(N).contiguous() + assert scale_b_fp8.numel() == N + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) + + def run_quant(): 
+ return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + else: + # In these cases, we quantize the activations during the GEMM call + if "tensor-w-token-a" in provider: + # Dynamic per-token quant for A, per-tensor quant for B + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b) + assert scale_b_fp8.numel() == 1 + + def run_quant(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( + a, use_per_token_if_dynamic=True + ) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "tensor-w-tensor-a" in provider: + # Static per-tensor quantization with fixed scales + # for both A and B + scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) + scale_b = torch.tensor([1.0], device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + assert scale_b_fp8.numel() == 1 + + def run_quant(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "channel-w-token-a" in provider: + # Static per-channel quantization for weights, per-token + # quant for A + scale_b = torch.tensor((N,), device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + scale_b_fp8 = scale_b_fp8.expand(N).contiguous() + assert scale_b_fp8.numel() == N + + def run_quant(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( + a, use_per_token_if_dynamic=True + ) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "channel-w-tensor-a" in provider: + # Static per-channel quantization for weights, per-tensor + # quant for A + scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) + scale_b = torch.tensor((N,), device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + scale_b_fp8 = scale_b_fp8.expand(N).contiguous() + assert scale_b_fp8.numel() == N + + def run_quant(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) + return 
vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + b_fp8 = b_fp8.t() + + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: run_quant(), quantiles=quantiles + ) + + # Calculate TFLOP/s, two flops per multiply-add + tflops = lambda ms: (2 * M * N * K) * 1e-12 / (ms * 1e-3) + return tflops(ms), tflops(max_ms), tflops(min_ms) + + +def prepare_shapes(args): + KN_model_names = [] + models_tps = list(itertools.product(args.models, args.tp_sizes)) + for model, tp_size in models_tps: + assert model in WEIGHT_SHAPES + for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_split_dim] = KN[tp_split_dim] // tp_size + KN.append(model) + KN_model_names.append(KN) + return KN_model_names + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--models", + nargs="+", + type=str, + default=["meta-llama/Llama-3.1-8B-Instruct"], + choices=[*WEIGHT_SHAPES.keys()], + help="List of models to benchmark", + ) + parser.add_argument( + "--tp-sizes", + nargs="+", + type=int, + default=[1], + help="List of tensor parallel sizes", + ) + args = parser.parse_args() + + KN_model_names = prepare_shapes(args) + for K, N, model_name in KN_model_names: + print(f"{model_name}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:") + benchmark.run( + print_data=True, + show_plots=True, + save_path=f"bench_fp8_res_n{N}_k{K}", + N=N, + K=K, + ) + + print("Benchmark finished!") diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 17432159c94e7..54f05e7232265 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -84,7 +84,10 @@ def main( if version == "v2": if current_platform.is_rocm(): global PARTITION_SIZE - PARTITION_SIZE = 1024 if not args.custom_paged_attn else PARTITION_SIZE_ROCM + if not args.custom_paged_attn and not current_platform.is_navi(): + PARTITION_SIZE = 1024 + else: + PARTITION_SIZE = 
PARTITION_SIZE_ROCM num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE tmp_output = torch.empty( size=(num_seqs, num_query_heads, num_partitions, head_size), @@ -159,6 +162,7 @@ def main( scale, block_tables, seq_lens, + None, block_size, max_seq_len, alibi_slopes, diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 110d36db157fd..944024ca35725 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -22,7 +22,7 @@ def benchmark_rope_kernels_multi_lora( seed: int, device: str, max_position: int = 8192, - base: int = 10000, + base: float = 10000, ) -> None: current_platform.seed_everything(seed) torch.set_default_device(device) diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py index ab364a84d6cb2..0c86e40729579 100644 --- a/benchmarks/kernels/graph_machete_bench.py +++ b/benchmarks/kernels/graph_machete_bench.py @@ -2,11 +2,11 @@ import math import pickle -import re from collections import defaultdict import matplotlib.pyplot as plt import pandas as pd +import regex as re import seaborn as sns from torch.utils.benchmark import Measurement as TMeasurement diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py index 89b05d5882a38..afe159ddda6e8 100644 --- a/benchmarks/kernels/weight_shapes.py +++ b/benchmarks/kernels/weight_shapes.py @@ -48,4 +48,50 @@ WEIGHT_SHAPES = { ([16384, 106496], 1), ([53248, 16384], 0), ], + "meta-llama/Llama-3.1-8B-Instruct": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-3.3-70B-Instruct": [ + ([8192, 10240], 1), + ([8192, 8192], 0), + ([8192, 57344], 1), + ([28672, 8192], 0), + ], + "mistralai/Mistral-Large-Instruct-2407": [ + ([12288, 14336], 1), + ([12288, 12288], 0), + ([12288, 57344], 1), + ([28672, 12288], 0), + ], + "Qwen/Qwen2.5-7B-Instruct": [ + ([3584, 4608], 1), + ([3584, 3584], 0), + 
([3584, 37888], 1), + ([18944, 3584], 0), + ], + "Qwen/Qwen2.5-32B-Instruct": [ + ([5120, 7168], 1), + ([5120, 5120], 0), + ([5120, 55296], 1), + ([27648, 5120], 0), + ], + "Qwen/Qwen2.5-72B-Instruct": [ + ([8192, 10240], 1), + ([8192, 8192], 0), + ([8192, 59136], 1), + ([29568, 8192], 0), + ], + "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [ + ([2048, 3072], 1), + ([2048, 4096], 1), + ([2048, 2048], 0), + ([2048, 576], 0), + ([2048, 21888], 1), + ([10944, 2048], 0), + ([2048, 2816], 1), + ([1408, 2048], 0), + ], } diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml index f825cb203269c..65b1e09a247e2 100644 --- a/benchmarks/pyproject.toml +++ b/benchmarks/pyproject.toml @@ -6,11 +6,6 @@ [tool.ruff] line-length = 88 -exclude = [ - # External file, leaving license intact - "examples/other/fp8/quantizer/quantize.py", - "vllm/vllm_flash_attn/flash_attn_interface.pyi" -] [tool.ruff.lint.per-file-ignores] "vllm/third_party/**" = ["ALL"] diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index b04e4c2d06edc..a4edd5b96fe29 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -46,22 +46,38 @@ else() endif() +# Ensure the vllm/vllm_flash_attn directory exists before installation +install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" ALL_COMPONENTS) + +# Make sure vllm-flash-attn install rules are nested under vllm/ +# This is here to support installing all components under the same prefix with cmake --install. +# setup.py installs every component separately but uses the same prefix for all. +# ALL_COMPONENTS is used to avoid duplication for FA2 and FA3, +# and these statements don't hurt when installing neither component. 
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS) +install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS) +install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" ALL_COMPONENTS) + # Fetch the vllm-flash-attn library FetchContent_MakeAvailable(vllm-flash-attn) message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}") +# Restore the install prefix +install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS) +install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) + # Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in # case only one is built, in the case both are built redundant work is done) install( DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ - DESTINATION vllm_flash_attn + DESTINATION vllm/vllm_flash_attn COMPONENT _vllm_fa2_C FILES_MATCHING PATTERN "*.py" ) install( DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ - DESTINATION vllm_flash_attn + DESTINATION vllm/vllm_flash_attn COMPONENT _vllm_fa3_C FILES_MATCHING PATTERN "*.py" ) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 12e4e39024f5d..6d90555f29678 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -76,7 +76,7 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) add_custom_target( hipify${NAME} - COMMAND ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} + COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS} BYPRODUCTS ${HIP_SRCS} COMMENT "Running hipify on ${NAME} extension source files.") diff --git a/csrc/attention/merge_attn_states.cu b/csrc/attention/merge_attn_states.cu index 14e5edd7e283d..6bee9e4ce1166 100644 --- a/csrc/attention/merge_attn_states.cu +++ 
b/csrc/attention/merge_attn_states.cu @@ -143,6 +143,14 @@ void merge_attn_states_launcher(torch::Tensor& output, const uint pack_size = 16 / sizeof(scalar_t); TORCH_CHECK(head_size % pack_size == 0, "headsize must be multiple of pack_size:", pack_size); + TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1, + "output heads must be contiguous in memory"); + TORCH_CHECK( + prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1, + "prefix_output heads must be contiguous in memory"); + TORCH_CHECK( + suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1, + "suffix_output heads must be contiguous in memory"); float* output_lse_ptr = nullptr; if (output_lse.has_value()) { output_lse_ptr = output_lse.value().data_ptr(); diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index cf67847b45ba0..9a613ba588ddf 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -19,6 +19,7 @@ namespace vec_op { #define VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(...) \ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) 
\ diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu index 98daf1a1b8e6c..f62d08c17c6d8 100644 --- a/csrc/mamba/causal_conv1d/causal_conv1d.cu +++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu @@ -13,6 +13,10 @@ #include #include +#ifdef USE_ROCM + namespace cub = hipcub; +#endif + #include "static_switch.h" @@ -501,15 +505,9 @@ void causal_conv1d_fwd_launch(ConvParamsBase ¶ms, cudaStream_t stream) { auto kernel = &causal_conv1d_fwd_kernel; if (kSmemSize >= 48 * 1024) { - #ifndef USE_ROCM - C10_CUDA_CHECK(cudaFuncSetAttribute( - kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); - #else - // There is a slight signature discrepancy in HIP and CUDA "FuncSetAttribute" function. C10_CUDA_CHECK(cudaFuncSetAttribute( (void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); std::cerr << "Warning (causal_conv1d fwd launch): attempting to set maxDynamicSharedMemorySize on an AMD GPU which is currently a non-op (in ROCm versions <= 6.1). This might lead to undefined behavior. 
\n" << std::endl; - #endif } kernel<<>>(params); diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu index bd0a34119c82b..0c9df925bdbf6 100644 --- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu +++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu @@ -321,7 +321,7 @@ void selective_scan_fwd_launch(SSMParamsBase ¶ms, cudaStream_t stream) { auto kernel = &selective_scan_fwd_kernel; if (kSmemSize >= 48 * 1024) { C10_CUDA_CHECK(cudaFuncSetAttribute( - kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); + (void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); } kernel<<>>(params); C10_CUDA_KERNEL_LAUNCH_CHECK(); diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h index 0bae119a7c460..8fda434d452f9 100644 --- a/csrc/moe/moe_ops.h +++ b/csrc/moe/moe_ops.h @@ -28,4 +28,6 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, torch::Tensor num_tokens_post_pad, int64_t top_k, int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N, int64_t BLOCK_SIZE_K, int64_t bit); -#endif \ No newline at end of file +#endif + +bool moe_permute_unpermute_supported(); \ No newline at end of file diff --git a/csrc/moe/moe_permute_unpermute_op.cu b/csrc/moe/moe_permute_unpermute_op.cu index 76d5f0eab0218..9a7465261abfe 100644 --- a/csrc/moe/moe_permute_unpermute_op.cu +++ b/csrc/moe/moe_permute_unpermute_op.cu @@ -5,6 +5,9 @@ #include "permute_unpermute_kernels/dispatch.h" #include "core/registration.h" +// moe_permute kernels require at least CUDA 12.0 +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000) + void moe_permute( const torch::Tensor& input, // [n_token, hidden] const torch::Tensor& topk_weights, //[n_token, topk] @@ -127,7 +130,45 @@ void moe_unpermute( }); } +#else + +void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights, + torch::Tensor& topk_ids, + const torch::Tensor& token_expert_indicies, + const std::optional& expert_map, + int64_t n_expert, int64_t n_local_expert, 
int64_t topk, + const std::optional& align_block_size, + torch::Tensor& permuted_input, + torch::Tensor& expert_first_token_offset, + torch::Tensor& src_row_id2dst_row_id_map, + torch::Tensor& m_indices) { + TORCH_CHECK(false, "moe_permute is not supported on CUDA < 12.0"); +} + +void moe_unpermute(const torch::Tensor& input, + const torch::Tensor& topk_weights, torch::Tensor& topk_ids, + const torch::Tensor& token_expert_indicies, + const std::optional& expert_map, + int64_t n_expert, int64_t n_local_expert, int64_t topk, + const std::optional& align_block_size, + torch::Tensor& permuted_input, + torch::Tensor& expert_first_token_offset, + torch::Tensor& src_row_id2dst_row_id_map, + torch::Tensor& m_indices) { + TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0"); +} + +#endif + +bool moe_permute_unpermute_supported() { +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000) + return true; +#else + return false; +#endif +} + TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { m.impl("moe_permute", &moe_permute); m.impl("moe_unpermute", &moe_unpermute); -} \ No newline at end of file +} diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu index aa353d0f0437f..de2c153882d93 100644 --- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu +++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu @@ -1,6 +1,9 @@ #include "moe_permute_unpermute_kernel.h" +// moe_permute kernels require at least CUDA 12.0 +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000) + // CubKeyValueSorter definition begin CubKeyValueSorter::CubKeyValueSorter() : num_experts_(0), num_bits_(sizeof(int) * 8) {} @@ -131,9 +134,6 @@ __global__ void preprocessTopkIdKernel(int* topk_id_ptr, int size, int num_experts) { auto tidx = threadIdx.x; auto bidx = blockIdx.x; - auto lidx = tidx & 31; - auto widx = tidx >> 5; - auto warp_count = (blockDim.x + 31) 
>> 5; auto offset = bidx * blockDim.x; auto bound = min(offset + blockDim.x, size); extern __shared__ int smem_expert_map[]; @@ -226,4 +226,6 @@ void getMIndices(int64_t* expert_first_token_offset, expert_first_token_offset, align_expert_first_token_offset, m_indices, num_local_expert, align_block_size); } -} \ No newline at end of file +} + +#endif diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index 810026d034c07..7d35ec79ead48 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -10,7 +10,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { // Calculate the result of moe by summing up the partial results // from all selected experts. - m.def("moe_sum(Tensor! input, Tensor output) -> ()"); + m.def("moe_sum(Tensor input, Tensor! output) -> ()"); m.impl("moe_sum", torch::kCUDA, &moe_sum); // Aligning the number of tokens to be processed by each expert such @@ -77,7 +77,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { "Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor " "expert_first_token_offset, int n_expert, int n_local_expert,int " "topk, Tensor! 
hidden_states)->()"); - // conditionally compiled so impl registration is in source file + + m.def("moe_permute_unpermute_supported() -> bool"); + m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported); #endif } diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index 3c258ddce61e6..e9b408fbf2ee0 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -123,7 +123,7 @@ bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability) { } bool cutlass_group_gemm_supported(int64_t cuda_device_capability) { - // CUTLASS groped FP8 kernels need at least CUDA 12.3 + // CUTLASS grouped FP8 kernels need at least CUDA 12.3 // and SM90 (Hopper) #if defined CUDA_VERSION diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index 8cc5a0f4f2186..f1e7da1641998 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -30,6 +30,14 @@ #define __HIP__GFX9__ #endif +#if defined(__HIPCC__) && (defined(__gfx1100__) || defined(__gfx1101__)) + #define __HIP__GFX11__ +#endif + +#if defined(__HIPCC__) && (defined(__gfx1200__) || defined(__gfx1201__)) + #define __HIP__GFX12__ +#endif + #if defined(NDEBUG) #undef NDEBUG #include @@ -43,7 +51,7 @@ #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) -#if defined(__HIP__GFX9__) // TODO: Add NAVI support +#if defined(__HIP__GFX9__) #define GCN_MFMA_INSTR1 __builtin_amdgcn_mfma_f32_16x16x4f32 #define GCN_MFMA_INSTR __builtin_amdgcn_mfma_f32_4x4x4f16 @@ -1482,7 +1490,1506 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( } } -#else // !defined(__HIP__GFX9__) TODO: Add NAVI support +#elif defined(__HIP__GFX11__) + +using floatx8 = __attribute__((__vector_size__(8 * sizeof(float)))) float; + +using bit16_t = uint16_t; +using bit16x4 = __attribute__((__vector_size__(4 * sizeof(uint16_t)))) uint16_t; +typedef bit16x4 _B16x4; + +using bit16x8 = __attribute__((__vector_size__(8 * sizeof(uint16_t)))) uint16_t; +union b16x8_u { + bit16x8 u16x8; + _B16x4 xy[2]; +}; +typedef b16x8_u _B16x8; + +using bit16x16 = + __attribute__((__vector_size__(16 * sizeof(uint16_t)))) uint16_t; +union b16x16_u { + bit16x16 u16x16; + _B16x8 xy[2]; +}; +typedef b16x16_u _B16x16; + +using _B8x8 = uint2; +using bit8_t = uint8_t; + +typedef struct _B8x16 { + _B8x8 xy[2]; +} _B8x16; + +template +__device__ __forceinline__ floatx8 gcn_wmma16x16x16_instr(const bit16x16& inpA, + const bit16x16& inpB, + const floatx8& inpC) { + if constexpr (std::is_same::value) { + return __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(inpA, inpB, inpC); + } else if constexpr (std::is_same::value) { + return __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(inpA, inpB, inpC); + } else { + static_assert(false, "unsupported 16b dtype"); + } +} + +template +__device__ __forceinline__ float to_float(const T& inp) { + if constexpr (std::is_same::value) { + return (float)inp; + } else if constexpr (std::is_same::value) { + return __bfloat162float(inp); + } else { + static_assert(false, "unsupported 16b dtype"); + } +} + +template +__device__ __forceinline__ T from_float(const float& inp) { + if constexpr (std::is_same::value) { + return (_Float16)inp; + } else if constexpr (std::is_same::value) 
{ + return __float2bfloat16(inp); + } else { + static_assert(false, "unsupported 16b dtype"); + } +} + +template +__device__ __forceinline__ _B16x8 from_floatx8(const floatx8& inp) { + if constexpr (std::is_same::value) { + union h2cvt { + __half2 h2[4]; + _B16x8 b16x8; + } u; + u.h2[0] = __float22half2_rn(make_float2(inp[0], inp[1])); + u.h2[1] = __float22half2_rn(make_float2(inp[2], inp[3])); + u.h2[2] = __float22half2_rn(make_float2(inp[4], inp[5])); + u.h2[3] = __float22half2_rn(make_float2(inp[6], inp[7])); + return u.b16x8; + } else if constexpr (std::is_same::value) { + union b2cvt { + __hip_bfloat162 b2[4]; + _B16x8 b16x8; + } u; + + u.b2[0] = __float22bfloat162_rn(make_float2(inp[0], inp[1])); + u.b2[1] = __float22bfloat162_rn(make_float2(inp[2], inp[3])); + u.b2[2] = __float22bfloat162_rn(make_float2(inp[4], inp[5])); + u.b2[3] = __float22bfloat162_rn(make_float2(inp[6], inp[7])); + + return u.b16x8; + } else { + static_assert(false, "unsupported 16b dtype"); + } +} + +// clang-format off +template +__global__ +__launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, + // head_size/x, block_size, x] + const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, + // head_size, block_size] + const int num_kv_heads, const float scale, + const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] + const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] + const int max_num_blocks_per_seq, + const float* __restrict__ alibi_slopes, // [num_heads] + const int q_stride, const int kv_block_stride, const int kv_head_stride, + float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] + float* __restrict__ max_logits, // [num_seqs, num_heads, + // max_num_partitions] + scalar_t* __restrict__ out, // [num_seqs, 
num_heads, max_num_partitions, + // head_size] + OUTT* __restrict__ final_out, // [num_seqs, num_heads, head_size] + int max_ctx_blocks, const float* k_scale, const float* v_scale) { + // clang-format on + constexpr int NWARPS = NUM_THREADS / WARP_SIZE; // 8 warps on gfx11 + const int warpid = threadIdx.x / WARP_SIZE; + const int laneid = threadIdx.x % WARP_SIZE; + const int lane2id = laneid % 2; + const int lane4id = laneid % 4; + const int lane16id = laneid % 16; + const int rowid = laneid / 16; + + const int seq_idx = blockIdx.x; + // NOTE queries with sequence len > 1 are prefills and taken care by another + // kernel. + if (query_start_loc_ptr != nullptr && + (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx]) != 1) { + return; + } + + const int partition_idx = blockIdx.y; + + constexpr int T_PAR_SIZE = 256; // token partition size set to 256 + + const int max_num_partitions = gridDim.y; + + const int context_len = context_lens[seq_idx]; // length of a seq + + const int partition_start_token_idx = partition_idx * T_PAR_SIZE; + // exit if partition is out of context for seq + if (partition_start_token_idx >= context_len) { + return; + } + + constexpr int GQA_RATIO2 = DIVIDE_ROUND_UP(GQA_RATIO, 2); + + __shared__ float shared_qk_max[NWARPS][16 + 1]; + __shared__ float shared_exp_sum[NWARPS][16 + 1]; + // shared_logits is used for multiple purposes + __shared__ _B16x16 shared_logits[NWARPS][2][16][2]; + + // for QK wmma16x16, layout is QHead/Tokenx16 across every 16 lanes, + // 32 Bytes HeadElements in each lane, 2x16B HeadElements across a row of warp + constexpr int ROWS_PER_WARP = + WARP_SIZE / 16 / 2; // rows refers to 16 lanes; refer dpp terminology + constexpr int CONTIGUOUS_KV_ELEMS_16B_LOAD = + 16 / sizeof(cache_t); // 8 for 16 bit cache type, 16 for 8 bit types + constexpr int QKHE_PER_FETCH = + CONTIGUOUS_KV_ELEMS_16B_LOAD * + ROWS_PER_WARP; // each fetch across a warp fetches these many elements + constexpr int QKHELOOP = HEAD_SIZE / 
QKHE_PER_FETCH; // 2xQKHE_16B across + // warp + + _B16x16 Qlocal[QKHELOOP / 2]; // note that 16 contiguous elements of Q should + // be fetched per lane for 16 bit cache types + + constexpr int CONTIGUOUS_SCALAR_ELEMS_16B = 16 / sizeof(scalar_t); + + constexpr int TOKENS_PER_WARP = + T_PAR_SIZE / + NWARPS; // sub partition of tokens per warp for qk calculation + constexpr int TLOOP = + TOKENS_PER_WARP / + 16; // each wmma16x16x16 instruction processes 16 tokens + + _B16x16 Klocal[TLOOP] + [QKHELOOP / 2]; // can be interpreted as B8x16 for 8 bit types + + const int wg_start_head_idx = blockIdx.z * GQA_RATIO; + const int wg_start_kv_head_idx = blockIdx.z; + const int total_num_heads = gridDim.z * GQA_RATIO; + + // for QK wmma, tokens in multiples of TOKENS_PER_WARP are spread across warps + // each wmma takes QH16xT16x16HE across warp + // repeat wmma across QKHELOOP dimension + // output layout from QKwmma : QH16xT8x2 16 qheads across 16 lanes, 16 tokens + // across 2 rows x 8 tokens per lane + + const int64_t query_start_off = static_cast( + query_start_loc_ptr ? 
query_start_loc_ptr[seq_idx] : seq_idx); + + if (GQA_RATIO == 1) { + const int local_qhead_idx = lane16id % GQA_RATIO; + const int global_qhead_idx = wg_start_head_idx + local_qhead_idx; + const scalar_t* q_ptr = + q + query_start_off * q_stride + global_qhead_idx * HEAD_SIZE; + if (lane16id < GQA_RATIO) { + #pragma unroll + for (int qkhe_depth = 0; qkhe_depth < QKHELOOP / 2; qkhe_depth++) { + const scalar_t* q_fetch_ptr = q_ptr + qkhe_depth * QKHE_PER_FETCH * 2; + const _B16x16* q_fetch_ptr_32B = + reinterpret_cast(q_fetch_ptr); + Qlocal[qkhe_depth] = *q_fetch_ptr_32B; + } + } + } else { + // fetch Q in shared across warps and then write to registers + const int local_qhead_idx = 2 * warpid + rowid; + const int global_qhead_idx = wg_start_head_idx + local_qhead_idx; + const scalar_t* q_ptr = + q + query_start_off * q_stride + global_qhead_idx * HEAD_SIZE; + + const int qhead_element = lane16id * CONTIGUOUS_SCALAR_ELEMS_16B; + if ((local_qhead_idx < GQA_RATIO) && (qhead_element < HEAD_SIZE)) { + const scalar_t* q_fetch_ptr = q_ptr + qhead_element; + const _B16x8* q_fetch_ptr_16B = + reinterpret_cast(q_fetch_ptr); + _B16x8 tmp = *q_fetch_ptr_16B; + + const int offset1 = + lane16id / + 2; // 16 contiguous chunks of head elems are spread across 8x2lanes + shared_logits[offset1][lane2id][local_qhead_idx][0].xy[0] = tmp; + } + + __syncthreads(); + + #pragma unroll + for (int qkhe_depth = 0; qkhe_depth < QKHELOOP / 2; qkhe_depth++) { + Qlocal[qkhe_depth].xy[0] = + shared_logits[qkhe_depth][0][lane16id % GQA_RATIO][0].xy[0]; + Qlocal[qkhe_depth].xy[1] = + shared_logits[qkhe_depth][1][lane16id % GQA_RATIO][0].xy[0]; + } + } + + const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE); + const int last_ctx_block = num_context_blocks - 1; + + const int* block_table_seq = block_tables + seq_idx * max_num_blocks_per_seq; + + int kphysical_block_number[TLOOP]; + + // fetch k physical block numbers + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { 
+ const int klocal_token_idx = + TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; + const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx; + const int kblock_idx = (kglobal_token_idx < context_len) + ? kglobal_token_idx / BLOCK_SIZE + : last_ctx_block; + kphysical_block_number[token_depth] = block_table_seq[kblock_idx]; + } + + constexpr int KX = 16 / sizeof(cache_t); + const cache_t* k_ptr = k_cache + wg_start_kv_head_idx * kv_head_stride; + + const int row_head_elem = 0; + + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + const int64_t kblock_number = + static_cast(kphysical_block_number[token_depth]); + const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride; + const int klocal_token_idx = + TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; + const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx; + const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE; + const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX; + + for (int qkhe_depth = 0; qkhe_depth < QKHELOOP; qkhe_depth++) { + const int head_elem = row_head_elem + qkhe_depth * QKHE_PER_FETCH; + const int offset1 = head_elem / KX; + const int offset2 = head_elem % KX; + const cache_t* k_fetch_ptr = k_ptr3 + offset1 * BLOCK_SIZE * KX + offset2; + const _B16x8* k_fetch_ptr_16B = + reinterpret_cast(k_fetch_ptr); + Klocal[token_depth][qkhe_depth / 2].xy[qkhe_depth % 2] = *k_fetch_ptr_16B; + } + } + + constexpr int VTOKENS_PER_LANE = + TOKENS_PER_WARP / ROWS_PER_WARP; // 32/1 = 32 vtokens per lane + constexpr int VBLOCKS_PER_LANE = 2; // assumes block size >=16 + constexpr int VTLOOP = NWARPS; // corresponds to tokens across warps + constexpr int VTLANELOOP = DIVIDE_ROUND_UP( + VTOKENS_PER_LANE, + CONTIGUOUS_KV_ELEMS_16B_LOAD); // optimized for 16B fetches; assumes + // minimum block size is 16 + constexpr int VHELOOP = DIVIDE_ROUND_UP( + (HEAD_SIZE / 16), NWARPS); // head_size distributed across warps; each + // wmma 
instr works on 16 head elements + + int vphysical_block_number[VTLOOP][VBLOCKS_PER_LANE]; + + // fetch v physical block numbers + for (int vtoken_depth = 0; vtoken_depth < VTLOOP; vtoken_depth++) { + for (int vblock_depth = 0; vblock_depth < VBLOCKS_PER_LANE; + vblock_depth++) { + const int vlocal_token_idx = + vtoken_depth * VTOKENS_PER_LANE * ROWS_PER_WARP + + vblock_depth * BLOCK_SIZE; + const int vglobal_token_idx = + partition_start_token_idx + vlocal_token_idx; + const int vblock_idx = (vglobal_token_idx < context_len) + ? vglobal_token_idx / BLOCK_SIZE + : last_ctx_block; + vphysical_block_number[vtoken_depth][vblock_depth] = + block_table_seq[vblock_idx]; + } + } + + _B16x16 Vlocal[VTLOOP][VHELOOP] + [VTLANELOOP / 2]; // this can be interpreted as B8x16 too + + const cache_t* v_ptr = v_cache + wg_start_kv_head_idx * kv_head_stride; + // v fetches are 16head elems across lanes x (16x2) tokens per lane + for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) { + const int vhead_elem = vhe_depth * NWARPS * 16 + warpid * 16 + lane16id; + const cache_t* v_ptr2 = v_ptr + vhead_elem * BLOCK_SIZE; + + for (int vtoken_depth = 0; vtoken_depth < VTLOOP; vtoken_depth++) { + for (int vfetch_depth = 0; vfetch_depth < VTLANELOOP; vfetch_depth++) { + const int64_t vblock_number = static_cast( + vphysical_block_number[vtoken_depth] + [vfetch_depth / VBLOCKS_PER_LANE]); + const cache_t* v_ptr3 = v_ptr2 + (vblock_number * kv_block_stride); + + const cache_t* v_fetch_ptr = + v_ptr3 + + (vfetch_depth % VBLOCKS_PER_LANE) * CONTIGUOUS_KV_ELEMS_16B_LOAD; + const _B16x8* v_fetch_ptr_16B = + reinterpret_cast(v_fetch_ptr); + Vlocal[vtoken_depth][vhe_depth][vfetch_depth / 2].xy[vfetch_depth % 2] = + *v_fetch_ptr_16B; + } + } + } + + floatx8 dout[TLOOP]; + // qk wmma + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + dout[token_depth] = {0}; + for (int qkhe_depth = 0; qkhe_depth < QKHELOOP / 2; qkhe_depth++) { + dout[token_depth] = gcn_wmma16x16x16_instr( + 
Klocal[token_depth][qkhe_depth].u16x16, Qlocal[qkhe_depth].u16x16, + dout[token_depth]); + } + dout[token_depth] *= scale; + } + + // calculate qk_max and exp_sum per warp and write to shared memory + float qk_max = -FLT_MAX; + float exp_sum = 0.0f; + const int qkout_token_idx = + partition_start_token_idx + TOKENS_PER_WARP * warpid + rowid; + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + const int local_token_idx = qkout_token_idx + token_depth * 16; + for (int i = 0; i < 8; i++) { + const float tmp = (local_token_idx + 2 * i < context_len) + ? dout[token_depth][i] + : -FLT_MAX; + qk_max = fmaxf(qk_max, tmp); + } + } + + qk_max = fmaxf(qk_max, __shfl_xor(qk_max, 16)); + + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + const int local_token_idx = qkout_token_idx + token_depth * 16; + for (int i = 0; i < 8; i++) { + const float tmp = (local_token_idx + 2 * i < context_len) + ? __expf(dout[token_depth][i] - qk_max) + : 0.0f; + dout[token_depth][i] = tmp; + exp_sum += tmp; + } + } + + exp_sum += __shfl_xor(exp_sum, 16); + + __syncthreads(); + + if (laneid < 16) { + shared_qk_max[warpid][lane16id] = qk_max; + shared_exp_sum[warpid][lane16id] = exp_sum; + } + + __syncthreads(); + + // calculate partition qk_max and exp_sum + float partition_qk_max = -FLT_MAX; + float warp_qk_max_exp[NWARPS]; + float partition_exp_sum = 0.0f; + + #pragma unroll + for (int w = 0; w < NWARPS; w++) { + warp_qk_max_exp[w] = shared_qk_max[w][lane16id]; + partition_qk_max = fmaxf(partition_qk_max, warp_qk_max_exp[w]); + } + + for (int w = 0; w < NWARPS; w++) { + warp_qk_max_exp[w] = __expf(warp_qk_max_exp[w] - partition_qk_max); + partition_exp_sum += shared_exp_sum[w][lane16id] * warp_qk_max_exp[w]; + } + + const float inv_sum_scale = + __fdividef(1.f, partition_exp_sum + 1e-6f) * warp_qk_max_exp[warpid]; + + __syncthreads(); + + // write logits to shared mem + #pragma unroll + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + 
dout[token_depth] *= inv_sum_scale; + shared_logits[warpid][token_depth][lane16id][0].xy[rowid] = + from_floatx8(dout[token_depth]); + } + __syncthreads(); + + _B16x8 swp_buf[TLOOP][2]; + #pragma unroll + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + swp_buf[token_depth][0] = + shared_logits[warpid][token_depth][lane16id][0].xy[0]; + swp_buf[token_depth][1] = + shared_logits[warpid][token_depth][lane16id][0].xy[1]; + } + + #pragma unroll + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + #pragma unroll + for (int i = 0; i < 8; i++) { + shared_logits[warpid][token_depth][lane16id][0].xy[rowid].u16x8[i] = + swp_buf[token_depth][i % 2].u16x8[4 * rowid + (i / 2)]; + } + } + + // write out partition max_logits and exp_sum + if (threadIdx.x < GQA_RATIO) { + const int qhead_idx = lane16id; + const int offset = seq_idx * total_num_heads * max_num_partitions + + (wg_start_head_idx + qhead_idx) * max_num_partitions + + partition_idx; + max_logits[offset] = partition_qk_max; + exp_sums[offset] = partition_exp_sum; + } + + __syncthreads(); + + _B16x8 outelems[VHELOOP]; + // Softmax V wmma + // v layout: 16he across lanes x (16x2) tokens per lane + for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) { + floatx8 tmp_out = {0}; + for (int vtoken_depth = 0; vtoken_depth < VTLOOP; vtoken_depth++) { + for (int vfetch_depth = 0; vfetch_depth < VTLANELOOP / 2; + vfetch_depth++) { + const int offset = vfetch_depth; + // if output format is 16 qheads across 16 lanes, 16 head elems spread + // across rows + tmp_out = gcn_wmma16x16x16_instr( + Vlocal[vtoken_depth][vhe_depth][vfetch_depth].u16x16, + shared_logits[vtoken_depth][offset][lane16id][0].u16x16, tmp_out); + } + } + outelems[vhe_depth] = from_floatx8(tmp_out); + } + + __syncthreads(); + + #pragma unroll + for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) { + shared_logits[warpid][vhe_depth][lane16id][0].xy[rowid] = + outelems[vhe_depth]; // lane16 id head dimension; rowid head 
element + // dimension + } + + __syncthreads(); + + #pragma unroll + for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) { + swp_buf[vhe_depth][0] = shared_logits[warpid][vhe_depth][lane16id][0].xy[0]; + swp_buf[vhe_depth][1] = shared_logits[warpid][vhe_depth][lane16id][0].xy[1]; + } + + #pragma unroll + for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) { + #pragma unroll + for (int i = 0; i < 8; i++) { + shared_logits[warpid][vhe_depth][lane16id][0].xy[rowid].u16x8[i] = + swp_buf[vhe_depth][i % 2].u16x8[4 * rowid + (i / 2)]; + } + } + + __syncthreads(); + + // write to tmp_out with coalesced writes after reading from shared mem + if (warpid == 0) { + _B16x8 vout[GQA_RATIO2]; + // each lane writes out 16Bytes of tmp_out along head elem dimension + const int head_elem_idx = lane16id * 8; + if (head_elem_idx < HEAD_SIZE) { + for (int h = 0; h < GQA_RATIO2; h++) { + const int local_head_idx = 2 * h + rowid; + const int offset1 = (head_elem_idx / 16) % NWARPS; + const int offset2 = head_elem_idx / 16 / NWARPS; + const int offset3 = (head_elem_idx / 8) % 2; // num_he % num_row + vout[h] = + shared_logits[offset1][offset2][local_head_idx][0].xy[offset3]; + } + + const int hsz_maxp_mult = HEAD_SIZE * max_num_partitions; + scalar_t* out_ptr = out + seq_idx * total_num_heads * hsz_maxp_mult + + partition_idx * HEAD_SIZE; + for (int h = 0; h < GQA_RATIO2; h++) { + const int local_head_idx = 2 * h + rowid; + if (local_head_idx < GQA_RATIO) { + const int out_head_idx = wg_start_head_idx + local_head_idx; + scalar_t* out_ptr2 = out_ptr + out_head_idx * hsz_maxp_mult; + scalar_t* out_ptr3 = out_ptr2 + head_elem_idx; + _B16x8* out_ptr_B16x8 = reinterpret_cast<_B16x8*>(out_ptr3); + *out_ptr_B16x8 = vout[h]; + } + } + } + } +} + +template +__global__ +__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, + // 
head_size/x, block_size, x] + const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, + // head_size, block_size] + const int num_kv_heads, const float scale, + const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] + const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] + const int max_num_blocks_per_seq, + const float* __restrict__ alibi_slopes, // [num_heads] + const int q_stride, const int kv_block_stride, const int kv_head_stride, + float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] + float* __restrict__ max_logits, // [num_seqs, num_heads, + // max_num_partitions] + scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, + // head_size] + OUTT* __restrict__ final_out, // [num_seqs, num_heads, head_size] + int max_ctx_blocks, const float* k_scale, const float* v_scale) { + UNREACHABLE_CODE +} + +// Grid: (num_heads, num_seqs). +template +__global__ +__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( + OUTT* __restrict__ out, // [num_seqs, num_heads, head_size] + const float* __restrict__ exp_sums, // [num_seqs, num_heads, + // max_num_partitions] + const float* __restrict__ max_logits, // [num_seqs, num_heads, + // max_num_partitions] + const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, + // max_num_partitions, head_size] + const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] + const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) { + const auto num_heads = gridDim.x; + const auto head_idx = blockIdx.x; + const auto seq_idx = blockIdx.y; + + // NOTE queries with sequence len > 1 are prefills and taken care by another + // kernel. 
+ if (query_start_loc_ptr != nullptr && + (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) { + return; + } + + const int context_len = context_lens[seq_idx]; + const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); + [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + const int warpid = threadIdx.x / WARP_SIZE; + [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE; + + __shared__ float shared_global_exp_sum; + // max num partitions supported is warp_size * NPAR_LOOPS + __shared__ float shared_exp_sums[NPAR_LOOPS * WARP_SIZE]; + + if (warpid == 0) { + const float* max_logits_ptr = max_logits + + seq_idx * num_heads * max_num_partitions + + head_idx * max_num_partitions; + + // valid partition is the last valid partition in case threadid > num + // partitions + int valid_partition[NPAR_LOOPS]; + float reg_max_logit[NPAR_LOOPS]; + const int last_valid_partition = num_partitions - 1; + + #pragma unroll + for (int i = 0; i < NPAR_LOOPS; i++) { + const int partition_no = i * WARP_SIZE + threadIdx.x; + valid_partition[i] = + (partition_no < num_partitions) ? 
partition_no : last_valid_partition; + } + #pragma unroll + for (int i = 0; i < NPAR_LOOPS; i++) { + reg_max_logit[i] = max_logits_ptr[valid_partition[i]]; + } + float max_logit = reg_max_logit[0]; + #pragma unroll + for (int i = 1; i < NPAR_LOOPS; i++) { + max_logit = fmaxf(max_logit, reg_max_logit[i]); + } + + #pragma unroll + for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { + max_logit = fmaxf(max_logit, __shfl_xor(max_logit, mask)); + } + + const float* exp_sums_ptr = exp_sums + + seq_idx * num_heads * max_num_partitions + + head_idx * max_num_partitions; + + float rescaled_exp_sum[NPAR_LOOPS]; + #pragma unroll + for (int i = 0; i < NPAR_LOOPS; i++) { + rescaled_exp_sum[i] = exp_sums_ptr[valid_partition[i]]; + } + #pragma unroll + for (int i = 0; i < NPAR_LOOPS; i++) { + const int partition_no = i * WARP_SIZE + threadIdx.x; + rescaled_exp_sum[i] *= (partition_no < num_partitions) + ? expf(reg_max_logit[i] - max_logit) + : 0.0f; + } + float global_exp_sum = rescaled_exp_sum[0]; + #pragma unroll + for (int i = 1; i < NPAR_LOOPS; i++) { + global_exp_sum += rescaled_exp_sum[i]; + } + #pragma unroll + for (int i = 0; i < NPAR_LOOPS; i++) { + const int partition_no = i * WARP_SIZE + threadIdx.x; + shared_exp_sums[partition_no] = rescaled_exp_sum[i]; + } + + #pragma unroll + for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { + global_exp_sum += __shfl_xor(global_exp_sum, mask); + } + if (threadIdx.x == 0) { + shared_global_exp_sum = global_exp_sum; + } + } // warpid == 0 + const scalar_t* tmp_out_ptr = + tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + + head_idx * max_num_partitions * HEAD_SIZE + threadIdx.x; + constexpr int MAX_NPAR = 32; + scalar_t tmps[MAX_NPAR]; + const float dzero = 0.0f; + #pragma unroll + for (int j = 0; j < MAX_NPAR; j++) { + tmps[j] = from_float(dzero); + } + const int last_partition_offset = (num_partitions - 1) * HEAD_SIZE; + const int num_partition_offset = (num_partitions)*HEAD_SIZE; + int idx = 0; + + constexpr 
int JCHUNK = 16; + + #pragma unroll + for (int j = 0; j < JCHUNK * HEAD_SIZE; j += HEAD_SIZE) { + // lastj is last valid partition + const int lastj_offset = + (j < num_partition_offset) ? j : last_partition_offset; + tmps[idx] = tmp_out_ptr[lastj_offset]; + idx++; + } + __syncthreads(); + + if (num_partitions > JCHUNK) { + #pragma unroll + for (int j = JCHUNK * HEAD_SIZE; j < 2 * JCHUNK * HEAD_SIZE; + j += HEAD_SIZE) { + const int lastj_offset = + (j < num_partition_offset) ? j : last_partition_offset; + tmps[idx] = tmp_out_ptr[lastj_offset]; + idx++; + } + + if (num_partitions > 2 * JCHUNK) { + #pragma unroll + for (int j = 2 * JCHUNK * HEAD_SIZE; j < MAX_NPAR * HEAD_SIZE; + j += HEAD_SIZE) { + const int lastj_offset = + (j < num_partition_offset) ? j : last_partition_offset; + tmps[idx] = tmp_out_ptr[lastj_offset]; + idx++; + } + } + } // num_partitions > JCHUNK + + // Aggregate tmp_out to out. + float acc = 0.0f; + #pragma unroll + for (int j = 0; j < JCHUNK; j++) { + acc += to_float(tmps[j]) * shared_exp_sums[j]; + } + if (num_partitions > JCHUNK) { + #pragma unroll + for (int j = JCHUNK; j < 2 * JCHUNK; j++) { + acc += to_float(tmps[j]) * shared_exp_sums[j]; + } + if (num_partitions > 2 * JCHUNK) { + #pragma unroll + for (int j = 2 * JCHUNK; j < MAX_NPAR; j++) { + acc += to_float(tmps[j]) * shared_exp_sums[j]; + } + } + } + + for (int p = 1; p < NPAR_LOOPS; p++) { + if (num_partitions > p * MAX_NPAR) { + idx = 0; + #pragma unroll + for (int j = p * MAX_NPAR * HEAD_SIZE; j < (p + 1) * MAX_NPAR * HEAD_SIZE; + j += HEAD_SIZE) { + // lastj is last valid partition + const int lastj_offset = + (j < num_partition_offset) ? 
j : last_partition_offset; + tmps[idx] = tmp_out_ptr[lastj_offset]; + idx++; + } + + #pragma unroll + for (int j = 0; j < MAX_NPAR; j++) { + acc += to_float(tmps[j]) * shared_exp_sums[j + p * MAX_NPAR]; + } + } + } + + const float inv_global_exp_sum = + __fdividef(1.0f, shared_global_exp_sum + 1e-6f); + acc *= inv_global_exp_sum; + + const int64_t query_start_off = static_cast( + query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx); + OUTT* out_ptr = out + query_start_off * num_heads * HEAD_SIZE + + static_cast(head_idx) * HEAD_SIZE; + out_ptr[threadIdx.x] = from_float(acc); +} + +#elif defined(__HIP__GFX12__) + +using floatx8 = __attribute__((__vector_size__(8 * sizeof(float)))) float; + +using bit16_t = uint16_t; +using bit16x4 = __attribute__((__vector_size__(4 * sizeof(uint16_t)))) uint16_t; +typedef bit16x4 _B16x4; + +using bit16x8 = __attribute__((__vector_size__(8 * sizeof(uint16_t)))) uint16_t; +union b16x8_u { + bit16x8 u16x8; + _B16x4 xy[2]; +}; +typedef b16x8_u _B16x8; + +using _B8x8 = uint2; +using bit8_t = uint8_t; + +typedef struct _B8x16 { + _B8x8 xy[2]; +} _B8x16; + +template +__device__ __forceinline__ floatx8 gcn_wmma16x16x16_instr(const bit16x8& inpA, + const bit16x8& inpB, + const floatx8& inpC) { + if constexpr (std::is_same::value) { + return __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(inpA, inpB, inpC); + } else if constexpr (std::is_same::value) { + return __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(inpA, inpB, inpC); + } else { + static_assert(false, "unsupported 16b dtype"); + } +} + +template +__device__ __forceinline__ float to_float(const T& inp) { + if constexpr (std::is_same::value) { + return (float)inp; + } else if constexpr (std::is_same::value) { + return __bfloat162float(inp); + } else { + static_assert(false, "unsupported 16b dtype"); + } +} + +template +__device__ __forceinline__ float to_float_b16(const bit16_t& inp) { + union tmpcvt { + bit16_t u; + _Float16 f; + __hip_bfloat16 b; + } t16; + t16.u = inp; + 
if constexpr (std::is_same::value) { + return (float)t16.f; + } else if constexpr (std::is_same::value) { + return __bfloat162float(t16.b); + } else { + static_assert(false, "unsupported 16b dtype"); + } +} + +template +__device__ __forceinline__ T from_float(const float& inp) { + if constexpr (std::is_same::value) { + return (_Float16)inp; + } else if constexpr (std::is_same::value) { + return __float2bfloat16(inp); + } else { + static_assert(false, "unsupported 16b dtype"); + } +} + +template +__device__ __forceinline__ _B16x8 from_floatx8(const floatx8& inp) { + if constexpr (std::is_same::value) { + union h2cvt { + __half2 h2[4]; + _B16x8 b16x8; + } u; + u.h2[0] = __float22half2_rn(make_float2(inp[0], inp[1])); + u.h2[1] = __float22half2_rn(make_float2(inp[2], inp[3])); + u.h2[2] = __float22half2_rn(make_float2(inp[4], inp[5])); + u.h2[3] = __float22half2_rn(make_float2(inp[6], inp[7])); + return u.b16x8; + } else if constexpr (std::is_same::value) { + union b2cvt { + __hip_bfloat162 b2[4]; + _B16x8 b16x8; + } u; + + u.b2[0] = __float22bfloat162_rn(make_float2(inp[0], inp[1])); + u.b2[1] = __float22bfloat162_rn(make_float2(inp[2], inp[3])); + u.b2[2] = __float22bfloat162_rn(make_float2(inp[4], inp[5])); + u.b2[3] = __float22bfloat162_rn(make_float2(inp[6], inp[7])); + + return u.b16x8; + } else { + static_assert(false, "unsupported 16b dtype"); + } +} + +// clang-format off +template +__global__ +__launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, + // head_size/x, block_size, x] + const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, + // head_size, block_size] + const int num_kv_heads, const float scale, + const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] + const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // 
[num_seqs] + const int max_num_blocks_per_seq, + const float* __restrict__ alibi_slopes, // [num_heads] + const int q_stride, const int kv_block_stride, const int kv_head_stride, + float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] + float* __restrict__ max_logits, // [num_seqs, num_heads, + // max_num_partitions] + scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, + // head_size] + OUTT* __restrict__ final_out, // [num_seqs, num_heads, head_size] + int max_ctx_blocks, const float* k_scale, const float* v_scale) { + // clang-format on + constexpr int NWARPS = NUM_THREADS / WARP_SIZE; // 8 warps on gfx11 + const int warpid = threadIdx.x / WARP_SIZE; + const int laneid = threadIdx.x % WARP_SIZE; + const int lane2id = laneid % 2; + const int lane4id = laneid % 4; + const int lane16id = laneid % 16; + const int rowid = laneid / 16; + + const int seq_idx = blockIdx.x; + // NOTE queries with sequence len > 1 are prefills and taken care by another + // kernel. 
+ if (query_start_loc_ptr != nullptr && + (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) { + return; + } + const int partition_idx = blockIdx.y; + + constexpr int T_PAR_SIZE = 256; // token partition size set to 256 + + const int max_num_partitions = gridDim.y; + + const int context_len = context_lens[seq_idx]; // length of a seq + + const int partition_start_token_idx = partition_idx * T_PAR_SIZE; + // exit if partition is out of context for seq + if (partition_start_token_idx >= context_len) { + return; + } + + constexpr int GQA_RATIO2 = DIVIDE_ROUND_UP(GQA_RATIO, 2); + + __shared__ float shared_qk_max[NWARPS][16 + 1]; + __shared__ float shared_exp_sum[NWARPS][16 + 1]; + // shared_logits is used for multiple purposes + __shared__ _B16x8 shared_logits[NWARPS][2][16][2]; + + // for QK wmma16x16_gfx12, layout is QHead/Tokenx16 across every 16 lanes, + // 16 Bytes HeadElements in each lane, 2x16B HeadElements across 2 rows of + // warp + constexpr int ROWS_PER_WARP = + WARP_SIZE / 16; // rows refers to 16 lanes; refer dpp terminology + constexpr int CONTIGUOUS_KV_ELEMS_16B_LOAD = + 16 / sizeof(cache_t); // 8 for 16 bit cache type, 16 for 8 bit types + constexpr int QKHE_PER_FETCH = + CONTIGUOUS_KV_ELEMS_16B_LOAD * + ROWS_PER_WARP; // each fetch across a warp fetches these many elements + constexpr int QKHELOOP = HEAD_SIZE / QKHE_PER_FETCH; // 2xQKHE_16B across + // warp + + _B16x8 Qlocal[QKHELOOP]; // note that 16 contiguous elements of Q should + // be fetched per lane for 16 bit cache types + + constexpr int CONTIGUOUS_SCALAR_ELEMS_16B = 16 / sizeof(scalar_t); + + constexpr int TOKENS_PER_WARP = + T_PAR_SIZE / + NWARPS; // sub partition of tokens per warp for qk calculation + constexpr int TLOOP = + TOKENS_PER_WARP / + 16; // each wmma16x16x16 instruction processes 16 tokens + + _B16x8 Klocal[TLOOP] + [QKHELOOP]; // can be interpreted as B8x16 for 8 bit types + + const int wg_start_head_idx = blockIdx.z * GQA_RATIO; + const int 
wg_start_kv_head_idx = blockIdx.z; + const int total_num_heads = gridDim.z * GQA_RATIO; + + // for QK wmma, tokens in multiples of TOKENS_PER_WARP are spread across warps + // each wmma takes QH16xT16x16HE across warp + // repeat wmma across QKHELOOP dimension + // output layout from QKwmma : QH16xT8x2 16 qheads across 16 lanes, 16 tokens + // across 2 rows x 8 tokens per lane + + const int64_t query_start_off = static_cast( + query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx); + + if (GQA_RATIO == 1) { + const int local_qhead_idx = lane16id % GQA_RATIO; + const int global_qhead_idx = wg_start_head_idx + local_qhead_idx; + const scalar_t* q_ptr = q + query_start_off * q_stride + + global_qhead_idx * HEAD_SIZE + + rowid * CONTIGUOUS_KV_ELEMS_16B_LOAD; + if (lane16id < GQA_RATIO) { + #pragma unroll + for (int qkhe_depth = 0; qkhe_depth < QKHELOOP; qkhe_depth++) { + const scalar_t* q_fetch_ptr = q_ptr + qkhe_depth * QKHE_PER_FETCH; + const _B16x8* q_fetch_ptr_16B = + reinterpret_cast(q_fetch_ptr); + Qlocal[qkhe_depth] = *q_fetch_ptr_16B; + } + } + } else { + // fetch Q in shared across warps and then write to registers + const int local_qhead_idx = 2 * warpid + rowid; + const int global_qhead_idx = wg_start_head_idx + local_qhead_idx; + const scalar_t* q_ptr = + q + query_start_off * q_stride + global_qhead_idx * HEAD_SIZE; + + const int qhead_element = lane16id * CONTIGUOUS_SCALAR_ELEMS_16B; + if ((local_qhead_idx < GQA_RATIO) && (qhead_element < HEAD_SIZE)) { + const scalar_t* q_fetch_ptr = q_ptr + qhead_element; + const _B16x8* q_fetch_ptr_16B = + reinterpret_cast(q_fetch_ptr); + _B16x8 tmp = *q_fetch_ptr_16B; + + const int offset1 = + lane16id / + 2; // 16 contiguous chunks of head elems are spread across 8x2lanes + shared_logits[offset1][lane2id][local_qhead_idx][0] = tmp; + } + + __syncthreads(); + + #pragma unroll + for (int qkhe_depth = 0; qkhe_depth < QKHELOOP; qkhe_depth++) { + Qlocal[qkhe_depth] = + shared_logits[qkhe_depth][rowid][lane16id % 
GQA_RATIO][0]; + } + } + + const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE); + const int last_ctx_block = num_context_blocks - 1; + + const int* block_table_seq = block_tables + seq_idx * max_num_blocks_per_seq; + + int kphysical_block_number[TLOOP]; + + // fetch k physical block numbers + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + const int klocal_token_idx = + TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; + const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx; + const int kblock_idx = (kglobal_token_idx < context_len) + ? kglobal_token_idx / BLOCK_SIZE + : last_ctx_block; + kphysical_block_number[token_depth] = block_table_seq[kblock_idx]; + } + + constexpr int KX = 16 / sizeof(cache_t); + const cache_t* k_ptr = k_cache + wg_start_kv_head_idx * kv_head_stride; + + const int row_head_elem = rowid * CONTIGUOUS_KV_ELEMS_16B_LOAD; + + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + const int64_t kblock_number = + static_cast(kphysical_block_number[token_depth]); + const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride; + const int klocal_token_idx = + TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; + const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx; + const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE; + const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX; + + for (int qkhe_depth = 0; qkhe_depth < QKHELOOP; qkhe_depth++) { + const int head_elem = row_head_elem + qkhe_depth * QKHE_PER_FETCH; + const int offset1 = head_elem / KX; + const int offset2 = head_elem % KX; + const cache_t* k_fetch_ptr = k_ptr3 + offset1 * BLOCK_SIZE * KX + offset2; + const _B16x8* k_fetch_ptr_16B = + reinterpret_cast(k_fetch_ptr); + Klocal[token_depth][qkhe_depth] = *k_fetch_ptr_16B; + } + } + + constexpr int VTOKENS_PER_LANE = + TOKENS_PER_WARP / ROWS_PER_WARP; // 32/2 = 16 vtokens per lane + constexpr int VBLOCKS_PER_LANE = 1; // 
assumes block size >=16 + constexpr int VTLOOP = NWARPS; // corresponds to tokens across warps + constexpr int VTLANELOOP = DIVIDE_ROUND_UP( + VTOKENS_PER_LANE, + CONTIGUOUS_KV_ELEMS_16B_LOAD); // optimized for 16B fetches; assumes + // minimum block size is 16 + constexpr int VHELOOP = DIVIDE_ROUND_UP( + (HEAD_SIZE / 16), NWARPS); // head_size distributed across warps; each + // wmma instr works on 16 head elements + + int vphysical_block_number[VTLOOP][VBLOCKS_PER_LANE]; + + // fetch v physical block numbers + for (int vtoken_depth = 0; vtoken_depth < VTLOOP; vtoken_depth++) { + for (int vblock_depth = 0; vblock_depth < VBLOCKS_PER_LANE; + vblock_depth++) { + const int vlocal_token_idx = + vtoken_depth * VTOKENS_PER_LANE * ROWS_PER_WARP + + rowid * VTOKENS_PER_LANE + vblock_depth * BLOCK_SIZE; + const int vglobal_token_idx = + partition_start_token_idx + vlocal_token_idx; + const int vblock_idx = (vglobal_token_idx < context_len) + ? vglobal_token_idx / BLOCK_SIZE + : last_ctx_block; + vphysical_block_number[vtoken_depth][vblock_depth] = + block_table_seq[vblock_idx]; + } + } + + _B16x8 Vlocal[VTLOOP][VHELOOP] + [VTLANELOOP]; // this can be interpreted as B8x16 too + + const cache_t* v_ptr = v_cache + wg_start_kv_head_idx * kv_head_stride + + ((rowid * VTOKENS_PER_LANE) % BLOCK_SIZE); + + // v fetches are 16head elems across lanes x 16 tokens per lane + for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) { + const int vhead_elem = vhe_depth * NWARPS * 16 + warpid * 16 + lane16id; + const cache_t* v_ptr2 = v_ptr + vhead_elem * BLOCK_SIZE; + + for (int vtoken_depth = 0; vtoken_depth < VTLOOP; vtoken_depth++) { + for (int vfetch_depth = 0; vfetch_depth < VTLANELOOP; vfetch_depth++) { + const int vblock_depth = 0; + const int64_t vblock_number = static_cast( + vphysical_block_number[vtoken_depth][vblock_depth]); + const cache_t* v_ptr3 = v_ptr2 + (vblock_number * kv_block_stride); + + const cache_t* v_fetch_ptr = + v_ptr3 + vfetch_depth * 
CONTIGUOUS_KV_ELEMS_16B_LOAD; + const _B16x8* v_fetch_ptr_16B = + reinterpret_cast(v_fetch_ptr); + Vlocal[vtoken_depth][vhe_depth][vfetch_depth] = *v_fetch_ptr_16B; + } + } + } + + floatx8 dout[TLOOP]; + // qk wmma + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + dout[token_depth] = {0}; + for (int qkhe_depth = 0; qkhe_depth < QKHELOOP; qkhe_depth++) { + dout[token_depth] = gcn_wmma16x16x16_instr( + Klocal[token_depth][qkhe_depth].u16x8, Qlocal[qkhe_depth].u16x8, + dout[token_depth]); + } + dout[token_depth] *= scale; + } + + // calculate qk_max and exp_sum per warp and write to shared memory + float qk_max = -FLT_MAX; + float exp_sum = 0.0f; + const int qkout_token_idx = + partition_start_token_idx + TOKENS_PER_WARP * warpid + rowid * 8; + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + const int local_token_idx = qkout_token_idx + token_depth * 16; + for (int i = 0; i < 8; i++) { + const float tmp = + (local_token_idx + i < context_len) ? dout[token_depth][i] : -FLT_MAX; + qk_max = fmaxf(qk_max, tmp); + } + } + + qk_max = fmaxf(qk_max, __shfl_xor(qk_max, 16)); + + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + const int local_token_idx = qkout_token_idx + token_depth * 16; + for (int i = 0; i < 8; i++) { + const float tmp = (local_token_idx + i < context_len) + ? 
__expf(dout[token_depth][i] - qk_max) + : 0.0f; + dout[token_depth][i] = tmp; + exp_sum += tmp; + } + } + + exp_sum += __shfl_xor(exp_sum, 16); + + __syncthreads(); + + if (laneid < 16) { + shared_qk_max[warpid][lane16id] = qk_max; + shared_exp_sum[warpid][lane16id] = exp_sum; + } + + __syncthreads(); + + // calculate partition qk_max and exp_sum + float partition_qk_max = -FLT_MAX; + float warp_qk_max_exp[NWARPS]; + float partition_exp_sum = 0.0f; + + #pragma unroll + for (int w = 0; w < NWARPS; w++) { + warp_qk_max_exp[w] = shared_qk_max[w][lane16id]; + partition_qk_max = fmaxf(partition_qk_max, warp_qk_max_exp[w]); + } + + for (int w = 0; w < NWARPS; w++) { + warp_qk_max_exp[w] = __expf(warp_qk_max_exp[w] - partition_qk_max); + partition_exp_sum += shared_exp_sum[w][lane16id] * warp_qk_max_exp[w]; + } + + const float inv_sum_scale = + __fdividef(1.f, partition_exp_sum + 1e-6f) * warp_qk_max_exp[warpid]; + + __syncthreads(); + + // write logits to shared mem + #pragma unroll + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + dout[token_depth] *= inv_sum_scale; + shared_logits[warpid][token_depth][lane16id][rowid] = + from_floatx8(dout[token_depth]); + } + + // write out partition max_logits and exp_sum + if (threadIdx.x < GQA_RATIO) { + const int qhead_idx = lane16id; + const int offset = seq_idx * total_num_heads * max_num_partitions + + (wg_start_head_idx + qhead_idx) * max_num_partitions + + partition_idx; + max_logits[offset] = partition_qk_max; + exp_sums[offset] = partition_exp_sum; + } + + __syncthreads(); + + _B16x8 outelems[VHELOOP]; + // Softmax V wmma + // v layout: 16he across lanes x 16 tokens per lane + for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) { + floatx8 tmp_out = {0}; + + for (int vtoken_depth = 0; vtoken_depth < VTLOOP; vtoken_depth++) { + for (int vfetch_depth = 0; vfetch_depth < VTLANELOOP; vfetch_depth++) { + const int offset = rowid * VTLANELOOP + vfetch_depth; + const int offset1 = offset % ROWS_PER_WARP; + 
const int offset2 = offset / ROWS_PER_WARP; + // if output format is 16 qheads across 16 lanes, 16 head elems spread + // across rows + tmp_out = gcn_wmma16x16x16_instr( + Vlocal[vtoken_depth][vhe_depth][vfetch_depth].u16x8, + shared_logits[vtoken_depth][offset2][lane16id][offset1].u16x8, + tmp_out); + } + } + outelems[vhe_depth] = from_floatx8(tmp_out); + } + + __syncthreads(); + + #pragma unroll + for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) { + shared_logits[warpid][vhe_depth][lane16id][rowid] = + outelems[vhe_depth]; // lane16 id head dimension; rowid head element + // dimension + } + + __syncthreads(); + + // write to tmp_out with coalesced writes after reading from shared mem + if (warpid == 0) { + _B16x8 vout[GQA_RATIO2]; + // each lane writes out 16Bytes of tmp_out along head elem dimension + const int head_elem_idx = lane16id * 8; + if (head_elem_idx < HEAD_SIZE) { + for (int h = 0; h < GQA_RATIO2; h++) { + const int local_head_idx = 2 * h + rowid; + const int offset1 = (head_elem_idx / 16) % NWARPS; + const int offset2 = head_elem_idx / 16 / NWARPS; + const int offset3 = (head_elem_idx / 8) % 2; // num_he % num_row + vout[h] = shared_logits[offset1][offset2][local_head_idx][offset3]; + } + + const int hsz_maxp_mult = HEAD_SIZE * max_num_partitions; + scalar_t* out_ptr = out + seq_idx * total_num_heads * hsz_maxp_mult + + partition_idx * HEAD_SIZE; + for (int h = 0; h < GQA_RATIO2; h++) { + const int local_head_idx = 2 * h + rowid; + if (local_head_idx < GQA_RATIO) { + const int out_head_idx = wg_start_head_idx + local_head_idx; + scalar_t* out_ptr2 = out_ptr + out_head_idx * hsz_maxp_mult; + scalar_t* out_ptr3 = out_ptr2 + head_elem_idx; + _B16x8* out_ptr_B16x8 = reinterpret_cast<_B16x8*>(out_ptr3); + *out_ptr_B16x8 = vout[h]; + } + } + } + } +} + +template +__global__ +__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const cache_t* __restrict__ 
k_cache, // [num_blocks, num_kv_heads, + // head_size/x, block_size, x] + const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, + // head_size, block_size] + const int num_kv_heads, const float scale, + const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] + const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] + const int max_num_blocks_per_seq, + const float* __restrict__ alibi_slopes, // [num_heads] + const int q_stride, const int kv_block_stride, const int kv_head_stride, + float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] + float* __restrict__ max_logits, // [num_seqs, num_heads, + // max_num_partitions] + scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, + // head_size] + OUTT* __restrict__ final_out, // [num_seqs, num_heads, head_size] + int max_ctx_blocks, const float* k_scale, const float* v_scale) { + UNREACHABLE_CODE +} + +// Grid: (num_heads, num_seqs). +template +__global__ +__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( + OUTT* __restrict__ out, // [num_seqs, num_heads, head_size] + const float* __restrict__ exp_sums, // [num_seqs, num_heads, + // max_num_partitions] + const float* __restrict__ max_logits, // [num_seqs, num_heads, + // max_num_partitions] + const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, + // max_num_partitions, head_size] + const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] + const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) { + const auto num_heads = gridDim.x; + const auto head_idx = blockIdx.x; + const auto seq_idx = blockIdx.y; + + // NOTE queries with sequence len > 1 are prefills and taken care by another + // kernel. 
+ if (query_start_loc_ptr != nullptr && + (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) { + return; + } + + const int context_len = context_lens[seq_idx]; + const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); + [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + const int warpid = threadIdx.x / WARP_SIZE; + [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE; + + __shared__ float shared_global_exp_sum; + // max num partitions supported is warp_size * NPAR_LOOPS + __shared__ float shared_exp_sums[NPAR_LOOPS * WARP_SIZE]; + + if (warpid == 0) { + const float* max_logits_ptr = max_logits + + seq_idx * num_heads * max_num_partitions + + head_idx * max_num_partitions; + + // valid partition is the last valid partition in case threadid > num + // partitions + int valid_partition[NPAR_LOOPS]; + float reg_max_logit[NPAR_LOOPS]; + const int last_valid_partition = num_partitions - 1; + + #pragma unroll + for (int i = 0; i < NPAR_LOOPS; i++) { + const int partition_no = i * WARP_SIZE + threadIdx.x; + valid_partition[i] = + (partition_no < num_partitions) ? 
partition_no : last_valid_partition; + } + #pragma unroll + for (int i = 0; i < NPAR_LOOPS; i++) { + reg_max_logit[i] = max_logits_ptr[valid_partition[i]]; + } + float max_logit = reg_max_logit[0]; + #pragma unroll + for (int i = 1; i < NPAR_LOOPS; i++) { + max_logit = fmaxf(max_logit, reg_max_logit[i]); + } + + #pragma unroll + for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { + max_logit = fmaxf(max_logit, __shfl_xor(max_logit, mask)); + } + + const float* exp_sums_ptr = exp_sums + + seq_idx * num_heads * max_num_partitions + + head_idx * max_num_partitions; + + float rescaled_exp_sum[NPAR_LOOPS]; + #pragma unroll + for (int i = 0; i < NPAR_LOOPS; i++) { + rescaled_exp_sum[i] = exp_sums_ptr[valid_partition[i]]; + } + #pragma unroll + for (int i = 0; i < NPAR_LOOPS; i++) { + const int partition_no = i * WARP_SIZE + threadIdx.x; + rescaled_exp_sum[i] *= (partition_no < num_partitions) + ? expf(reg_max_logit[i] - max_logit) + : 0.0f; + } + float global_exp_sum = rescaled_exp_sum[0]; + #pragma unroll + for (int i = 1; i < NPAR_LOOPS; i++) { + global_exp_sum += rescaled_exp_sum[i]; + } + #pragma unroll + for (int i = 0; i < NPAR_LOOPS; i++) { + const int partition_no = i * WARP_SIZE + threadIdx.x; + shared_exp_sums[partition_no] = rescaled_exp_sum[i]; + } + + #pragma unroll + for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { + global_exp_sum += __shfl_xor(global_exp_sum, mask); + } + if (threadIdx.x == 0) { + shared_global_exp_sum = global_exp_sum; + } + } // warpid == 0 + const scalar_t* tmp_out_ptr = + tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + + head_idx * max_num_partitions * HEAD_SIZE + threadIdx.x; + constexpr int MAX_NPAR = 32; + scalar_t tmps[MAX_NPAR]; + const float dzero = 0.0f; + #pragma unroll + for (int j = 0; j < MAX_NPAR; j++) { + tmps[j] = from_float(dzero); + } + const int last_partition_offset = (num_partitions - 1) * HEAD_SIZE; + const int num_partition_offset = (num_partitions)*HEAD_SIZE; + int idx = 0; + + constexpr 
int JCHUNK = 16; + + #pragma unroll + for (int j = 0; j < JCHUNK * HEAD_SIZE; j += HEAD_SIZE) { + // lastj is last valid partition + const int lastj_offset = + (j < num_partition_offset) ? j : last_partition_offset; + tmps[idx] = tmp_out_ptr[lastj_offset]; + idx++; + } + __syncthreads(); + + if (num_partitions > JCHUNK) { + #pragma unroll + for (int j = JCHUNK * HEAD_SIZE; j < 2 * JCHUNK * HEAD_SIZE; + j += HEAD_SIZE) { + const int lastj_offset = + (j < num_partition_offset) ? j : last_partition_offset; + tmps[idx] = tmp_out_ptr[lastj_offset]; + idx++; + } + + if (num_partitions > 2 * JCHUNK) { + #pragma unroll + for (int j = 2 * JCHUNK * HEAD_SIZE; j < MAX_NPAR * HEAD_SIZE; + j += HEAD_SIZE) { + const int lastj_offset = + (j < num_partition_offset) ? j : last_partition_offset; + tmps[idx] = tmp_out_ptr[lastj_offset]; + idx++; + } + } + } // num_partitions > JCHUNK + + // Aggregate tmp_out to out. + float acc = 0.0f; + #pragma unroll + for (int j = 0; j < JCHUNK; j++) { + acc += to_float(tmps[j]) * shared_exp_sums[j]; + } + if (num_partitions > JCHUNK) { + #pragma unroll + for (int j = JCHUNK; j < 2 * JCHUNK; j++) { + acc += to_float(tmps[j]) * shared_exp_sums[j]; + } + if (num_partitions > 2 * JCHUNK) { + #pragma unroll + for (int j = 2 * JCHUNK; j < MAX_NPAR; j++) { + acc += to_float(tmps[j]) * shared_exp_sums[j]; + } + } + } + + for (int p = 1; p < NPAR_LOOPS; p++) { + if (num_partitions > p * MAX_NPAR) { + idx = 0; + #pragma unroll + for (int j = p * MAX_NPAR * HEAD_SIZE; j < (p + 1) * MAX_NPAR * HEAD_SIZE; + j += HEAD_SIZE) { + // lastj is last valid partition + const int lastj_offset = + (j < num_partition_offset) ? 
j : last_partition_offset; + tmps[idx] = tmp_out_ptr[lastj_offset]; + idx++; + } + + #pragma unroll + for (int j = 0; j < MAX_NPAR; j++) { + acc += to_float(tmps[j]) * shared_exp_sums[j + p * MAX_NPAR]; + } + } + } + + const float inv_global_exp_sum = + __fdividef(1.0f, shared_global_exp_sum + 1e-6f); + acc *= inv_global_exp_sum; + + const int64_t query_start_off = static_cast( + query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx); + OUTT* out_ptr = out + query_start_off * num_heads * HEAD_SIZE + + static_cast(head_idx) * HEAD_SIZE; + out_ptr[threadIdx.x] = from_float(acc); +} + +#else // clang-format off template ( \ - out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ - num_kv_heads, scale, block_tables, context_lens, query_start_loc, \ - max_context_len, alibi_slopes, k_scale, v_scale, fp8_out_scale); +template +void paged_attention_custom_launcher_navi( + torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, + torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, const int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& context_lens, + const std::optional& query_start_loc, int max_context_len, + const std::optional& alibi_slopes, torch::Tensor& k_scale, + torch::Tensor& v_scale) { + int num_seqs = block_tables.size(0); + int num_heads = query.size(1); + int head_size = query.size(2); + int max_num_blocks_per_seq = block_tables.size(1); + int q_stride = query.stride(0); + int kv_block_stride = key_cache.stride(0); + int kv_head_stride = key_cache.stride(1); + + // NOTE: query start location is optional for V0 decode should not be used. + // If batch contains mix of prefills and decode, prefills should be skipped. + const int* query_start_loc_ptr = + query_start_loc + ? reinterpret_cast(query_start_loc.value().data_ptr()) + : nullptr; + + // NOTE: Navi does not support alibi_slopes. 
+ const float* alibi_slopes_ptr = nullptr; + + float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); + float* max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); + T* tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); + T* query_ptr = reinterpret_cast(query.data_ptr()); + KVT* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); + KVT* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); + int* block_tables_ptr = block_tables.data_ptr(); + int* context_lens_ptr = context_lens.data_ptr(); + + const float* k_scale_ptr = reinterpret_cast(k_scale.data_ptr()); + const float* v_scale_ptr = reinterpret_cast(v_scale.data_ptr()); + // NOTE: Navi does not support fp8. + const auto fp8_out_scale_ptr = nullptr; + OUTT* out_ptr = reinterpret_cast(out.data_ptr()); + + const int max_ctx_blocks = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE); + + constexpr int PARTITION_SIZE = 256; + const int max_num_partitions = + DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE); + const int gqa_ratio = num_heads / num_kv_heads; + assert(num_heads % num_kv_heads == 0); + assert(head_size == HEAD_SIZE); + + constexpr int NTHR = 256; + dim3 grid(num_seqs, max_num_partitions, num_kv_heads); + dim3 block(NTHR); + const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + switch (gqa_ratio) { + case 1: + LAUNCH_CUSTOM_ATTENTION_MFMA16(1); + break; + case 2: + LAUNCH_CUSTOM_ATTENTION_MFMA16(2); + break; + case 3: + LAUNCH_CUSTOM_ATTENTION_MFMA16(3); + break; + case 4: + LAUNCH_CUSTOM_ATTENTION_MFMA16(4); + break; + case 5: + LAUNCH_CUSTOM_ATTENTION_MFMA16(5); + break; + case 6: + LAUNCH_CUSTOM_ATTENTION_MFMA16(6); + break; + case 7: + LAUNCH_CUSTOM_ATTENTION_MFMA16(7); + break; + case 8: + LAUNCH_CUSTOM_ATTENTION_MFMA16(8); + break; + case 9: + LAUNCH_CUSTOM_ATTENTION_MFMA16(9); + break; + case 10: + LAUNCH_CUSTOM_ATTENTION_MFMA16(10); + break; + case 11: + LAUNCH_CUSTOM_ATTENTION_MFMA16(11); + break; + 
case 12: + LAUNCH_CUSTOM_ATTENTION_MFMA16(12); + break; + case 13: + LAUNCH_CUSTOM_ATTENTION_MFMA16(13); + break; + case 14: + LAUNCH_CUSTOM_ATTENTION_MFMA16(14); + break; + case 15: + LAUNCH_CUSTOM_ATTENTION_MFMA16(15); + break; + case 16: + LAUNCH_CUSTOM_ATTENTION_MFMA16(16); + break; + default: + TORCH_CHECK(false, "Unsupported gqa ratio: ", gqa_ratio); + break; + } + + dim3 reduce_grid(num_heads, num_seqs); + dim3 reduce_block(head_size); + const int warp_size = 32; + const int npar_loops = DIVIDE_ROUND_UP(max_num_partitions, warp_size); + // reduction kernel supports upto 16 NPAR_loops * 32 (warp_size) * 256 + // (partition size) = 128K context length + switch (npar_loops) { + case 1: + LAUNCH_CUSTOM_REDUCTION(1); + break; + case 2: + LAUNCH_CUSTOM_REDUCTION(2); + break; + case 3: + LAUNCH_CUSTOM_REDUCTION(3); + break; + case 4: + LAUNCH_CUSTOM_REDUCTION(4); + break; + case 5: + LAUNCH_CUSTOM_REDUCTION(5); + break; + case 6: + LAUNCH_CUSTOM_REDUCTION(6); + break; + case 7: + LAUNCH_CUSTOM_REDUCTION(7); + break; + case 8: + LAUNCH_CUSTOM_REDUCTION(8); + break; + case 9: + LAUNCH_CUSTOM_REDUCTION(9); + break; + case 10: + LAUNCH_CUSTOM_REDUCTION(10); + break; + case 11: + LAUNCH_CUSTOM_REDUCTION(11); + break; + case 12: + LAUNCH_CUSTOM_REDUCTION(12); + break; + case 13: + LAUNCH_CUSTOM_REDUCTION(13); + break; + case 14: + LAUNCH_CUSTOM_REDUCTION(14); + break; + case 15: + LAUNCH_CUSTOM_REDUCTION(15); + break; + case 16: + LAUNCH_CUSTOM_REDUCTION(16); + break; + default: + TORCH_CHECK(false, "Unsupported npar_loops: ", npar_loops); + break; + } +} + +#define CALL_CUSTOM_LAUNCHER(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, \ + PSIZE, ALIBI_ENABLED) \ + if (!is_navi) { \ + paged_attention_custom_launcher( \ + out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ + num_kv_heads, scale, block_tables, context_lens, query_start_loc, \ + max_context_len, alibi_slopes, k_scale, v_scale, fp8_out_scale); \ + } else { \ + 
paged_attention_custom_launcher_navi< \ + T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, PSIZE, ALIBI_ENABLED>( \ + out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ + num_kv_heads, scale, block_tables, context_lens, query_start_loc, \ + max_context_len, alibi_slopes, k_scale, v_scale); \ + } #define CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \ OUTT, PSIZE) \ @@ -1807,6 +3496,24 @@ void paged_attention_custom_launcher( break; \ } +bool is_navi_gpu() { + static bool is_cached = false; + static bool result; + + if (!is_cached) { + int device_id; + hipDeviceProp_t deviceProp; + hipGetDevice(&device_id); + hipGetDeviceProperties(&deviceProp, device_id); + + std::string arch = deviceProp.gcnArchName; + result = arch.find("gfx11") == 0 || arch.find("gfx12") == 0; + is_cached = true; + } + + return result; +} + // clang-format off void paged_attention( torch::Tensor& out, // [num_seqs, num_heads, head_size] @@ -1827,6 +3534,8 @@ void paged_attention( torch::Tensor& v_scale, const std::optional& fp8_out_scale) { // clang-format on + bool is_navi = is_navi_gpu(); + const int head_size = query.size(2); if (kv_cache_dtype == "auto") { if (query.dtype() == at::ScalarType::Half) { diff --git a/csrc/rocm/skinny_gemms.cu b/csrc/rocm/skinny_gemms.cu index b3717892db784..e31aa0162628f 100644 --- a/csrc/rocm/skinny_gemms.cu +++ b/csrc/rocm/skinny_gemms.cu @@ -13,14 +13,34 @@ #include "dispatch_utils.h" #include "quantization/fp8/common.cuh" -#if defined(__HIPCC__) && (defined(__gfx90a__) || defined(__gfx942__)) - #define __HIP__MI300_MI250__ +#if defined(__HIPCC__) && \ + (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) + #define __HIP__GFX9__ #endif -#if defined(__HIPCC__) && defined(__gfx942__) - #define __HIP__MI300__ +#if defined(__HIPCC__) && (defined(__gfx942__) || defined(__gfx950__)) + #define __HIP__MI3XX__ #endif +#if defined(__gfx950__) + #define LDS_SIZE 160 * 1024 +#else + #define LDS_SIZE 64 * 1024 +#endif + 
+int get_lds_size() { + static bool is_cached = false; + static int result; + if (is_cached == false) { + auto dprops = at::cuda::getCurrentDeviceProperties(); + std::string device_arch = dprops->gcnArchName; + size_t substring = device_arch.find("gfx95"); + result = (substring == std::string::npos ? 64 * 1024 : 160 * 1024); + is_cached = true; + } + return result; +} + #if defined(NDEBUG) #undef NDEBUG #include @@ -267,7 +287,7 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b, V0 += (s.x + s.y); \ } -#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support +#if defined(__HIP__GFX9__) // TODO: Add NAVI support // This version targets cases where A[] fits LDS capacity template @@ -275,7 +295,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B, const scalar_t* __restrict__ A, scalar_t* C, const int _WvPrGrp, const int CuCount) { - #if defined(__HIP__MI300__) + constexpr int max_lds_len = LDS_SIZE / 2; + #if defined(__HIP__MI3XX__) constexpr bool use_mfma = (std::is_same_v); #else constexpr bool use_mfma = false; @@ -295,13 +316,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) }; //---------------------------------------------------- - // Reserving 64 KB of LDS to have 1 WG / CU + // Reserving 64/160 KB of LDS to have 1 WG / CU // Goal is to bring the activation matrix A to the LDS // and use it across the lifetime of the work group // TODO: When activation matrix is larger than 64 KB // then this is not goint to work! 
//---------------------------------------------------- - __shared__ scalar_t s[1024 * 32]; + __shared__ scalar_t s[max_lds_len]; //---------------------------------------------------- // Fetch the activation matrix to LDS @@ -312,11 +333,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) // - Then the WG will move to another 8 K elements // TODO: Logic below will only work when K is multiple of 8 //---------------------------------------------------- - for (uint32_t k = 0; k < min(K * N, 32 * 1024); + for (uint32_t k = 0; k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) { uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK); - if (k_in >= min(K * N, 32 * 1024)) break; + if (k_in >= min(K * N, max_lds_len)) break; *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in])); } @@ -517,7 +538,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) m += CuCount * _WvPrGrp * YTILE; } } -#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support +#else // !defined(__HIP__GFX9__) TODO: Add NAVI support template __global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B, @@ -525,9 +546,9 @@ __global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B, const int _WvPrGrp, const int CuCount) { UNREACHABLE_CODE } -#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support +#endif // defined(__HIP__GFX9__) TODO: Add NAVI support -#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support +#if defined(__HIP__GFX9__) // TODO: Add NAVI support // This version targets cases where A[] marginally exceeds LDS capacity template @@ -535,7 +556,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) wvSplitK_hf_(const int K, const int M, const scalar_t* B, const scalar_t* __restrict__ A, scalar_t* C, const int _WvPrGrp, const int CuCount) { - #if defined(__HIP__MI300__) + constexpr int max_lds_len = LDS_SIZE / 2; + #if defined(__HIP__MI3XX__) constexpr bool use_mfma = (std::is_same_v); #else constexpr bool use_mfma = false; 
@@ -561,7 +583,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) // TODO: When activation matrix is larger than 64 KB // then this is not goint to work! //---------------------------------------------------- - __shared__ scalar_t s[1024 * 32]; + __shared__ scalar_t s[max_lds_len]; //---------------------------------------------------- // Computation of columns that need to be committed to memory! @@ -598,11 +620,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) // - Then the WG will move to another 8 K elements // TODO: Logic below will only work when K is multiple of 8 //---------------------------------------------------- - for (uint32_t k = 0; k < min(K * N, 32 * 1024); + for (uint32_t k = 0; k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) { uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK); - if (k_in >= min(K * N, 32 * 1024)) break; + if (k_in >= min(K * N, max_lds_len)) break; *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in])); } @@ -686,7 +708,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) // Fetch A activation matrix in interleaved fashion from LDS or memory for (int n = 0; n < N; n++) { - if (k_ + K * n < 32 * 1024) + if (k_ + K * n < max_lds_len) bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n]))); else bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n]))); @@ -817,7 +839,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) } } -#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support +#else // !defined(__HIP__GFX9__) TODO: Add NAVI support template __global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B, @@ -825,9 +847,9 @@ __global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B, const int _WvPrGrp, const int CuCount) { UNREACHABLE_CODE } -#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support +#endif // defined(__HIP__GFX9__) TODO: Add NAVI support -#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support +#if defined(__HIP__GFX9__) // TODO: Add NAVI 
support // This version targets big A[] cases, where it is much larger than LDS capacity template @@ -835,7 +857,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) wvSplitK_hf_big_(const int K, const int M, const scalar_t* B, const scalar_t* __restrict__ A, scalar_t* C, const int _WvPrGrp, const int CuCount) { - #if defined(__HIP__MI300__) + constexpr int max_lds_len = LDS_SIZE / 2; + #if defined(__HIP__MI3XX__) constexpr bool use_mfma = (std::is_same_v); #else constexpr bool use_mfma = false; @@ -855,13 +878,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) }; //---------------------------------------------------- - // Reserving 64 KB of LDS to have 1 WG / CU + // Reserving 64/160 KB of LDS to have 1 WG / CU // Goal is to bring the activation matrix A to the LDS // and use it across the lifetime of the work group // TODO: When activation matrix is larger than 64 KB // then this is not goint to work! //---------------------------------------------------- - __shared__ scalar_t s[1024 * 32]; + __shared__ scalar_t s[max_lds_len]; //---------------------------------------------------- // Computation of columns that need to be committed to memory! @@ -902,11 +925,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) //---------------------------------------------------- #define PCML #ifndef PCML - for (uint32_t k = 0; k < min(K * N, 32 * 1024); + for (uint32_t k = 0; k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) { uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK); - if (k_in >= min(K * N, 32 * 1024)) break; + if (k_in >= min(K * N, max_lds_len)) break; *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in])); } @@ -916,7 +939,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) #define TUC (THRDS * UNRL * A_CHUNK) uint32_t kBase = 0; // find biggest k size that fits in LDS - uint32_t kFit = (32 * 1024) / N; + uint32_t kFit = (max_lds_len) / N; // kFit = (kFit%TWC==0) ? 
kFit : (kFit-kFit%TWC+TWC); //round up to multiple // of TUC kFit = (kFit % TUC == 0) @@ -1164,7 +1187,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) } } } -#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support +#else // !defined(__HIP__GFX9__) TODO: Add NAVI support template __global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B, @@ -1172,7 +1195,7 @@ __global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B, const int _WvPrGrp, const int CuCount) { UNREACHABLE_CODE } -#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support +#endif // defined(__HIP__GFX9__) TODO: Add NAVI support int mindiv(int N, int div1, int div2) { int nPrRnd = div1 * div2; @@ -1222,17 +1245,18 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b, const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + const int max_lds_len = get_lds_size() / 2; #define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \ _N) \ { \ dim3 block(64, _WvPrGrp); \ - if ((K_in * N_in <= 32 * 1024) && (M_in % _YTILEs == 0)) { \ + if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \ int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \ wvSplitK_hf_sml_ \ <<>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \ CuCount); \ - } else if (K_in * N_in <= 32 * 1024 * 1.2) { \ + } else if (K_in * N_in <= max_lds_len * 1.2) { \ int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \ wvSplitK_hf_ \ <<>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \ @@ -1272,7 +1296,7 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b, return out_c; } -#if defined(__HIP__MI300__) // TODO: Add NAVI support +#if defined(__HIP__MI3XX__) // TODO: Add NAVI support template __global__ void __launch_bounds__(WvPrGrp* THRDS) @@ -1281,6 +1305,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) const float* __restrict__ s_A, const float* __restrict__ s_B, const int _WvPrGrp, 
const int CuCount) { + constexpr int max_lds_len = LDS_SIZE; using scalar8 = __attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float; using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int; @@ -1296,10 +1321,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) scalar8 h8; }; - __shared__ fp8_t s[1024 * 64]; + __shared__ fp8_t s[max_lds_len]; for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK; - k < min(K * N, 64 * 1024); k += THRDS * WvPrGrp * A_CHUNK) { + k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) { *((bigType*)(&s[k])) = *((bigType*)(&A[k])); } __syncthreads(); @@ -1436,7 +1461,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) m += CuCount * _WvPrGrp * YTILE; } } -#else // !defined(__HIP__MI300__) TODO: Add NAVI support +#else // !defined(__HIP__MI3XX__) TODO: Add NAVI support template __global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, @@ -1446,9 +1471,9 @@ __global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, const int _WvPrGrp, const int CuCount) { UNREACHABLE_CODE } -#endif // defined(__HIP__MI300__) TODO: Add NAVI support +#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support -#if defined(__HIP__MI300__) // TODO: Add NAVI support +#if defined(__HIP__MI3XX__) // TODO: Add NAVI support template __global__ void __launch_bounds__(WvPrGrp* THRDS) @@ -1456,6 +1481,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) const fp8_t* __restrict__ A, scalar_t* C, const float* __restrict__ s_A, const float* __restrict__ s_B, const int _WvPrGrp, const int CuCount) { + constexpr int max_lds_len = LDS_SIZE; using scalar8 = __attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float; using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int; @@ -1471,10 +1497,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) scalar8 h8; }; - __shared__ fp8_t s[1024 * 64]; + __shared__ fp8_t s[max_lds_len]; for (uint32_t k = (threadIdx.y * THRDS + 
threadIdx.x) * A_CHUNK; - k < min(K * N, 64 * 1024); k += THRDS * WvPrGrp * A_CHUNK) { + k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) { *((bigType*)(&s[k])) = *((bigType*)(&A[k])); } __syncthreads(); @@ -1517,7 +1543,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) uint32_t k_ = k + threadIdx.x * A_CHUNK; if (k_ >= K) break; for (int n = 0; n < N; n++) { - if (k_ + K * n < 64 * 1024) + if (k_ + K * n < max_lds_len) bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n]))); else bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n]))); @@ -1608,7 +1634,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) m += CuCount * _WvPrGrp * YTILE; } } -#else // !defined(__HIP__MI300__) TODO: Add NAVI support +#else // !defined(__HIP__MI3XX__) TODO: Add NAVI support template __global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M, @@ -1618,7 +1644,7 @@ __global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M, const int CuCount) { UNREACHABLE_CODE } -#endif // defined(__HIP__MI300__) TODO: Add NAVI support +#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c, at::Tensor& scale_a, at::Tensor& scale_b, @@ -1638,12 +1664,13 @@ void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c, dim3 grid(CuCount); const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + const int max_lds_len = get_lds_size(); #define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \ _N) \ { \ dim3 block(64, _WvPrGrp); \ - if ((K_in * N_in <= 64 * 1024) && (M_in % _YTILEs == 0)) { \ + if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \ int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \ wvSplitKQ_hf_sml_ \ <<>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \ diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 4eda1aaccc6b3..371894c56a79b 100644 --- 
a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -482,41 +482,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor page_table, float scale) -> ()"); ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode); - // Mamba selective scan kernel - ops.def( - "selective_scan_fwd(Tensor! u, Tensor! delta," - "Tensor! A, Tensor! B, Tensor! C," - "Tensor? D_, Tensor!? z_, Tensor? delta_bias_," - "bool delta_softplus," - "Tensor? query_start_loc," - "Tensor? cache_indices," - "Tensor? has_initial_state," - "Tensor! ssm_states," - "int pad_slot_id) -> ()"); - ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd); - - ops.def( - "causal_conv1d_update(Tensor! x," - "Tensor! conv_state," - "Tensor! weight," - "Tensor? bias_," - "bool silu_activation," - "Tensor? cache_seqlens_," - "Tensor? conv_state_indices," - "int pad_slot_id) -> ()"); - ops.impl("causal_conv1d_update", torch::kCUDA, &causal_conv1d_update); - - ops.def( - "causal_conv1d_fwd(Tensor! x, Tensor! weight," - "Tensor? bias_," - "Tensor!? conv_states," - "Tensor? query_start_loc," - "Tensor? cache_indices," - "Tensor? has_initial_state," - "bool silu_activation," - "int pad_slot_id) -> ()"); - ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd); - // Compute NVFP4 block quantized tensor. ops.def( "scaled_fp4_quant(Tensor! output, Tensor input," @@ -584,6 +549,41 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("dynamic_scaled_int8_quant", torch::kCUDA, &dynamic_scaled_int8_quant); + // Mamba selective scan kernel + ops.def( + "selective_scan_fwd(Tensor! u, Tensor! delta," + "Tensor! A, Tensor! B, Tensor! C," + "Tensor? D_, Tensor!? z_, Tensor? delta_bias_," + "bool delta_softplus," + "Tensor? query_start_loc," + "Tensor? cache_indices," + "Tensor? has_initial_state," + "Tensor! ssm_states," + "int pad_slot_id) -> ()"); + ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd); + + ops.def( + "causal_conv1d_update(Tensor! x," + "Tensor! 
conv_state," + "Tensor! weight," + "Tensor? bias_," + "bool silu_activation," + "Tensor? cache_seqlens_," + "Tensor? conv_state_indices," + "int pad_slot_id) -> ()"); + ops.impl("causal_conv1d_update", torch::kCUDA, &causal_conv1d_update); + + ops.def( + "causal_conv1d_fwd(Tensor! x, Tensor! weight," + "Tensor? bias_," + "Tensor!? conv_states," + "Tensor? query_start_loc," + "Tensor? cache_indices," + "Tensor? has_initial_state," + "bool silu_activation," + "int pad_slot_id) -> ()"); + ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd); + #ifndef USE_ROCM // reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel ops.def( diff --git a/docker/Dockerfile b/docker/Dockerfile index 97a7879da8767..24986a1b73b1b 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -2,8 +2,8 @@ # to run the OpenAI compatible server. # Please update any changes made here to -# docs/source/contributing/dockerfile/dockerfile.md and -# docs/source/assets/contributing/dockerfile-stages-dependency.png +# docs/contributing/dockerfile/dockerfile.md and +# docs/assets/contributing/dockerfile-stages-dependency.png ARG CUDA_VERSION=12.8.1 #################### BASE BUILD IMAGE #################### @@ -189,6 +189,8 @@ WORKDIR /vllm-workspace ENV DEBIAN_FRONTEND=noninteractive ARG TARGETPLATFORM +SHELL ["/bin/bash", "-c"] + RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment @@ -255,15 +257,17 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist RUN --mount=type=cache,target=/root/.cache/uv \ . /etc/environment && \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ - # uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.4/flashinfer_python-0.2.4+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \ - # TESTING: install FlashInfer from source to test 2.7.0 final RC + # FlashInfer already has a wheel for PyTorch 2.7.0 and CUDA 12.8.
This is enough for CI use if [[ "$CUDA_VERSION" == 12.8* ]]; then \ - export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'; \ + uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl; \ else \ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0+PTX'; \ - fi && \ - export FLASHINFER_ENABLE_AOT=1; \ - uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@21ea1d2545f74782b91eb8c08fd503ac4c0743fc" ; \ + CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ + if [ "$CUDA_MAJOR" -lt 12 ]; then \ + export FLASHINFER_ENABLE_SM90=0; \ + fi; \ + uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@21ea1d2545f74782b91eb8c08fd503ac4c0743fc" ; \ + fi \ fi COPY examples examples COPY benchmarks benchmarks @@ -273,7 +277,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ . /etc/environment && \ uv pip list -# Although we build Flashinfer with AOT mode, there's still +# Even when we build Flashinfer with AOT mode, there's still # some issues w.r.t. JIT compilation. Therefore we need to # install build dependencies for JIT compilation. 
# TODO: Remove this once FlashInfer AOT wheel is fixed @@ -301,8 +305,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4" # install development dependencies (for testing) -RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements/dev.txt +RUN --mount=type=cache,target=/root/.cache/uv \ + CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ + if [ "$CUDA_MAJOR" -ge 12 ]; then \ + uv pip install --system -r requirements/dev.txt; \ + fi # install development dependencies (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ @@ -321,7 +328,9 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1 # will not be imported by other tests RUN mkdir test_docs RUN mv docs test_docs/ +RUN cp -r examples test_docs/ RUN mv vllm test_docs/ +RUN mv mkdocs.yaml test_docs/ #################### TEST IMAGE #################### #################### OPENAI API SERVER #################### diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index c647d9036f400..5395b3884fb52 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -51,9 +51,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --upgrade pip && \ uv pip install -r requirements/cpu.txt -RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install intel-openmp==2024.2.1 intel_extension_for_pytorch==2.6.0 - ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD" RUN echo 'ulimit -c 0' >> ~/.bashrc diff --git a/docker/Dockerfile.neuron b/docker/Dockerfile.neuron index 2b63fe301bac6..259dc5a23f78b 100644 --- a/docker/Dockerfile.neuron +++ b/docker/Dockerfile.neuron @@ -1,6 +1,6 @@ # default base image # https://gallery.ecr.aws/neuron/pytorch-inference-neuronx -ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04" +ARG 
BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.6.0-neuronx-py310-sdk2.23.0-ubuntu22.04" FROM $BASE_IMAGE @@ -22,8 +22,7 @@ WORKDIR ${APP_MOUNT}/vllm RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity -RUN python3 -m pip install sentencepiece transformers==4.48.0 -U -RUN python3 -m pip install neuronx-cc==2.17.194.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U +RUN python3 -m pip install neuronx-cc==2.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install pytest # uninstall transformers-neuronx package explicitly to avoid version conflict @@ -49,6 +48,8 @@ RUN python3 -m pip install -e tests/vllm_test_utils # FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps +RUN python3 -m pip install sentencepiece transformers==4.48.0 -U + # overwrite entrypoint to run bash script RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index 222b9c158e5e0..45efcbde698b2 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG FA_BRANCH="1a7f4dfa" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" -ARG AITER_BRANCH="5a77249" +ARG AITER_BRANCH="c1debd8" ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base diff --git a/docker/Dockerfile.s390x b/docker/Dockerfile.s390x index 9c10cd56b5949..4e89bb3057c5e 100644 --- a/docker/Dockerfile.s390x +++ b/docker/Dockerfile.s390x @@ -84,16 +84,40 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \ rustup 
default stable && \ rustup show +FROM python-install AS torch +ARG TORCH_VERSION=2.7.0 +ENV _GLIBCXX_USE_CXX11_ABI=1 +ENV CARGO_HOME=/root/.cargo +ENV RUSTUP_HOME=/root/.rustup +ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" + +WORKDIR /tmp + +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \ + --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \ + git clone https://github.com/pytorch/pytorch.git && \ + cd pytorch && \ + git checkout v2.7.0 && \ + git submodule sync && \ + git submodule update --init --recursive && \ + uv pip install cmake ninja && \ + uv pip install -r requirements.txt && \ + python setup.py bdist_wheel + + FROM python-install AS torch-vision # Install torchvision -ARG TORCH_VERSION=2.7.0.dev20250304 +ARG TORCH_VERSION=2.7.0 ARG TORCH_VISION_VERSION=v0.20.1 WORKDIR /tmp RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \ git clone https://github.com/pytorch/vision.git && \ cd vision && \ git checkout $TORCH_VISION_VERSION && \ - uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \ + TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \ + uv pip install -v $TORCH_WHL_FILE && \ python setup.py bdist_wheel FROM python-install AS hf-xet-builder @@ -138,15 +162,17 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \ --mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \ --mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \ + --mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \ sed -i '/^torch/d' requirements/build.txt && \ ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \ VISION_WHL_FILE=$(ls
/tmp/vision-wheels/*.whl | head -n 1) && \ HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl | head -n 1) && \ + TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \ uv pip install -v \ $ARROW_WHL_FILE \ $VISION_WHL_FILE \ $HF_XET_WHL_FILE \ - --extra-index-url https://download.pytorch.org/whl/nightly/cpu \ + $TORCH_WHL_FILE \ --index-strategy unsafe-best-match \ -r requirements/build.txt \ -r requirements/cpu.txt diff --git a/docs/.nav.yml b/docs/.nav.yml new file mode 100644 index 0000000000000..a9c594c291777 --- /dev/null +++ b/docs/.nav.yml @@ -0,0 +1,66 @@ +nav: + - Home: + - vLLM: README.md + - Getting Started: + - getting_started/quickstart.md + - getting_started/installation + - Examples: + - Offline Inference: examples/offline_inference + - Online Serving: examples/online_serving + - Others: examples/others + - Quick Links: + - User Guide: usage/README.md + - Developer Guide: contributing/README.md + - API Reference: api/README.md + - CLI Reference: cli/README.md + - Timeline: + - Roadmap: https://roadmap.vllm.ai + - Releases: https://github.com/vllm-project/vllm/releases + - User Guide: + - Summary: usage/README.md + - usage/v1_guide.md + - General: + - usage/* + - Inference and Serving: + - serving/offline_inference.md + - serving/openai_compatible_server.md + - serving/* + - serving/integrations + - Deployment: + - deployment/* + - deployment/frameworks + - deployment/integrations + - Training: training + - Configuration: + - Summary: configuration/README.md + - configuration/* + - Models: + - models/supported_models.md + - models/generative_models.md + - models/pooling_models.md + - models/extensions + - Features: + - features/compatibility_matrix.md + - features/* + - features/quantization + - Developer Guide: + - Summary: contributing/README.md + - General: + - glob: contributing/* + flatten_single_child_sections: true + - Model Implementation: contributing/model + - Design Documents: + - V0: design + - V1: design/v1 + - API Reference: + - 
Summary: api/README.md + - Contents: + - glob: api/vllm/* + preserve_directory_names: true + - CLI Reference: + - Summary: cli/README.md + - Community: + - community/* + - Blog: https://blog.vllm.ai + - Forum: https://discuss.vllm.ai + - Slack: https://slack.vllm.ai diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index d3b429dfb9257..0000000000000 --- a/docs/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line, and also -# from the environment for the first two. -SPHINXOPTS ?= -SPHINXBUILD ?= sphinx-build -SOURCEDIR = source -BUILDDIR = build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -clean: - @$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - rm -rf "$(SOURCEDIR)/getting_started/examples" - rm -rf "$(SOURCEDIR)/api/vllm" diff --git a/docs/README.md b/docs/README.md index dcd5e759dfa88..0c6aff5fa07c3 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,43 +1,50 @@ -# vLLM documents +# Welcome to vLLM -## Build the docs +
+ ![](./assets/logos/vllm-logo-text-light.png){ align="center" alt="vLLM" class="no-scaled-link" width="60%" } +
-- Make sure in `docs` directory +

+Easy, fast, and cheap LLM serving for everyone + +

-```bash -cd docs -``` +

+ +Star +Watch +Fork +

-- Install the dependencies: +vLLM is a fast and easy-to-use library for LLM inference and serving. -```bash -pip install -r ../requirements/docs.txt -``` +Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry. -- Clean the previous build (optional but recommended): +vLLM is fast with: -```bash -make clean -``` +- State-of-the-art serving throughput +- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) +- Continuous batching of incoming requests +- Fast model execution with CUDA/HIP graph +- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8 +- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer. +- Speculative decoding +- Chunked prefill -- Generate the HTML documentation: +vLLM is flexible and easy to use with: -```bash -make html -``` +- Seamless integration with popular HuggingFace models +- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more +- Tensor parallelism and pipeline parallelism support for distributed inference +- Streaming outputs +- OpenAI-compatible API server +- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, IBM Power CPUs, TPU, and AWS Trainium and Inferentia Accelerators. +- Prefix caching support +- Multi-lora support -## Open the docs with your browser +For more information, check out the following: -- Serve the documentation locally: - -```bash -python -m http.server -d build/html/ -``` - -This will start a local server at http://localhost:8000. You can now open your browser and view the documentation. 
- -If port 8000 is already in use, you can specify a different port, for example: - -```bash -python -m http.server 3000 -d build/html/ -``` +- [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention) +- [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023) +- [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al. +- [vLLM Meetups][meetups] diff --git a/docs/api/README.md b/docs/api/README.md new file mode 100644 index 0000000000000..5c7b2ca79ee2c --- /dev/null +++ b/docs/api/README.md @@ -0,0 +1,107 @@ +# Summary + +[](){ #configuration } + +## Configuration + +API documentation for vLLM's configuration classes. + +- [vllm.config.ModelConfig][] +- [vllm.config.CacheConfig][] +- [vllm.config.TokenizerPoolConfig][] +- [vllm.config.LoadConfig][] +- [vllm.config.ParallelConfig][] +- [vllm.config.SchedulerConfig][] +- [vllm.config.DeviceConfig][] +- [vllm.config.SpeculativeConfig][] +- [vllm.config.LoRAConfig][] +- [vllm.config.PromptAdapterConfig][] +- [vllm.config.MultiModalConfig][] +- [vllm.config.PoolerConfig][] +- [vllm.config.DecodingConfig][] +- [vllm.config.ObservabilityConfig][] +- [vllm.config.KVTransferConfig][] +- [vllm.config.CompilationConfig][] +- [vllm.config.VllmConfig][] + +[](){ #offline-inference-api } + +## Offline Inference + +LLM Class. + +- [vllm.LLM][] + +LLM Inputs. + +- [vllm.inputs.PromptType][] +- [vllm.inputs.TextPrompt][] +- [vllm.inputs.TokensPrompt][] + +## vLLM Engines + +Engine classes for offline and online inference. + +- [vllm.LLMEngine][] +- [vllm.AsyncLLMEngine][] + +## Inference Parameters + +Inference parameters for vLLM APIs. + +[](){ #sampling-params } +[](){ #pooling-params } + +- [vllm.SamplingParams][] +- [vllm.PoolingParams][] + +[](){ #multi-modality } + +## Multi-Modality + +vLLM provides experimental support for multi-modal models through the [vllm.multimodal][] package. 
+ +Multi-modal inputs can be passed alongside text and token prompts to [supported models][supported-mm-models] +via the `multi_modal_data` field in [vllm.inputs.PromptType][]. + +Looking to add your own multi-modal model? Please follow the instructions listed [here][supports-multimodal]. + +- [vllm.multimodal.MULTIMODAL_REGISTRY][] + +### Inputs + +User-facing inputs. + +- [vllm.multimodal.inputs.MultiModalDataDict][] + +Internal data structures. + +- [vllm.multimodal.inputs.PlaceholderRange][] +- [vllm.multimodal.inputs.NestedTensors][] +- [vllm.multimodal.inputs.MultiModalFieldElem][] +- [vllm.multimodal.inputs.MultiModalFieldConfig][] +- [vllm.multimodal.inputs.MultiModalKwargsItem][] +- [vllm.multimodal.inputs.MultiModalKwargs][] +- [vllm.multimodal.inputs.MultiModalInputs][] + +### Data Parsing + +- [vllm.multimodal.parse][] + +### Data Processing + +- [vllm.multimodal.processing][] + +### Memory Profiling + +- [vllm.multimodal.profiling][] + +### Registry + +- [vllm.multimodal.registry][] + +## Model Development + +- [vllm.model_executor.models.interfaces_base][] +- [vllm.model_executor.models.interfaces][] +- [vllm.model_executor.models.adapters][] diff --git a/docs/api/vllm/.meta.yml b/docs/api/vllm/.meta.yml new file mode 100644 index 0000000000000..c15adfec644cf --- /dev/null +++ b/docs/api/vllm/.meta.yml @@ -0,0 +1,2 @@ +search: + boost: 0.5 diff --git a/docs/source/assets/contributing/dockerfile-stages-dependency.png b/docs/assets/contributing/dockerfile-stages-dependency.png similarity index 100% rename from docs/source/assets/contributing/dockerfile-stages-dependency.png rename to docs/assets/contributing/dockerfile-stages-dependency.png diff --git a/docs/source/assets/deployment/anything-llm-chat-with-doc.png b/docs/assets/deployment/anything-llm-chat-with-doc.png similarity index 100% rename from docs/source/assets/deployment/anything-llm-chat-with-doc.png rename to docs/assets/deployment/anything-llm-chat-with-doc.png diff --git 
a/docs/source/assets/deployment/anything-llm-chat-without-doc.png b/docs/assets/deployment/anything-llm-chat-without-doc.png similarity index 100% rename from docs/source/assets/deployment/anything-llm-chat-without-doc.png rename to docs/assets/deployment/anything-llm-chat-without-doc.png diff --git a/docs/source/assets/deployment/anything-llm-provider.png b/docs/assets/deployment/anything-llm-provider.png similarity index 100% rename from docs/source/assets/deployment/anything-llm-provider.png rename to docs/assets/deployment/anything-llm-provider.png diff --git a/docs/source/assets/deployment/anything-llm-upload-doc.png b/docs/assets/deployment/anything-llm-upload-doc.png similarity index 100% rename from docs/source/assets/deployment/anything-llm-upload-doc.png rename to docs/assets/deployment/anything-llm-upload-doc.png diff --git a/docs/source/assets/deployment/architecture_helm_deployment.png b/docs/assets/deployment/architecture_helm_deployment.png similarity index 100% rename from docs/source/assets/deployment/architecture_helm_deployment.png rename to docs/assets/deployment/architecture_helm_deployment.png diff --git a/docs/source/assets/deployment/chatbox-chat.png b/docs/assets/deployment/chatbox-chat.png similarity index 100% rename from docs/source/assets/deployment/chatbox-chat.png rename to docs/assets/deployment/chatbox-chat.png diff --git a/docs/source/assets/deployment/chatbox-settings.png b/docs/assets/deployment/chatbox-settings.png similarity index 100% rename from docs/source/assets/deployment/chatbox-settings.png rename to docs/assets/deployment/chatbox-settings.png diff --git a/docs/source/assets/deployment/dify-chat.png b/docs/assets/deployment/dify-chat.png similarity index 100% rename from docs/source/assets/deployment/dify-chat.png rename to docs/assets/deployment/dify-chat.png diff --git a/docs/source/assets/deployment/dify-create-chatbot.png b/docs/assets/deployment/dify-create-chatbot.png similarity index 100% rename from 
docs/source/assets/deployment/dify-create-chatbot.png rename to docs/assets/deployment/dify-create-chatbot.png diff --git a/docs/source/assets/deployment/dify-settings.png b/docs/assets/deployment/dify-settings.png similarity index 100% rename from docs/source/assets/deployment/dify-settings.png rename to docs/assets/deployment/dify-settings.png diff --git a/docs/source/assets/deployment/open_webui.png b/docs/assets/deployment/open_webui.png similarity index 100% rename from docs/source/assets/deployment/open_webui.png rename to docs/assets/deployment/open_webui.png diff --git a/docs/source/assets/deployment/streamlit-chat.png b/docs/assets/deployment/streamlit-chat.png similarity index 100% rename from docs/source/assets/deployment/streamlit-chat.png rename to docs/assets/deployment/streamlit-chat.png diff --git a/docs/source/assets/design/arch_overview/entrypoints.excalidraw.png b/docs/assets/design/arch_overview/entrypoints.excalidraw.png similarity index 100% rename from docs/source/assets/design/arch_overview/entrypoints.excalidraw.png rename to docs/assets/design/arch_overview/entrypoints.excalidraw.png diff --git a/docs/source/assets/design/arch_overview/llm_engine.excalidraw.png b/docs/assets/design/arch_overview/llm_engine.excalidraw.png similarity index 100% rename from docs/source/assets/design/arch_overview/llm_engine.excalidraw.png rename to docs/assets/design/arch_overview/llm_engine.excalidraw.png diff --git a/docs/source/assets/design/hierarchy.png b/docs/assets/design/hierarchy.png similarity index 100% rename from docs/source/assets/design/hierarchy.png rename to docs/assets/design/hierarchy.png diff --git a/docs/source/assets/design/v1/metrics/intervals-1.png b/docs/assets/design/v1/metrics/intervals-1.png similarity index 100% rename from docs/source/assets/design/v1/metrics/intervals-1.png rename to docs/assets/design/v1/metrics/intervals-1.png diff --git a/docs/source/assets/design/v1/metrics/intervals-2.png 
b/docs/assets/design/v1/metrics/intervals-2.png similarity index 100% rename from docs/source/assets/design/v1/metrics/intervals-2.png rename to docs/assets/design/v1/metrics/intervals-2.png diff --git a/docs/source/assets/design/v1/metrics/intervals-3.png b/docs/assets/design/v1/metrics/intervals-3.png similarity index 100% rename from docs/source/assets/design/v1/metrics/intervals-3.png rename to docs/assets/design/v1/metrics/intervals-3.png diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-1.png b/docs/assets/design/v1/prefix_caching/example-time-1.png similarity index 100% rename from docs/source/assets/design/v1/prefix_caching/example-time-1.png rename to docs/assets/design/v1/prefix_caching/example-time-1.png diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-3.png b/docs/assets/design/v1/prefix_caching/example-time-3.png similarity index 100% rename from docs/source/assets/design/v1/prefix_caching/example-time-3.png rename to docs/assets/design/v1/prefix_caching/example-time-3.png diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-4.png b/docs/assets/design/v1/prefix_caching/example-time-4.png similarity index 100% rename from docs/source/assets/design/v1/prefix_caching/example-time-4.png rename to docs/assets/design/v1/prefix_caching/example-time-4.png diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-5.png b/docs/assets/design/v1/prefix_caching/example-time-5.png similarity index 100% rename from docs/source/assets/design/v1/prefix_caching/example-time-5.png rename to docs/assets/design/v1/prefix_caching/example-time-5.png diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-6.png b/docs/assets/design/v1/prefix_caching/example-time-6.png similarity index 100% rename from docs/source/assets/design/v1/prefix_caching/example-time-6.png rename to docs/assets/design/v1/prefix_caching/example-time-6.png diff --git 
a/docs/source/assets/design/v1/prefix_caching/example-time-7.png b/docs/assets/design/v1/prefix_caching/example-time-7.png similarity index 100% rename from docs/source/assets/design/v1/prefix_caching/example-time-7.png rename to docs/assets/design/v1/prefix_caching/example-time-7.png diff --git a/docs/source/assets/design/v1/prefix_caching/free.png b/docs/assets/design/v1/prefix_caching/free.png similarity index 100% rename from docs/source/assets/design/v1/prefix_caching/free.png rename to docs/assets/design/v1/prefix_caching/free.png diff --git a/docs/source/assets/design/v1/prefix_caching/overview.png b/docs/assets/design/v1/prefix_caching/overview.png similarity index 100% rename from docs/source/assets/design/v1/prefix_caching/overview.png rename to docs/assets/design/v1/prefix_caching/overview.png diff --git a/docs/source/assets/features/disagg_prefill/abstraction.jpg b/docs/assets/features/disagg_prefill/abstraction.jpg similarity index 100% rename from docs/source/assets/features/disagg_prefill/abstraction.jpg rename to docs/assets/features/disagg_prefill/abstraction.jpg diff --git a/docs/source/assets/features/disagg_prefill/overview.jpg b/docs/assets/features/disagg_prefill/overview.jpg similarity index 100% rename from docs/source/assets/features/disagg_prefill/overview.jpg rename to docs/assets/features/disagg_prefill/overview.jpg diff --git a/docs/source/assets/kernel/k_vecs.png b/docs/assets/kernel/k_vecs.png similarity index 100% rename from docs/source/assets/kernel/k_vecs.png rename to docs/assets/kernel/k_vecs.png diff --git a/docs/source/assets/kernel/key.png b/docs/assets/kernel/key.png similarity index 100% rename from docs/source/assets/kernel/key.png rename to docs/assets/kernel/key.png diff --git a/docs/source/assets/kernel/logits_vec.png b/docs/assets/kernel/logits_vec.png similarity index 100% rename from docs/source/assets/kernel/logits_vec.png rename to docs/assets/kernel/logits_vec.png diff --git a/docs/source/assets/kernel/q_vecs.png 
b/docs/assets/kernel/q_vecs.png similarity index 100% rename from docs/source/assets/kernel/q_vecs.png rename to docs/assets/kernel/q_vecs.png diff --git a/docs/source/assets/kernel/query.png b/docs/assets/kernel/query.png similarity index 100% rename from docs/source/assets/kernel/query.png rename to docs/assets/kernel/query.png diff --git a/docs/source/assets/kernel/v_vec.png b/docs/assets/kernel/v_vec.png similarity index 100% rename from docs/source/assets/kernel/v_vec.png rename to docs/assets/kernel/v_vec.png diff --git a/docs/source/assets/kernel/value.png b/docs/assets/kernel/value.png similarity index 100% rename from docs/source/assets/kernel/value.png rename to docs/assets/kernel/value.png diff --git a/docs/source/assets/logos/vllm-logo-only-light.ico b/docs/assets/logos/vllm-logo-only-light.ico similarity index 100% rename from docs/source/assets/logos/vllm-logo-only-light.ico rename to docs/assets/logos/vllm-logo-only-light.ico diff --git a/docs/source/assets/logos/vllm-logo-only-light.png b/docs/assets/logos/vllm-logo-only-light.png similarity index 100% rename from docs/source/assets/logos/vllm-logo-only-light.png rename to docs/assets/logos/vllm-logo-only-light.png diff --git a/docs/source/assets/logos/vllm-logo-text-dark.png b/docs/assets/logos/vllm-logo-text-dark.png similarity index 100% rename from docs/source/assets/logos/vllm-logo-text-dark.png rename to docs/assets/logos/vllm-logo-text-dark.png diff --git a/docs/source/assets/logos/vllm-logo-text-light.png b/docs/assets/logos/vllm-logo-text-light.png similarity index 100% rename from docs/source/assets/logos/vllm-logo-text-light.png rename to docs/assets/logos/vllm-logo-text-light.png diff --git a/docs/cli/README.md b/docs/cli/README.md new file mode 100644 index 0000000000000..5feb316d61a89 --- /dev/null +++ b/docs/cli/README.md @@ -0,0 +1,179 @@ +# vLLM CLI Guide + +The vllm command-line tool is used to run and manage vLLM models. 
You can start by viewing the help message with: + +``` +vllm --help +``` + +Available Commands: + +``` +vllm {chat,complete,serve,bench,collect-env,run-batch} +``` + +## Table of Contents + +- [serve](#serve) +- [chat](#chat) +- [complete](#complete) +- [bench](#bench) + - [latency](#latency) + - [serve](#serve-1) + - [throughput](#throughput) +- [collect-env](#collect-env) +- [run-batch](#run-batch) +- [More Help](#more-help) + +## serve + +Start the vLLM OpenAI Compatible API server. + +Examples: + +```bash +# Start with a model +vllm serve meta-llama/Llama-2-7b-hf + +# Specify the port +vllm serve meta-llama/Llama-2-7b-hf --port 8100 + +# Check with --help for more options +# To list all groups +vllm serve --help=listgroup + +# To view an argument group +vllm serve --help=ModelConfig + +# To view a single argument +vllm serve --help=max-num-seqs + +# To search by keyword +vllm serve --help=max +``` + +## chat + +Generate chat completions via the running API server. + +Examples: + +```bash +# Directly connect to localhost API without arguments +vllm chat + +# Specify API url +vllm chat --url http://{vllm-serve-host}:{vllm-serve-port}/v1 + +# Quick chat with a single prompt +vllm chat --quick "hi" +``` + +## complete + +Generate text completions based on the given prompt via the running API server. + +Examples: + +```bash +# Directly connect to localhost API without arguments +vllm complete + +# Specify API url +vllm complete --url http://{vllm-serve-host}:{vllm-serve-port}/v1 + +# Quick complete with a single prompt +vllm complete --quick "The future of AI is" +``` + +## bench + +Run benchmark tests for latency, online serving throughput, and offline inference throughput. + +Available Commands: + +```bash +vllm bench {latency, serve, throughput} +``` + +### latency + +Benchmark the latency of a single batch of requests. 
+ +Example: + +```bash +vllm bench latency \ + --model meta-llama/Llama-3.2-1B-Instruct \ + --input-len 32 \ + --output-len 1 \ + --enforce-eager \ + --load-format dummy +``` + +### serve + +Benchmark the online serving throughput. + +Example: + +```bash +vllm bench serve \ + --model meta-llama/Llama-3.2-1B-Instruct \ + --host server-host \ + --port server-port \ + --random-input-len 32 \ + --random-output-len 4 \ + --num-prompts 5 +``` + +### throughput + +Benchmark offline inference throughput. + +Example: + +```bash +vllm bench throughput \ + --model meta-llama/Llama-3.2-1B-Instruct \ + --input-len 32 \ + --output-len 1 \ + --enforce-eager \ + --load-format dummy +``` + +## collect-env + +Start collecting environment information. + +```bash +vllm collect-env +``` + +## run-batch + +Run batch prompts and write results to file. + +Examples: + +```bash +# Running with a local file +vllm run-batch \ + -i offline_inference/openai_batch/openai_example_batch.jsonl \ + -o results.jsonl \ + --model meta-llama/Meta-Llama-3-8B-Instruct + +# Using remote file +vllm run-batch \ + -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \ + -o results.jsonl \ + --model meta-llama/Meta-Llama-3-8B-Instruct +``` + +## More Help + +For detailed options of any subcommand, use: + +```bash +vllm <subcommand> --help +``` diff --git a/docs/source/community/meetups.md b/docs/community/meetups.md similarity index 98% rename from docs/source/community/meetups.md rename to docs/community/meetups.md index aa1a71c86c0a6..8ea42e3cad185 100644 --- a/docs/source/community/meetups.md +++ b/docs/community/meetups.md @@ -1,6 +1,7 @@ -(meetups)= - -# vLLM Meetups +--- +title: Meetups +--- +[](){ #meetups } We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. 
Please find the materials of our previous meetups below: diff --git a/docs/source/community/sponsors.md b/docs/community/sponsors.md similarity index 100% rename from docs/source/community/sponsors.md rename to docs/community/sponsors.md diff --git a/docs/configuration/README.md b/docs/configuration/README.md new file mode 100644 index 0000000000000..6a8fbc79f4aff --- /dev/null +++ b/docs/configuration/README.md @@ -0,0 +1,9 @@ +# Configuration Options + +This section lists the most common options for running vLLM. + +There are three main levels of configuration, from highest priority to lowest priority: + +- [Request parameters][completions-api] and [input arguments][sampling-params] +- [Engine arguments](./engine_args.md) +- [Environment variables](./env_vars.md) diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md new file mode 100644 index 0000000000000..a1283a503a6df --- /dev/null +++ b/docs/configuration/conserving_memory.md @@ -0,0 +1,144 @@ +# Conserving Memory + +Large models might cause your machine to run out of memory (OOM). Here are some options that help alleviate this problem. + +## Tensor Parallelism (TP) + +Tensor parallelism (`tensor_parallel_size` option) can be used to split the model across multiple GPUs. + +The following code splits the model across 2 GPUs. + +```python +from vllm import LLM + +llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", + tensor_parallel_size=2) +``` + +!!! warning + To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. [torch.cuda.set_device][]) + before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`. + + To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable. + +!!! 
note + With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). + + You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. + +## Quantization + +Quantized models take less memory at the cost of lower precision. + +Statically quantized models can be downloaded from HF Hub (some popular ones are available at [Red Hat AI](https://huggingface.co/RedHatAI)) +and used directly without extra configuration. + +Dynamic quantization is also supported via the `quantization` option -- see [here][quantization-index] for more details. + +## Context length and batch size + +You can further reduce memory usage by limiting the context length of the model (`max_model_len` option) +and the maximum batch size (`max_num_seqs` option). + +```python +from vllm import LLM + +llm = LLM(model="adept/fuyu-8b", + max_model_len=2048, + max_num_seqs=2) +``` + +## Reduce CUDA Graphs + +By default, we optimize model inference using CUDA graphs which take up extra memory in the GPU. + +!!! warning + CUDA graph capture takes up more memory in V1 than in V0. 
+ +You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage: + +```python +from vllm import LLM +from vllm.config import CompilationConfig, CompilationLevel + +llm = LLM( + model="meta-llama/Llama-3.1-8B-Instruct", + compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + # By default, it goes up to max_num_seqs + cudagraph_capture_sizes=[1, 2, 4, 8, 16], + ), +) +``` + +You can disable graph capturing completely via the `enforce_eager` flag: + +```python +from vllm import LLM + +llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", + enforce_eager=True) +``` + +## Adjust cache size + +If you run out of CPU RAM, try the following options: + +- (Multi-modal models only) you can set the size of multi-modal input cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB). +- (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB). + +## Multi-modal input limits + +You can allow a smaller number of multi-modal items per prompt to reduce the memory footprint of the model: + +```python +from vllm import LLM + +# Accept up to 3 images and 1 video per prompt +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + limit_mm_per_prompt={"image": 3, "video": 1}) +``` + +You can go a step further and disable unused modalities completely by setting its limit to zero. +For example, if your application only accepts image input, there is no need to allocate any memory for videos. + +```python +from vllm import LLM + +# Accept any number of images but no videos +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + limit_mm_per_prompt={"video": 0}) +``` + +You can even run a multi-modal model for text-only inference: + +```python +from vllm import LLM + +# Don't accept images. Just text. 
+llm = LLM(model="google/gemma-3-27b-it", + limit_mm_per_prompt={"image": 0}) +``` + +## Multi-modal processor arguments + +For certain models, you can adjust the multi-modal processor arguments to +reduce the size of the processed multi-modal inputs, which in turn saves memory. + +Here are some examples: + +```python +from vllm import LLM + +# Available for Qwen2-VL series models +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + mm_processor_kwargs={ + "max_pixels": 768 * 768, # Default is 1280 * 28 * 28 + }) + +# Available for InternVL series models +llm = LLM(model="OpenGVLab/InternVL2-2B", + mm_processor_kwargs={ + "max_dynamic_patch": 4, # Default is 12 + }) +``` diff --git a/docs/configuration/engine_args.md b/docs/configuration/engine_args.md new file mode 100644 index 0000000000000..fb2689a56391b --- /dev/null +++ b/docs/configuration/engine_args.md @@ -0,0 +1,18 @@ +--- +title: Engine Arguments +--- +[](){ #engine-args } + +Engine arguments control the behavior of the vLLM engine. + +- For [offline inference][offline-inference], they are part of the arguments to [LLM][vllm.LLM] class. +- For [online serving][openai-compatible-server], they are part of the arguments to `vllm serve`. + +You can look at [EngineArgs][vllm.engine.arg_utils.EngineArgs] and [AsyncEngineArgs][vllm.engine.arg_utils.AsyncEngineArgs] to see the available engine arguments. + +However, these classes are a combination of the configuration classes defined in [vllm.config][]. Therefore, we would recommend you read about them there where they are best documented. + +For offline inference you will have access to these configuration classes and for online serving you can cross-reference the configs with `vllm serve --help`, which has its arguments grouped by config. + +!!! note + Additional arguments are available to the [AsyncLLMEngine][vllm.engine.async_llm_engine.AsyncLLMEngine] which is used for online serving. 
These can be found by running `vllm serve --help` diff --git a/docs/configuration/env_vars.md b/docs/configuration/env_vars.md new file mode 100644 index 0000000000000..f6d548a19d91f --- /dev/null +++ b/docs/configuration/env_vars.md @@ -0,0 +1,12 @@ +# Environment Variables + +vLLM uses the following environment variables to configure the system: + +!!! warning + Please note that `VLLM_PORT` and `VLLM_HOST_IP` set the port and ip for vLLM's **internal usage**. It is not the port and ip for the API server. If you use `--host $VLLM_HOST_IP` and `--port $VLLM_PORT` to start the API server, it will not work. + + All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables). + +```python +--8<-- "vllm/envs.py:env-vars-definition" +``` diff --git a/docs/configuration/model_resolution.md b/docs/configuration/model_resolution.md new file mode 100644 index 0000000000000..8757c257d3e93 --- /dev/null +++ b/docs/configuration/model_resolution.md @@ -0,0 +1,23 @@ +# Model Resolution + +vLLM loads HuggingFace-compatible models by inspecting the `architectures` field in `config.json` of the model repository +and finding the corresponding implementation that is registered to vLLM. +Nevertheless, our model resolution may fail for the following reasons: + +- The `config.json` of the model repository lacks the `architectures` field. +- Unofficial repositories refer to a model using alternative names which are not recorded in vLLM. +- The same architecture name is used for multiple models, creating ambiguity as to which model should be loaded. 
+ +To fix this, explicitly specify the model architecture by passing `config.json` overrides to the `hf_overrides` option. +For example: + +```python +from vllm import LLM + +model = LLM( + model="cerebras/Cerebras-GPT-1.3B", + hf_overrides={"architectures": ["GPT2LMHeadModel"]}, # GPT-2 +) +``` + +Our [list of supported models][supported-models] shows the model architectures that are recognized by vLLM. diff --git a/docs/source/performance/optimization.md b/docs/configuration/optimization.md similarity index 99% rename from docs/source/performance/optimization.md rename to docs/configuration/optimization.md index 4160f07849626..811925c19e63e 100644 --- a/docs/source/performance/optimization.md +++ b/docs/configuration/optimization.md @@ -1,5 +1,3 @@ -(optimization-and-tuning)= - # Optimization and Tuning This guide covers optimization strategies and performance tuning for vLLM V1. @@ -26,7 +24,7 @@ You can monitor the number of preemption requests through Prometheus metrics exp In vLLM V1, the default preemption mode is `RECOMPUTE` rather than `SWAP`, as recomputation has lower overhead in the V1 architecture. -(chunked-prefill)= +[](){ #chunked-prefill } ## Chunked Prefill diff --git a/docs/configuration/serve_args.md b/docs/configuration/serve_args.md new file mode 100644 index 0000000000000..16b4b29f45d98 --- /dev/null +++ b/docs/configuration/serve_args.md @@ -0,0 +1,38 @@ +--- +title: Server Arguments +--- +[](){ #serve-args } + +The `vllm serve` command is used to launch the OpenAI-compatible server. + +## CLI Arguments + +The `vllm serve` command is used to launch the OpenAI-compatible server. +To see the available CLI arguments, run `vllm serve --help`! + +## Configuration file + +You can load CLI arguments via a [YAML](https://yaml.org/) config file. +The argument names must be the long form of those outlined [above][serve-args]. 
+ +For example: + +```yaml +# config.yaml + +model: meta-llama/Llama-3.1-8B-Instruct +host: "127.0.0.1" +port: 6379 +uvicorn-log-level: "info" +``` + +To use the above config file: + +```bash +vllm serve --config config.yaml +``` + +!!! note + In case an argument is supplied simultaneously using command line and the config file, the value from the command line will take precedence. + The order of priorities is `command line > config file values > defaults`. + e.g. `vllm serve SOME_MODEL --config config.yaml`, SOME_MODEL takes precedence over `model` in config file. diff --git a/docs/source/contributing/overview.md b/docs/contributing/README.md similarity index 70% rename from docs/source/contributing/overview.md rename to docs/contributing/README.md index 89b31f0311e23..65ae9cc963676 100644 --- a/docs/source/contributing/overview.md +++ b/docs/contributing/README.md @@ -16,9 +16,9 @@ Finally, one of the most impactful ways to support us is by raising awareness ab Unsure on where to start? Check out the following links for tasks to work on: - [Good first issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22) - - [Selected onboarding tasks](gh-project:6) + - [Selected onboarding tasks](gh-project:6) - [New model requests](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22new-model%22) - - [Models with multi-modal capabilities](gh-project:10) + - [Models with multi-modal capabilities](gh-project:10) ## License @@ -27,7 +27,69 @@ See . ## Developing Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. -Check out the [building from source](#build-from-source) documentation for details. +Check out the [building from source][build-from-source] documentation for details. 
+ +### Building the docs with MkDocs + +#### Introduction to MkDocs + +[MkDocs](https://github.com/mkdocs/mkdocs) is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file. + +#### Install MkDocs and Plugins + +Install MkDocs along with the [plugins](https://github.com/vllm-project/vllm/blob/main/mkdocs.yaml) used in the vLLM documentation, as well as required dependencies: + +```bash +pip install -r requirements/docs.txt +``` + +!!! note + Ensure that your Python version is compatible with the plugins (e.g., `mkdocs-awesome-nav` requires Python 3.10+). + +#### Verify Installation + +Confirm that MkDocs is correctly installed: + +```bash +mkdocs --version +``` + +Example output: + +```console +mkdocs, version 1.6.1 from /opt/miniconda3/envs/mkdoc/lib/python3.10/site-packages/mkdocs (Python 3.10) +``` + +#### Clone the `vLLM` repository + +```bash +git clone https://github.com/vllm-project/vllm.git +cd vllm +``` + +#### Start the Development Server + +MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it. Make sure you're in the same directory as the `mkdocs.yaml` configuration file, and then start the server by running the `mkdocs serve` command: + +```bash +mkdocs serve +``` + +Example output: + +```console +INFO - Documentation built in 106.83 seconds +INFO - [22:02:02] Watching paths for changes: 'docs', 'mkdocs.yaml' +INFO - [22:02:02] Serving on http://127.0.0.1:8000/ +``` + +#### View in Your Browser + +Open up [http://127.0.0.1:8000/](http://127.0.0.1:8000/) in your browser to see a live preview. + +#### Learn More + +For additional features and advanced configurations, refer to the official [MkDocs Documentation](https://www.mkdocs.org/). 
## Testing @@ -46,31 +108,30 @@ pre-commit run mypy-3.9 --hook-stage manual --all-files # Unit tests pytest tests/ + +# Run tests for a single test file with detailed output +pytest -s -v tests/test_logger.py ``` -:::{tip} -Since the [Dockerfile](gh-file:docker/Dockerfile) ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12. +!!! tip + Since the [Dockerfile](gh-file:docker/Dockerfile) ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12. -Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment. -::: + Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment. -:::{note} -Currently, the repository is not fully checked by `mypy`. -::: +!!! note + Currently, the repository is not fully checked by `mypy`. -:::{note} -Currently, not all unit tests pass when run on CPU platforms. If you don't have access to a GPU -platform to run unit tests locally, rely on the continuous integration system to run the tests for -now. -::: +!!! note + Currently, not all unit tests pass when run on CPU platforms. If you don't have access to a GPU + platform to run unit tests locally, rely on the continuous integration system to run the tests for + now. ## Issues If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. -:::{important} -If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability). -::: +!!! warning + If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability).
## Pull Requests & Code Reviews @@ -106,9 +167,8 @@ appropriately to indicate the type of change. Please use one of the following: - `[Misc]` for PRs that do not fit the above categories. Please use this sparingly. -:::{note} -If the PR spans more than one category, please include all relevant prefixes. -::: +!!! note + If the PR spans more than one category, please include all relevant prefixes. ### Code Quality @@ -121,9 +181,8 @@ The PR needs to meet the following code quality standards: understand the code. - Include sufficient tests to ensure the project stays correct and robust. This includes both unit tests and integration tests. -- Please add documentation to `docs/source/` if the PR modifies the - user-facing behaviors of vLLM. It helps vLLM users understand and utilize the - new features or changes. +- Please add documentation to `docs/` if the PR modifies the user-facing behaviors of vLLM. + It helps vLLM users understand and utilize the new features or changes. ### Adding or Changing Kernels diff --git a/docs/source/performance/benchmarks.md b/docs/contributing/benchmarks.md similarity index 86% rename from docs/source/performance/benchmarks.md rename to docs/contributing/benchmarks.md index 39dc470a1c708..00505fc6f2a98 100644 --- a/docs/source/performance/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -1,13 +1,14 @@ -(benchmarks)= - -# Benchmark Suites +--- +title: Benchmark Suites +--- +[](){ #benchmarks } vLLM contains two sets of benchmarks: -- [Performance benchmarks](#performance-benchmarks) -- [Nightly benchmarks](#nightly-benchmarks) +- [Performance benchmarks][performance-benchmarks] +- [Nightly benchmarks][nightly-benchmarks] -(performance-benchmarks)= +[](){ #performance-benchmarks } ## Performance Benchmarks @@ -17,7 +18,7 @@ The latest performance results are hosted on the public [vLLM Performance Dashbo More information on the performance benchmarks and their parameters can be found 
[here](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md). -(nightly-benchmarks)= +[](){ #nightly-benchmarks } ## Nightly Benchmarks diff --git a/docs/source/contributing/deprecation_policy.md b/docs/contributing/deprecation_policy.md similarity index 100% rename from docs/source/contributing/deprecation_policy.md rename to docs/contributing/deprecation_policy.md diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/contributing/dockerfile/dockerfile.md similarity index 82% rename from docs/source/contributing/dockerfile/dockerfile.md rename to docs/contributing/dockerfile/dockerfile.md index 90b9a33cfbe62..a39f335c87b87 100644 --- a/docs/source/contributing/dockerfile/dockerfile.md +++ b/docs/contributing/dockerfile/dockerfile.md @@ -1,7 +1,7 @@ # Dockerfile We provide a [Dockerfile](gh-file:docker/Dockerfile) to construct the image for running an OpenAI compatible server with vLLM. -More information about deploying with Docker can be found [here](#deployment-docker). +More information about deploying with Docker can be found [here][deployment-docker]. Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: @@ -17,18 +17,21 @@ The edges of the build graph represent: - `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head) - > :::{figure} /assets/contributing/dockerfile-stages-dependency.png - > :align: center - > :alt: query - > :width: 100% - > ::: + >
+ > ![](../../assets/contributing/dockerfile-stages-dependency.png){ align="center" alt="query" width="100%" } + >
> > Made using: [dockerfilegraph](https://github.com/patrickhoefler/dockerfilegraph) > > Commands to regenerate the build graph (make sure to run it **from the \`root\` directory of the vLLM repository** where the dockerfile is present): > > ```bash - > dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename docker/Dockerfile + > dockerfilegraph \ + > -o png \ + > --legend \ + > --dpi 200 \ + > --max-label-length 50 \ + > --filename docker/Dockerfile > ``` > > or in case you want to run it directly with the docker image: diff --git a/docs/contributing/model/README.md b/docs/contributing/model/README.md new file mode 100644 index 0000000000000..b7727f02c11bf --- /dev/null +++ b/docs/contributing/model/README.md @@ -0,0 +1,23 @@ +--- +title: Adding a New Model +--- +[](){ #new-model } + +This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM. + +Contents: + +- [Basic](basic.md) +- [Registration](registration.md) +- [Tests](tests.md) +- [Multimodal](multimodal.md) + +!!! note + The complexity of adding a new model depends heavily on the model's architecture. + The process is considerably more straightforward if the model shares a similar architecture with an existing model in vLLM. + However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. + +!!! tip + If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues) + or ask on our [developer slack](https://slack.vllm.ai). + We will be happy to help you out!
diff --git a/docs/source/contributing/model/basic.md b/docs/contributing/model/basic.md similarity index 82% rename from docs/source/contributing/model/basic.md rename to docs/contributing/model/basic.md index ad31995f76bea..0c0ba33792578 100644 --- a/docs/source/contributing/model/basic.md +++ b/docs/contributing/model/basic.md @@ -1,6 +1,7 @@ -(new-model-basic)= - -# Implementing a Basic Model +--- +title: Implementing a Basic Model +--- +[](){ #new-model-basic } This guide walks you through the steps to implement a basic vLLM model. @@ -10,9 +11,8 @@ First, clone the PyTorch model code from the source repository. For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file. -:::{warning} -Make sure to review and adhere to the original code's copyright and licensing terms! -::: +!!! warning + Make sure to review and adhere to the original code's copyright and licensing terms! ## 2. Make your code compatible with vLLM @@ -67,7 +67,7 @@ class MyModel(nn.Module): ... ``` -- Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. +- Rewrite the [forward][torch.nn.Module.forward] method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. ```python def forward( @@ -78,10 +78,9 @@ def forward( ... ``` -:::{note} -Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. 
-If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. -::: +!!! note + Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. + If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out [vllm/model_executor/models](gh-dir:vllm/model_executor/models) for more examples. @@ -89,7 +88,7 @@ For reference, check out our [Llama implementation](gh-file:vllm/model_executor/ If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it. To do this, substitute your model's linear and embedding layers with their tensor-parallel versions. -For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with `VocabParallelEmbedding`. For the output LM head, you can use `ParallelLMHead`. +For the embedding layer, you can simply replace [torch.nn.Embedding][] with `VocabParallelEmbedding`. For the output LM head, you can use `ParallelLMHead`. When it comes to the linear layers, we provide the following options to parallelize them: - `ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving. @@ -107,7 +106,7 @@ This method should load the weights from the HuggingFace's checkpoint file and a ## 5. Register your model -See [this page](#new-model-registration) for instructions on how to register your new model to be used by vLLM. +See [this page][new-model-registration] for instructions on how to register your new model to be used by vLLM. ## Frequently Asked Questions @@ -117,7 +116,7 @@ For models with interleaving sliding windows (e.g.
`google/gemma-2-2b-it` and `m To support a model with interleaving sliding windows, we need to take care of the following details: -- Make sure [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/config.py#L308) evaluates `has_interleaved_attention` to `True` for this model, and set `self.hf_text_config.interleaved_sliding_window` to the format of interleaving sliding windows the model can understand. Then, `self.hf_text_config.sliding_window` will be deleted, and the model will be treated as a full-attention model. +- Make sure the model's `config.json` contains `sliding_window_pattern`. vLLM then sets `self.hf_text_config.interleaved_sliding_window` to the value of `self.hf_text_config.sliding_window` and deletes `sliding_window` from `self.hf_text_config`. The model will then be treated as a full-attention model. - In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171). With these two steps, interleave sliding windows should work with the model. diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md new file mode 100644 index 0000000000000..892ab9098407c --- /dev/null +++ b/docs/contributing/model/multimodal.md @@ -0,0 +1,803 @@ +--- +title: Multi-Modal Support +--- +[](){ #supports-multimodal } + +This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs][multimodal-inputs]. + +## 1. Update the base vLLM model + +It is assumed that you have already implemented the model in vLLM according to [these steps][new-model-basic]. 
+Further update the model as follows: + +- Reserve a keyword parameter in [forward][torch.nn.Module.forward] for each input tensor that corresponds to a multi-modal input, as shown in the following example: + + ```diff + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + + pixel_values: torch.Tensor, + ) -> SamplerOutput: + ``` + + More conveniently, you can simply pass `**kwargs` to the [forward][torch.nn.Module.forward] method and retrieve the keyword parameters for multimodal inputs from it. + +- Implement [get_multimodal_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings] that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs. + + ```python + class YourModelForImage2Seq(nn.Module): + ... + + def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor: + + assert self.vision_encoder is not None + image_features = self.vision_encoder(image_input) + return self.multi_modal_projector(image_features) + + def get_multimodal_embeddings( + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + + # Validate the multimodal input keyword arguments + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + + # Run multimodal inputs through encoder and projector + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + ``` + +!!! warning + The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request. 
+ +- Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings. + + ```python + from .utils import merge_multimodal_embeddings + + class YourModelForImage2Seq(nn.Module): + ... + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + + # `get_input_embeddings` should already be implemented for the language + # model as one of the requirements of basic vLLM model implementation. + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + multimodal_embeddings=multimodal_embeddings, + placeholder_token_id=self.config.image_token_index) + + return inputs_embeds + ``` + +- Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model. + + ```python + class YourModelForImage2Seq(nn.Module): + ... + + def get_language_model(self) -> torch.nn.Module: + # Change `language_model` according to your implementation. + return self.language_model + ``` + +- Once the above steps are done, update the model class with the [SupportsMultiModal][vllm.model_executor.models.interfaces.SupportsMultiModal] interface. + + ```diff + + from vllm.model_executor.models.interfaces import SupportsMultiModal + + - class YourModelForImage2Seq(nn.Module): + + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): + ``` + +!!! note + The model class does not have to be named `*ForCausalLM`. 
+ Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples. + +## 2. Specify processing information + +Next, create a subclass of [BaseProcessingInfo][vllm.multimodal.processing.BaseProcessingInfo] +to provide basic information related to HF processing. + +### Maximum number of input items + +You need to override the abstract method [get_supported_mm_limits][vllm.multimodal.processing.BaseProcessingInfo.get_supported_mm_limits] +to return the maximum number of input items for each modality supported by the model. + +For example, if the model supports any number of images but only one video per prompt: + +```python +def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": 1} +``` + +## 3. Specify dummy inputs + +Then, inherit [BaseDummyInputsBuilder][vllm.multimodal.profiling.BaseDummyInputsBuilder] to construct dummy inputs for +HF processing as well as memory profiling. + +### For memory profiling + +Override the abstract methods [get_dummy_text][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text] and [get_dummy_mm_data][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_mm_data] to construct dummy inputs for memory profiling. These dummy inputs should result in the worst-case memory usage of the model so that vLLM can reserve the correct amount of memory for it. + +Assuming that the memory usage increases with the number of tokens, the dummy inputs can be constructed to maximize the number of output embeddings, which is the same number as placeholder feature tokens. 
+ +=== "Basic example: LLaVA" + + Looking at the code of HF's `LlavaForConditionalGeneration`: + + ```python + # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544 + n_image_tokens = (input_ids == self.config.image_token_index).sum().item() + n_image_features = image_features.shape[0] * image_features.shape[1] + + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + special_image_mask = ( + (input_ids == self.config.image_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) + ) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + ``` + + The number of placeholder feature tokens per image is `image_features.shape[1]`. + `image_features` is calculated inside the `get_image_features` method: + + ```python + # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300 + image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) + + selected_image_feature = image_outputs.hidden_states[vision_feature_layer] + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + elif vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature + else: + raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") + image_features = self.multi_modal_projector(selected_image_feature) + return image_features + ``` + + We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower + (`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model). 
+ Moreover, we only need the sequence length (the second dimension of the tensor) to get `image_features.shape[1]`. + The sequence length is determined by the initial hidden states in `CLIPVisionTransformer` since the attention + mechanism doesn't change the sequence length of the output hidden states. + + ```python + # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L1094-L1102 + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + hidden_states = self.pre_layrnorm(hidden_states) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + ``` + + To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`: + + ```python + # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257 + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + ``` + + We can infer that `embeddings.shape[1] == self.num_positions`, where + + ```python + # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L195-L196 + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + ``` + + Overall, the number of placeholder feature tokens for an image can be calculated as: + + ```python + def get_num_image_tokens( 
+ self, + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self.get_hf_config() + hf_processor = self.get_hf_processor() + + image_size = hf_config.vision_config.image_size + patch_size = hf_config.vision_config.patch_size + + num_image_tokens = (image_size // patch_size) ** 2 + 1 + if hf_processor.vision_feature_select_strategy == "default": + num_image_tokens -= 1 + + return num_image_tokens + ``` + + Notice that the number of image tokens doesn't depend on the image width and height. + We can simply use a dummy `image_size` to calculate the multimodal profiling data: + + ```python + # NOTE: In actuality, this is usually implemented as part of the + # model's subclass of `BaseProcessingInfo`, but we show it as is + # here for simplicity. + def get_image_size_with_most_features(self) -> ImageSize: + hf_config = self.get_hf_config() + width = height = hf_config.image_size + return ImageSize(width=width, height=height) + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + + target_width, target_height = \ + self.info.get_image_size_with_most_features() + + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + ``` + + For the text, we simply expand the multimodal image token from the model config to match the desired number of images. 
+ + ```python + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + + processor = self.info.get_hf_processor() + image_token = processor.image_token + + return image_token * num_images + ``` + +=== "No input placeholders: Fuyu" + + Looking at the code of HF's `FuyuForCausalLM`: + + ```python + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322 + if image_patches is not None and past_key_values is None: + patch_embeddings = [ + self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype)) + .squeeze(0) + .to(inputs_embeds.device) + for patch in image_patches + ] + inputs_embeds = self.gather_continuous_embeddings( + word_embeddings=inputs_embeds, + continuous_embeddings=patch_embeddings, + image_patch_input_indices=image_patches_indices, + ) + ``` + + The number of placeholder feature tokens for the `i`th item in the batch is `patch_embeddings[i].shape[0]`, + which is the same as `image_patches[i].shape[0]`, i.e. `num_total_patches`. + + Unlike LLaVA, Fuyu does not define the number of patches inside the modeling file. Where can we get more information? + Considering that the model input comes from the output of `FuyuProcessor`, let's **look at the preprocessing files**. + + The image outputs are obtained by calling `FuyuImageProcessor.preprocess` and then + `FuyuImageProcessor.preprocess_with_tokenizer_info` inside `FuyuProcessor`. + + In `FuyuImageProcessor.preprocess`, the images are resized and padded to the target `FuyuImageProcessor.size`, + returning the dimensions after resizing (but before padding) as metadata. 
+ + ```python + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544 + image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"]) + batch_images = image_encoding["images"] + image_unpadded_heights = image_encoding["image_unpadded_heights"] + image_unpadded_widths = image_encoding["image_unpadded_widths"] + + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L + if do_resize: + batch_images = [ + [self.resize(image, size=size, input_data_format=input_data_format) for image in images] + for images in batch_images + ] + + image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images] + image_unpadded_heights = [[image_size[0]] for image_size in image_sizes] + image_unpadded_widths = [[image_size[1]] for image_size in image_sizes] + + if do_pad: + batch_images = [ + [ + self.pad_image( + image, + size=size, + mode=padding_mode, + constant_values=padding_value, + input_data_format=input_data_format, + ) + for image in images + ] + for images in batch_images + ] + ``` + + In `FuyuImageProcessor.preprocess_with_tokenizer_info`, the images are split into patches based on this metadata: + + ```python + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425 + model_image_input = self.image_processor.preprocess_with_tokenizer_info( + image_input=tensor_batch_images, + image_present=image_present, + image_unpadded_h=image_unpadded_heights, + image_unpadded_w=image_unpadded_widths, + image_placeholder_id=image_placeholder_id, + image_newline_id=image_newline_id, + variable_sized=True, + ) + + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658 + image_height, image_width = image.shape[1], image.shape[2] + if variable_sized: # variable_sized=True + 
new_h = min( + image_height, + math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height, + ) + new_w = min( + image_width, + math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width, + ) + image = image[:, :new_h, :new_w] + image_height, image_width = new_h, new_w + + num_patches = self.get_num_patches(image_height=image_height, image_width=image_width) + tensor_of_image_ids = torch.full( + [num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device + ) + patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0) + assert num_patches == patches.shape[0] + ``` + + The number of patches is in turn defined by `FuyuImageProcessor.get_num_patches`: + + ```python + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562 + patch_size = patch_size if patch_size is not None else self.patch_size + patch_height, patch_width = self.patch_size["height"], self.patch_size["width"] + + if image_height % patch_height != 0: + raise ValueError(f"{image_height=} must be divisible by {patch_height}") + if image_width % patch_width != 0: + raise ValueError(f"{image_width=} must be divisible by {patch_width}") + + num_patches_per_dim_h = image_height // patch_height + num_patches_per_dim_w = image_width // patch_width + num_patches = num_patches_per_dim_h * num_patches_per_dim_w + ``` + + These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized + to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`. 
+ + ```python + def get_image_size_with_most_features(self) -> ImageSize: + image_processor = self.get_image_processor() + return ImageSize(width=image_processor.size["width"], + height=image_processor.size["height"]) + ``` + + Fuyu does not expect image placeholders in the inputs to HF processor, so + the dummy prompt text is empty regardless of the number of images. + + ```python + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + return "" + ``` + + For the multimodal image profiling data, the logic is very similar to LLaVA: + + ```python + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + target_width, target_height = \ + self.info.get_image_size_with_most_features() + num_images = mm_counts.get("image", 0) + + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + ``` + +## 4. Specify processing details + +Afterwards, create a subclass of [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] +to fill in the missing details about HF processing. + +!!! info + [Multi-Modal Data Processing][mm-processing] + +### Multi-modal fields + +Override [_get_mm_fields_config][vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config] to +return a schema of the tensors outputted by the HF processor that are related to the input multi-modal items. 
+ +=== "Basic example: LLaVA" + + The output of `CLIPImageProcessor` is a simple tensor with shape + `(num_images, num_channels, image_height, image_width)`: + + ```python + # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/image_processing_clip.py#L339-L345 + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in all_images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + ``` + + So, we override [_get_mm_fields_config][vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config] as follows: + + ```python + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + ) + ``` + + !!! note + Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports + pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument. + +=== "With postprocessing: Fuyu" + + The `image_patches` output of `FuyuImageProcessor.preprocess_with_tokenizer_info` concatenates + the patches from each image belonging to an item in the batch: + + ```python + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L673-L679 + image_input_ids.append(tensor_of_image_ids) + image_patches.append(patches) + else: + image_input_ids.append(torch.tensor([], dtype=torch.int32, device=image_input.device)) + + batch_image_input_ids.append(image_input_ids) + batch_image_patches.append(image_patches) + ``` + + The shape of `image_patches` outputted by `FuyuImageProcessor` is therefore + `(1, num_images, num_patches, patch_width * patch_height * num_channels)`.
+ + In order to support the use of [MultiModalFieldConfig.batched][] like in LLaVA, + we remove the extra batch dimension by overriding [BaseMultiModalProcessor._call_hf_processor][]: + + ```python + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + processed_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + image_patches = processed_outputs.get("image_patches") + if image_patches is not None: + images = mm_data["images"] + assert isinstance(images, list) + + # Original output: (1, num_images, Pn, Px * Py * C) + # New output: (num_images, Pn, Px * Py * C) + assert (isinstance(image_patches, list) + and len(image_patches) == 1) + assert (isinstance(image_patches[0], torch.Tensor) + and len(image_patches[0]) == len(images)) + + processed_outputs["image_patches"] = image_patches[0] + + return processed_outputs + ``` + + !!! note + Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling + for text-only inputs to prevent unnecessary warnings from HF processor. + + This lets us override [_get_mm_fields_config][vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config] as follows: + + ```python + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict(image_patches=MultiModalFieldConfig.batched("image")) + ``` + +### Prompt updates + +Override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] to +return a list of [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instances. + +Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies an update operation +(e.g.: insertion, replacement) performed by the HF processor. 
+ +=== "Basic example: LLaVA" + + Looking at HF's `LlavaProcessor`: + + ```python + # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/processing_llava.py#L167-L170 + prompt_strings = [] + for sample in text: + sample = sample.replace(self.image_token, self.image_token * num_image_tokens) + prompt_strings.append(sample) + ``` + + It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`). + Based on this, we override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] as follows: + + ```python + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + hf_config = self.info.get_hf_config() + image_token_id = hf_config.image_token_index + + def get_replacement(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + + image_size = images.get_image_size(item_idx) + num_image_tokens = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + ) + + return [image_token_id] * num_image_tokens + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=get_replacement, + ), + ] + ``` + +=== "Handling additional tokens: Fuyu" + + Recall the layout of feature tokens from Step 2: + + ``` + |SPEAKER||SPEAKER|...|SPEAKER||NEWLINE| + |SPEAKER||SPEAKER|...|SPEAKER||NEWLINE| + ... 
+ |SPEAKER||SPEAKER|...|SPEAKER||NEWLINE| + ``` + + We define a helper function to return `ncols` and `nrows` directly: + + ```python + def get_image_feature_grid_size( + self, + *, + image_width: int, + image_height: int, + ) -> tuple[int, int]: + image_processor = self.get_image_processor() + target_width = image_processor.size["width"] + target_height = image_processor.size["height"] + patch_width = image_processor.patch_size["width"] + patch_height = image_processor.patch_size["height"] + + if not (image_width <= target_width and image_height <= target_height): + height_scale_factor = target_height / image_height + width_scale_factor = target_width / image_width + optimal_scale_factor = min(height_scale_factor, width_scale_factor) + + image_height = int(image_height * optimal_scale_factor) + image_width = int(image_width * optimal_scale_factor) + + ncols = math.ceil(image_width / patch_width) + nrows = math.ceil(image_height / patch_height) + return ncols, nrows + ``` + + Based on this, we can initially define our replacement tokens as: + + ```python + def get_replacement(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) + + ncols, nrows = self.info.get_image_feature_grid_size( + image_width=image_size.width, + image_height=image_size.height, + ) + + # `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|` + # `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|` + return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows + ``` + + However, this is not entirely correct. 
After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called, + a BOS token (`<s>`) is also added to the prompt: + + ```python + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435 + model_image_input = self.image_processor.preprocess_with_tokenizer_info( + image_input=tensor_batch_images, + image_present=image_present, + image_unpadded_h=image_unpadded_heights, + image_unpadded_w=image_unpadded_widths, + image_placeholder_id=image_placeholder_id, + image_newline_id=image_newline_id, + variable_sized=True, + ) + prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch( + tokenizer=self.tokenizer, + prompts=prompts, + scale_factors=scale_factors, + max_tokens_to_generate=self.max_tokens_to_generate, + max_position_embeddings=self.max_position_embeddings, + add_BOS=True, + add_beginning_of_answer_token=True, + ) + ``` + + To assign the vision embeddings to only the image tokens, instead of a string + you can return an instance of [PromptUpdateDetails][vllm.multimodal.processing.PromptUpdateDetails]: + + ```python + hf_config = self.info.get_hf_config() + bos_token_id = hf_config.bos_token_id # `<s>` + assert isinstance(bos_token_id, int) + + def get_replacement_fuyu(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) + + ncols, nrows = self.info.get_image_feature_grid_size( + image_width=image_size.width, + image_height=image_size.height, + ) + image_tokens = ([_IMAGE_TOKEN_ID] * ncols + + [_NEWLINE_TOKEN_ID]) * nrows + + return PromptUpdateDetails.select_token_id( + image_tokens + [bos_token_id], + embed_token_id=_IMAGE_TOKEN_ID, + ) + ``` + + Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt, + we can search for it to conduct the replacement at the start of the string: + + ```python + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + 
hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + hf_config = self.info.get_hf_config() + bos_token_id = hf_config.bos_token_id + assert isinstance(bos_token_id, int) + + tokenizer = self.info.get_tokenizer() + eot_token_id = tokenizer.bos_token_id + assert isinstance(eot_token_id, int) + + def get_replacement_fuyu(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) + + ncols, nrows = self.info.get_image_feature_grid_size( + image_width=image_size.width, + image_height=image_size.height, + ) + image_tokens = ([_IMAGE_TOKEN_ID] * ncols + + [_NEWLINE_TOKEN_ID]) * nrows + + return PromptUpdateDetails.select_token_id( + image_tokens + [bos_token_id], + embed_token_id=_IMAGE_TOKEN_ID, + ) + + return [ + PromptReplacement( + modality="image", + target=[eot_token_id], + replacement=get_replacement_fuyu, + ) + ] + ``` + +## 5. Register processor-related classes + +After you have defined [BaseProcessingInfo][vllm.multimodal.processing.BaseProcessingInfo] (Step 2), +[BaseDummyInputsBuilder][vllm.multimodal.profiling.BaseDummyInputsBuilder] (Step 3), +and [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] (Step 4), +decorate the model class with [MULTIMODAL_REGISTRY.register_processor][vllm.multimodal.registry.MultiModalRegistry.register_processor] +to register them to the multi-modal registry: + +```diff + from vllm.model_executor.models.interfaces import SupportsMultiModal ++ from vllm.multimodal import MULTIMODAL_REGISTRY + ++ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor, ++ info=YourProcessingInfo, ++ dummy_inputs=YourDummyInputsBuilder) + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): +``` + +## Notes + +### Inserting feature tokens without replacement + +Some HF processors directly insert feature tokens without replacing anything in the original prompt. 
In that case, you can use [PromptInsertion][vllm.multimodal.processing.PromptInsertion] instead of [PromptReplacement][vllm.multimodal.processing.PromptReplacement] inside [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates]. + +Examples: + +- BLIP-2 (insert at start of prompt): +- Florence2 (insert at start of prompt): +- Molmo (insert after `<|endoftext|>` token): + +### Handling prompt updates unrelated to multi-modal data + +[_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] assumes that each application of prompt update corresponds to one multi-modal item. If the HF processor performs additional processing regardless of how many multi-modal items there are, you should override [_apply_hf_processor_tokens_only][vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_tokens_only] so that the processed token inputs are consistent with the result of applying the HF processor on text inputs. This is because token inputs bypass the HF processor according to [our design][mm-processing]. + +Examples: + +- Chameleon (appends `sep_token`): +- Fuyu (appends `boa_token`): +- Molmo (applies chat template which is not defined elsewhere): + +### Custom HF processor + +Some models don't define a HF processor class on HF Hub. In that case, you can define a custom HF processor that has the same call signature as HF processors and pass it to [_call_hf_processor][vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor]. + +Examples: + +- DeepSeek-VL2: +- InternVL: +- Qwen-VL: diff --git a/docs/contributing/model/registration.md b/docs/contributing/model/registration.md new file mode 100644 index 0000000000000..7a7bd79140585 --- /dev/null +++ b/docs/contributing/model/registration.md @@ -0,0 +1,54 @@ +--- +title: Registering a Model to vLLM +--- +[](){ #new-model-registration } + +vLLM relies on a model registry to determine how to run each model. 
+A list of pre-registered architectures can be found [here][supported-models]. + +If your model is not on this list, you must register it to vLLM. +This page provides detailed instructions on how to do so. + +## Built-in models + +To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source][build-from-source]. +This gives you the ability to modify the codebase and test your model. + +After you have implemented your model (see [tutorial][new-model-basic]), put it into the directory. +Then, add your model class to `_VLLM_MODELS` in so that it is automatically registered upon importing vLLM. +Finally, update our [list of supported models][supported-models] to promote your model! + +!!! warning + The list of models in each section should be maintained in alphabetical order. + +## Out-of-tree models + +You can load an external model [using a plugin][plugin-system] without modifying the vLLM codebase. + +To register the model, use the following code: + +```python +# The entrypoint of your plugin +def register(): + from vllm import ModelRegistry + from your_code import YourModelForCausalLM + + ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) +``` + +If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`: + +```python +# The entrypoint of your plugin +def register(): + from vllm import ModelRegistry + + ModelRegistry.register_model( + "YourModelForCausalLM", + "your_code:YourModelForCausalLM" + ) +``` + +!!! warning + If your model is a multimodal model, ensure the model class implements the [SupportsMultiModal][vllm.model_executor.models.interfaces.SupportsMultiModal] interface. + Read more about that [here][supports-multimodal]. 
diff --git a/docs/source/contributing/model/tests.md b/docs/contributing/model/tests.md similarity index 74% rename from docs/source/contributing/model/tests.md rename to docs/contributing/model/tests.md index 68d51d89f7cff..67f8eda61dc54 100644 --- a/docs/source/contributing/model/tests.md +++ b/docs/contributing/model/tests.md @@ -1,6 +1,7 @@ -(new-model-tests)= - -# Writing Unit Tests +--- +title: Writing Unit Tests +--- +[](){ #new-model-tests } This page explains how to write unit tests to verify the implementation of your model. @@ -14,14 +15,12 @@ Without them, the CI for your PR will fail. Include an example HuggingFace repository for your model in . This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM. -:::{important} -The list of models in each section should be maintained in alphabetical order. -::: +!!! warning + The list of models in each section should be maintained in alphabetical order. -:::{tip} -If your model requires a development version of HF Transformers, you can set -`min_transformers_version` to skip the test in CI until the model is released. -::: +!!! tip + If your model requires a development version of HF Transformers, you can set + `min_transformers_version` to skip the test in CI until the model is released. ## Optional Tests @@ -34,16 +33,16 @@ These tests compare the model outputs of vLLM against [HF Transformers](https:// #### Generative models -For [generative models](#generative-models), there are two levels of correctness tests, as defined in : +For [generative models](../../models/generative_models.md), there are two levels of correctness tests, as defined in : - Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF. - Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa. 
#### Pooling models -For [pooling models](#pooling-models), we simply check the cosine similarity, as defined in . +For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in . -(mm-processing-tests)= +[](){ #mm-processing-tests } ### Multi-modal processing diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/contributing/profiling.md similarity index 90% rename from docs/source/contributing/profiling/profiling_index.md rename to docs/contributing/profiling.md index ce25daa39c5cb..be01b9b65f65c 100644 --- a/docs/source/contributing/profiling/profiling_index.md +++ b/docs/contributing/profiling.md @@ -1,8 +1,7 @@ # Profiling vLLM -:::{warning} -Profiling is only intended for vLLM developers and maintainers to understand the proportion of time spent in different parts of the codebase. **vLLM end-users should never turn on profiling** as it will significantly slow down the inference. -::: +!!! warning + Profiling is only intended for vLLM developers and maintainers to understand the proportion of time spent in different parts of the codebase. **vLLM end-users should never turn on profiling** as it will significantly slow down the inference. ## Profile with PyTorch Profiler @@ -14,15 +13,13 @@ When using `benchmarks/benchmark_serving.py`, you can enable profiling by passin Traces can be visualized using . -:::{tip} -Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly. -::: +!!! tip + Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly. -:::{tip} -To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100. 
-Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes. -`export VLLM_RPC_TIMEOUT=1800000` -::: +!!! tip + To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100. + Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes. + `export VLLM_RPC_TIMEOUT=1800000` ### Example commands and usage diff --git a/docs/source/contributing/vulnerability_management.md b/docs/contributing/vulnerability_management.md similarity index 100% rename from docs/source/contributing/vulnerability_management.md rename to docs/contributing/vulnerability_management.md diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md new file mode 100644 index 0000000000000..516640f6fd3c4 --- /dev/null +++ b/docs/deployment/docker.md @@ -0,0 +1,129 @@ +--- +title: Using Docker +--- +[](){ #deployment-docker } + +[](){ #deployment-docker-pre-built-image } + +## Use vLLM's Official Docker Image + +vLLM offers an official Docker image for deployment. +The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags). + +```console +docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + -p 8000:8000 \ + --ipc=host \ + vllm/vllm-openai:latest \ + --model mistralai/Mistral-7B-v0.1 +``` + +This image can also be used with other container engines such as [Podman](https://podman.io/). 
+ +```console +podman run --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + -p 8000:8000 \ + --ipc=host \ + vllm/vllm-openai:latest \ + --model mistralai/Mistral-7B-v0.1 +``` + +You can add any other [engine-args][engine-args] you need after the image tag (`vllm/vllm-openai:latest`). + +!!! note + You can either use the `ipc=host` flag or `--shm-size` flag to allow the + container to access the host's shared memory. vLLM uses PyTorch, which uses shared + memory to share data between processes under the hood, particularly for tensor parallel inference. + +!!! note + Optional dependencies are not included in order to avoid licensing issues (e.g. ). + + If you need to use those dependencies (having accepted the license terms), + create a custom Dockerfile on top of the base image with an extra layer that installs them: + + ```Dockerfile + FROM vllm/vllm-openai:v0.8.3 + + # e.g. install the `audio` optional dependencies + # NOTE: Make sure the version of vLLM matches the base image! + RUN uv pip install --system vllm[audio]==0.8.3 + ``` + +!!! tip + Some new models may only be available on the main branch of [HF Transformers](https://github.com/huggingface/transformers). + + To use the development version of `transformers`, create a custom Dockerfile on top of the base image + with an extra layer that installs their code from source: + + ```Dockerfile + FROM vllm/vllm-openai:latest + + RUN uv pip install --system git+https://github.com/huggingface/transformers.git + ``` + +[](){ #deployment-docker-build-image-from-source } + +## Building vLLM's Docker Image from Source + +You can build and run vLLM from source via the provided . To build vLLM: + +```console +# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 +DOCKER_BUILDKIT=1 docker build . \ + --target vllm-openai \ + --tag vllm/vllm-openai \ + --file docker/Dockerfile +``` + +!!! 
note + By default vLLM will build for all GPU types for widest distribution. If you are just building for the + current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""` + for vLLM to find the current GPU type and build for that. + + If you are using Podman instead of Docker, you might need to disable SELinux labeling by + adding `--security-opt label=disable` when running the `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184). + +## Building for Arm64/aarch64 + +A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At the time of this writing, this requires the use +of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64. + +!!! note + Multiple modules must be compiled, so this process can take a while. We recommend using the `--build-arg max_jobs=` & `--build-arg nvcc_threads=` + flags to speed up the build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits. + Keep an eye on memory usage with parallel jobs as it can be substantial (see example below). + +```console +# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) +python3 use_existing_torch.py +DOCKER_BUILDKIT=1 docker build . 
\ + --file docker/Dockerfile \ + --target vllm-openai \ + --platform "linux/arm64" \ + -t vllm/vllm-gh200-openai:latest \ + --build-arg max_jobs=66 \ + --build-arg nvcc_threads=2 \ + --build-arg torch_cuda_arch_list="9.0+PTX" \ + --build-arg vllm_fa_cmake_gpu_arches="90-real" +``` + +## Use the custom-built vLLM Docker image + +To run vLLM with the custom-built Docker image: + +```console +docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + -p 8000:8000 \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + vllm/vllm-openai +``` + +The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command). + +!!! note + **For version 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` . 
diff --git a/docs/source/deployment/frameworks/anything-llm.md b/docs/deployment/frameworks/anything-llm.md similarity index 78% rename from docs/source/deployment/frameworks/anything-llm.md rename to docs/deployment/frameworks/anything-llm.md index d430c170ef541..a89e633c086ea 100644 --- a/docs/source/deployment/frameworks/anything-llm.md +++ b/docs/deployment/frameworks/anything-llm.md @@ -1,6 +1,7 @@ -(deployment-anything-llm)= - -# Anything LLM +--- +title: Anything LLM +--- +[](){ #deployment-anything-llm } [Anything LLM](https://github.com/Mintplex-Labs/anything-llm) is a full-stack application that enables you to turn any document, resource, or piece of content into context that any LLM can use as references during chatting. @@ -25,23 +26,19 @@ vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096 - Base URL: http://{vllm server host}:{vllm server port}/v1 - Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ` -:::{image} /assets/deployment/anything-llm-provider.png -::: +![](../../assets/deployment/anything-llm-provider.png) - Back to home page, New Workspace --> create `vllm` workspace, and start to chat: -:::{image} /assets/deployment/anything-llm-chat-without-doc.png -::: +![](../../assets/deployment/anything-llm-chat-without-doc.png) - Click the upload button: - upload the doc - select the doc and move to the workspace - save and embed -:::{image} /assets/deployment/anything-llm-upload-doc.png -::: +![](../../assets/deployment/anything-llm-upload-doc.png) - Chat again: -:::{image} /assets/deployment/anything-llm-chat-with-doc.png -::: +![](../../assets/deployment/anything-llm-chat-with-doc.png) diff --git a/docs/deployment/frameworks/autogen.md b/docs/deployment/frameworks/autogen.md new file mode 100644 index 0000000000000..ad8c167659efa --- /dev/null +++ b/docs/deployment/frameworks/autogen.md @@ -0,0 +1,83 @@ +--- +title: AutoGen +--- +[](){ #deployment-autogen } + +[AutoGen](https://github.com/microsoft/autogen) is a framework for creating multi-agent 
AI applications that can act autonomously or work alongside humans. + +## Prerequisites + +- Setup vLLM environment + +- Setup [AutoGen](https://microsoft.github.io/autogen/0.2/docs/installation/) environment + +```console +pip install vllm + +# Install AgentChat and OpenAI client from Extensions +# AutoGen requires Python 3.10 or later. +pip install -U "autogen-agentchat" "autogen-ext[openai]" +``` + +## Deploy + +- Start the vLLM server with the supported chat completion model, e.g. + +```console +python -m vllm.entrypoints.openai.api_server \ + --model mistralai/Mistral-7B-Instruct-v0.2 +``` + +- Call it with AutoGen: + +```python +import asyncio +from autogen_core.models import UserMessage +from autogen_ext.models.openai import OpenAIChatCompletionClient +from autogen_core.models import ModelFamily + + +async def main() -> None: + # Create a model client + model_client = OpenAIChatCompletionClient( + model="mistralai/Mistral-7B-Instruct-v0.2", + base_url="http://{your-vllm-host-ip}:{your-vllm-host-port}/v1", + api_key="EMPTY", + model_info={ + "vision": False, + "function_calling": False, + "json_output": False, + "family": ModelFamily.MISTRAL, + "structured_output": True, + }, + ) + + messages = [UserMessage(content="Write a very short story about a dragon.", source="user")] + + # Create a stream. + stream = model_client.create_stream(messages=messages) + + # Iterate over the stream and print the responses. + print("Streamed responses:") + async for response in stream: + if isinstance(response, str): + # A partial response is a string. + print(response, flush=True, end="") + else: + # The last response is a CreateResult object with the complete message. + print("\n\n------------\n") + print("The complete response:", flush=True) + print(response.content, flush=True) + + # Close the client when done. 
+ await model_client.close() + + +asyncio.run(main()) +``` + +For details, see the tutorial: + +- [Using vLLM in AutoGen](https://microsoft.github.io/autogen/0.2/docs/topics/non-openai-models/local-vllm/) + +- [OpenAI-compatible API examples](https://microsoft.github.io/autogen/stable/reference/python/autogen_ext.models.openai.html#autogen_ext.models.openai.OpenAIChatCompletionClient) diff --git a/docs/source/deployment/frameworks/bentoml.md b/docs/deployment/frameworks/bentoml.md similarity index 89% rename from docs/source/deployment/frameworks/bentoml.md rename to docs/deployment/frameworks/bentoml.md index 2bf435bda8380..7e64b6eb6fb03 100644 --- a/docs/source/deployment/frameworks/bentoml.md +++ b/docs/deployment/frameworks/bentoml.md @@ -1,6 +1,7 @@ -(deployment-bentoml)= - -# BentoML +--- +title: BentoML +--- +[](){ #deployment-bentoml } [BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes. diff --git a/docs/source/deployment/frameworks/cerebrium.md b/docs/deployment/frameworks/cerebrium.md similarity index 98% rename from docs/source/deployment/frameworks/cerebrium.md rename to docs/deployment/frameworks/cerebrium.md index b20c95137b6e7..84cb2304fac20 100644 --- a/docs/source/deployment/frameworks/cerebrium.md +++ b/docs/deployment/frameworks/cerebrium.md @@ -1,12 +1,11 @@ -(deployment-cerebrium)= +--- +title: Cerebrium +--- +[](){ #deployment-cerebrium } -# Cerebrium - -:::{raw} html

vLLM_plus_cerebrium

-::: vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebrium.ai/), a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications. diff --git a/docs/source/deployment/frameworks/chatbox.md b/docs/deployment/frameworks/chatbox.md similarity index 84% rename from docs/source/deployment/frameworks/chatbox.md rename to docs/deployment/frameworks/chatbox.md index e62f4647150f4..10da2fc710027 100644 --- a/docs/source/deployment/frameworks/chatbox.md +++ b/docs/deployment/frameworks/chatbox.md @@ -1,6 +1,7 @@ -(deployment-chatbox)= - -# Chatbox +--- +title: Chatbox +--- +[](){ #deployment-chatbox } [Chatbox](https://github.com/chatboxai/chatbox) is a desktop client for LLMs, available on Windows, Mac, Linux. @@ -27,10 +28,8 @@ vllm serve qwen/Qwen1.5-0.5B-Chat - API Path: `/chat/completions` - Model: `qwen/Qwen1.5-0.5B-Chat` -:::{image} /assets/deployment/chatbox-settings.png -::: +![](../../assets/deployment/chatbox-settings.png) - Go to `Just chat`, and start to chat: -:::{image} /assets/deployment/chatbox-chat.png -::: +![](../../assets/deployment/chatbox-chat.png) diff --git a/docs/source/deployment/frameworks/dify.md b/docs/deployment/frameworks/dify.md similarity index 90% rename from docs/source/deployment/frameworks/dify.md rename to docs/deployment/frameworks/dify.md index 5cdf6a3876371..886484b543475 100644 --- a/docs/source/deployment/frameworks/dify.md +++ b/docs/deployment/frameworks/dify.md @@ -1,6 +1,7 @@ -(deployment-dify)= - -# Dify +--- +title: Dify +--- +[](){ #deployment-dify } [Dify](https://github.com/langgenius/dify) is an open-source LLM app development platform. Its intuitive interface combines agentic AI workflow, RAG pipeline, agent capabilities, model management, observability features, and more, allowing you to quickly move from prototype to production. 
@@ -42,15 +43,12 @@ docker compose up -d - **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat` - **Completion Mode**: `Completion` -:::{image} /assets/deployment/dify-settings.png -::: +![](../../assets/deployment/dify-settings.png) - To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type: -:::{image} /assets/deployment/dify-create-chatbot.png -::: +![](../../assets/deployment/dify-create-chatbot.png) - Click the chatbot you just created to open the chat interface and start interacting with the model: -:::{image} /assets/deployment/dify-chat.png -::: +![](../../assets/deployment/dify-chat.png) diff --git a/docs/source/deployment/frameworks/dstack.md b/docs/deployment/frameworks/dstack.md similarity index 83% rename from docs/source/deployment/frameworks/dstack.md rename to docs/deployment/frameworks/dstack.md index a16e28f2d8983..7de92855745b0 100644 --- a/docs/source/deployment/frameworks/dstack.md +++ b/docs/deployment/frameworks/dstack.md @@ -1,12 +1,11 @@ -(deployment-dstack)= +--- +title: dstack +--- +[](){ #deployment-dstack } -# dstack - -:::{raw} html

vLLM_plus_dstack

-::: vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, gateway, and GPU quotas on your cloud environment. @@ -97,6 +96,5 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` -:::{note} -dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. If you want to know more about hands-on materials how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm) -::: +!!! note + dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. If you want to know more about hands-on materials how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm) diff --git a/docs/deployment/frameworks/haystack.md b/docs/deployment/frameworks/haystack.md new file mode 100644 index 0000000000000..2eac4a5279fd6 --- /dev/null +++ b/docs/deployment/frameworks/haystack.md @@ -0,0 +1,60 @@ +--- +title: Haystack +--- +[](){ #deployment-haystack } + +# Haystack + +[Haystack](https://github.com/deepset-ai/haystack) is an end-to-end LLM framework that allows you to build applications powered by LLMs, Transformer models, vector search and more. Whether you want to perform retrieval-augmented generation (RAG), document search, question answering or answer generation, Haystack can orchestrate state-of-the-art embedding models and LLMs into pipelines to build end-to-end NLP applications and solve your use case. 
+ +It allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. + +## Prerequisites + +- Setup vLLM and Haystack environment + +```console +pip install vllm haystack-ai +``` + +## Deploy + +- Start the vLLM server with the supported chat completion model, e.g. + +```console +vllm serve mistralai/Mistral-7B-Instruct-v0.1 +``` + +- Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server. + +```python +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.dataclasses import ChatMessage +from haystack.utils import Secret + +generator = OpenAIChatGenerator( + # for compatibility with the OpenAI API, a placeholder api_key is needed + api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"), + model="mistralai/Mistral-7B-Instruct-v0.1", + api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1", + generation_kwargs = {"max_tokens": 512} +) + +response = generator.run( + messages=[ChatMessage.from_user("Hi. Can you help me plan my next trip to Italy?")] +) + +print("-"*30) +print(response) +print("-"*30) +``` + +Output e.g.: + +```console +------------------------------ +{'replies': [ChatMessage(_role=, _content=[TextContent(text=' Of course! Where in Italy would you like to go and what type of trip are you looking to plan?')], _name=None, _meta={'model': 'mistralai/Mistral-7B-Instruct-v0.1', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 23, 'prompt_tokens': 21, 'total_tokens': 44, 'completion_tokens_details': None, 'prompt_tokens_details': None}})]} +------------------------------ +``` + +For details, see the tutorial [Using vLLM in Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/vllm.md). 
diff --git a/docs/deployment/frameworks/helm.md b/docs/deployment/frameworks/helm.md new file mode 100644 index 0000000000000..192b90438acf0 --- /dev/null +++ b/docs/deployment/frameworks/helm.md @@ -0,0 +1,95 @@ +--- +title: Helm +--- +[](){ #deployment-helm } + +A Helm chart to deploy vLLM for Kubernetes + +Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLM Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values. + +This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm installation and documentation on architecture and values file. + +## Prerequisites + +Before you begin, ensure that you have the following: + +- A running Kubernetes cluster +- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin) +- Available GPU resources in your cluster +- S3 with the model which will be deployed + +## Installing the chart + +To install the chart with the release name `test-vllm`: + +```console +helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3bucketname=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY +``` + +## Uninstalling the Chart + +To uninstall the `test-vllm` deployment: + +```console +helm uninstall test-vllm --namespace=ns-vllm +``` + +The command removes all the Kubernetes components associated with the +chart **including persistent volumes** and deletes the release. 
+ +## Architecture + +![](../../assets/deployment/architecture_helm_deployment.png) + +## Values + +| Key | Type | Default | Description | +|--------------------------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------| +| autoscaling | object | {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} | Autoscaling configuration | +| autoscaling.enabled | bool | false | Enable autoscaling | +| autoscaling.maxReplicas | int | 100 | Maximum replicas | +| autoscaling.minReplicas | int | 1 | Minimum replicas | +| autoscaling.targetCPUUtilizationPercentage | int | 80 | Target CPU utilization for autoscaling | +| configs | object | {} | Configmap | +| containerPort | int | 8000 | Container port | +| customObjects | list | [] | Custom Objects configuration | +| deploymentStrategy | object | {} | Deployment strategy configuration | +| externalConfigs | list | [] | External configuration | +| extraContainers | list | [] | Additional containers configuration | +| extraInit | object | {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} | Additional configuration for the init container | +| extraInit.pvcStorage | string | "50Gi" | Storage size of the s3 | +| extraInit.s3modelpath | string | "relative_s3_model_path/opt-125m" | Path of the model on the s3 which hosts model weights and config files | +| extraInit.awsEc2MetadataDisabled | boolean | true | Disables the use of the Amazon EC2 instance metadata service | +| extraPorts | list | [] | Additional ports configuration | +| gpuModels | list | ["TYPE_GPU_USED"] | Type of gpu used | +| image | object | 
{"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} | Image configuration | +| image.command | list | ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] | Container launch command | +| image.repository | string | "vllm/vllm-openai" | Image repository | +| image.tag | string | "latest" | Image tag | +| livenessProbe | object | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} | Liveness probe configuration | +| livenessProbe.failureThreshold | int | 3 | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive | +| livenessProbe.httpGet | object | {"path":"/health","port":8000} | Configuration of the Kubelet http request on the server | +| livenessProbe.httpGet.path | string | "/health" | Path to access on the HTTP server | +| livenessProbe.httpGet.port | int | 8000 | Name or number of the port to access on the container, on which the server is listening | +| livenessProbe.initialDelaySeconds | int | 15 | Number of seconds after the container has started before liveness probe is initiated | +| livenessProbe.periodSeconds | int | 10 | How often (in seconds) to perform the liveness probe | +| maxUnavailablePodDisruptionBudget | string | "" | Disruption Budget Configuration | +| readinessProbe | object | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} | Readiness probe configuration | +| readinessProbe.failureThreshold | int | 3 | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready | +| readinessProbe.httpGet | object | {"path":"/health","port":8000} | Configuration of the Kubelet http request on the server | +| readinessProbe.httpGet.path | string | 
"/health" | Path to access on the HTTP server | +| readinessProbe.httpGet.port | int | 8000 | Name or number of the port to access on the container, on which the server is listening | +| readinessProbe.initialDelaySeconds | int | 5 | Number of seconds after the container has started before readiness probe is initiated | +| readinessProbe.periodSeconds | int | 5 | How often (in seconds) to perform the readiness probe | +| replicaCount | int | 1 | Number of replicas | +| resources | object | {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} | Resource configuration | +| resources.limits."nvidia.com/gpu" | int | 1 | Number of gpus used | +| resources.limits.cpu | int | 4 | Number of CPUs | +| resources.limits.memory | string | "16Gi" | CPU memory configuration | +| resources.requests."nvidia.com/gpu" | int | 1 | Number of gpus used | +| resources.requests.cpu | int | 4 | Number of CPUs | +| resources.requests.memory | string | "16Gi" | CPU memory configuration | +| secrets | object | {} | Secrets configuration | +| serviceName | string | Service name | | +| servicePort | int | 80 | Service port | +| labels.environment | string | test | Environment name | diff --git a/docs/source/deployment/frameworks/litellm.md b/docs/deployment/frameworks/litellm.md similarity index 97% rename from docs/source/deployment/frameworks/litellm.md rename to docs/deployment/frameworks/litellm.md index 6dd3607ca5e37..3011cde830180 100644 --- a/docs/source/deployment/frameworks/litellm.md +++ b/docs/deployment/frameworks/litellm.md @@ -1,6 +1,7 @@ -(deployment-litellm)= - -# LiteLLM +--- +title: LiteLLM +--- +[](){ #deployment-litellm } [LiteLLM](https://github.com/BerriAI/litellm) call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, Groq etc.] 
diff --git a/docs/source/deployment/frameworks/lobe-chat.md b/docs/deployment/frameworks/lobe-chat.md similarity index 89% rename from docs/source/deployment/frameworks/lobe-chat.md rename to docs/deployment/frameworks/lobe-chat.md index 6d86b7fa9cce1..cd95c028155e4 100644 --- a/docs/source/deployment/frameworks/lobe-chat.md +++ b/docs/deployment/frameworks/lobe-chat.md @@ -1,6 +1,7 @@ -(deployment-lobe-chat)= - -# Lobe Chat +--- +title: Lobe Chat +--- +[](){ #deployment-lobe-chat } [Lobe Chat](https://github.com/lobehub/lobe-chat) is an open-source, modern-design ChatGPT/LLMs UI/Framework. diff --git a/docs/source/deployment/frameworks/lws.md b/docs/deployment/frameworks/lws.md similarity index 99% rename from docs/source/deployment/frameworks/lws.md rename to docs/deployment/frameworks/lws.md index 4e9a03b5c4c17..18282a89ddfff 100644 --- a/docs/source/deployment/frameworks/lws.md +++ b/docs/deployment/frameworks/lws.md @@ -1,6 +1,7 @@ -(deployment-lws)= - -# LWS +--- +title: LWS +--- +[](){ #deployment-lws } LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads. A major use case is for multi-host/multi-node distributed inference. diff --git a/docs/source/deployment/frameworks/modal.md b/docs/deployment/frameworks/modal.md similarity index 85% rename from docs/source/deployment/frameworks/modal.md rename to docs/deployment/frameworks/modal.md index e7c42088e36a9..dbdb739a10005 100644 --- a/docs/source/deployment/frameworks/modal.md +++ b/docs/deployment/frameworks/modal.md @@ -1,6 +1,7 @@ -(deployment-modal)= - -# Modal +--- +title: Modal +--- +[](){ #deployment-modal } vLLM can be run on cloud GPUs with [Modal](https://modal.com), a serverless computing platform designed for fast auto-scaling. 
diff --git a/docs/source/deployment/frameworks/open-webui.md b/docs/deployment/frameworks/open-webui.md similarity index 87% rename from docs/source/deployment/frameworks/open-webui.md rename to docs/deployment/frameworks/open-webui.md index 83e5303a00ef2..1ab1931068fae 100644 --- a/docs/source/deployment/frameworks/open-webui.md +++ b/docs/deployment/frameworks/open-webui.md @@ -1,6 +1,7 @@ -(deployment-open-webui)= - -# Open WebUI +--- +title: Open WebUI +--- +[](){ #deployment-open-webui } 1. Install the [Docker](https://docs.docker.com/engine/install/) @@ -25,5 +26,4 @@ ghcr.io/open-webui/open-webui:main On the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`. -:::{image} /assets/deployment/open_webui.png -::: +![](../../assets/deployment/open_webui.png) diff --git a/docs/source/deployment/frameworks/retrieval_augmented_generation.md b/docs/deployment/frameworks/retrieval_augmented_generation.md similarity index 96% rename from docs/source/deployment/frameworks/retrieval_augmented_generation.md rename to docs/deployment/frameworks/retrieval_augmented_generation.md index f84451fafe91d..cb26c8378deec 100644 --- a/docs/source/deployment/frameworks/retrieval_augmented_generation.md +++ b/docs/deployment/frameworks/retrieval_augmented_generation.md @@ -1,6 +1,7 @@ -(deployment-retrieval-augmented-generation)= - -# Retrieval-Augmented Generation +--- +title: Retrieval-Augmented Generation +--- +[](){ #deployment-retrieval-augmented-generation } [Retrieval-augmented generation (RAG)](https://en.wikipedia.org/wiki/Retrieval-augmented_generation) is a technique that enables generative artificial intelligence (Gen AI) models to retrieve and incorporate new information. It modifies interactions with a large language model (LLM) so that the model responds to user queries with reference to a specified set of documents, using this information to supplement information from its pre-existing training data. 
This allows LLMs to use domain-specific and/or updated information. Use cases include providing chatbot access to internal company data or generating responses based on authoritative sources. diff --git a/docs/source/deployment/frameworks/skypilot.md b/docs/deployment/frameworks/skypilot.md similarity index 94% rename from docs/source/deployment/frameworks/skypilot.md rename to docs/deployment/frameworks/skypilot.md index 5e101b9001033..9763745f23787 100644 --- a/docs/source/deployment/frameworks/skypilot.md +++ b/docs/deployment/frameworks/skypilot.md @@ -1,12 +1,11 @@ -(deployment-skypilot)= +--- +title: SkyPilot +--- +[](){ #deployment-skypilot } -# SkyPilot - -:::{raw} html

vLLM

-::: vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html). @@ -83,7 +82,11 @@ Check the output of the command. There will be a shareable gradio link (like the **Optional**: Serve the 70B model instead of the default 8B and use more GPU: ```console -HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct +HF_TOKEN="your-huggingface-token" \ + sky launch serving.yaml \ + --gpus A100:8 \ + --env HF_TOKEN \ + --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct ``` ## Scale up to multiple replicas @@ -104,10 +107,8 @@ service: max_completion_tokens: 1 ``` -:::{raw} html
Click to see the full recipe YAML -::: ```yaml service: @@ -153,14 +154,14 @@ run: | 2>&1 | tee api_server.log ``` -:::{raw} html
-::: Start serving the Llama-3 8B model on multiple replicas: ```console -HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN +HF_TOKEN="your-huggingface-token" \ + sky serve up -n vllm serving.yaml \ + --env HF_TOKEN ``` Wait until the service is ready: @@ -169,10 +170,8 @@ Wait until the service is ready: watch -n10 sky serve status vllm ``` -:::{raw} html
Example outputs: -::: ```console Services @@ -185,9 +184,7 @@ vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) R vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 ``` -:::{raw} html
-::: After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: @@ -223,10 +220,8 @@ service: This will scale the service up when the QPS exceeds 2 for each replica. -:::{raw} html
Click to see the full recipe YAML -::: ```yaml service: @@ -275,9 +270,7 @@ run: | 2>&1 | tee api_server.log ``` -:::{raw} html
-::: To update the service with the new config: @@ -295,10 +288,8 @@ sky serve down vllm It is also possible to access the Llama-3 service with a separate GUI frontend, so that user requests sent to the GUI are load-balanced across replicas. -:::{raw} html
Click to see the full GUI YAML -::: ```yaml envs: @@ -328,14 +319,14 @@ run: | --stop-token-ids 128009,128001 | tee ~/gradio.log ``` -:::{raw} html
-::: 1. Start the chat web UI: ```console - sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) + sky launch \ + -c gui ./gui.yaml \ + --env ENDPOINT=$(sky serve status --endpoint vllm) ``` 2. Then, we can access the GUI at the returned gradio link: diff --git a/docs/source/deployment/frameworks/streamlit.md b/docs/deployment/frameworks/streamlit.md similarity index 81% rename from docs/source/deployment/frameworks/streamlit.md rename to docs/deployment/frameworks/streamlit.md index 084550ec991e1..33ed8c5f5b54d 100644 --- a/docs/source/deployment/frameworks/streamlit.md +++ b/docs/deployment/frameworks/streamlit.md @@ -1,6 +1,7 @@ -(deployment-streamlit)= - -# Streamlit +--- +title: Streamlit +--- +[](){ #deployment-streamlit } [Streamlit](https://github.com/streamlit/streamlit) lets you transform Python scripts into interactive web apps in minutes, instead of weeks. Build dashboards, generate reports, or create chat apps. @@ -32,11 +33,11 @@ pip install streamlit openai streamlit run streamlit_openai_chatbot_webserver.py # or specify the VLLM_API_BASE or VLLM_API_KEY -VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" streamlit run streamlit_openai_chatbot_webserver.py +VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" \ + streamlit run streamlit_openai_chatbot_webserver.py # start with debug mode to view more details streamlit run streamlit_openai_chatbot_webserver.py --logger.level=debug ``` -:::{image} /assets/deployment/streamlit-chat.png -::: +![](../../assets/deployment/streamlit-chat.png) diff --git a/docs/source/deployment/frameworks/triton.md b/docs/deployment/frameworks/triton.md similarity index 87% rename from docs/source/deployment/frameworks/triton.md rename to docs/deployment/frameworks/triton.md index 94d87120159c6..082bc24d85aad 100644 --- a/docs/source/deployment/frameworks/triton.md +++ b/docs/deployment/frameworks/triton.md @@ -1,5 +1,6 @@ -(deployment-triton)= - -# NVIDIA Triton +--- +title: 
NVIDIA Triton +--- +[](){ #deployment-triton } The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details. diff --git a/docs/source/deployment/integrations/kserve.md b/docs/deployment/integrations/kserve.md similarity index 85% rename from docs/source/deployment/integrations/kserve.md rename to docs/deployment/integrations/kserve.md index c780fd74e8f55..754b983dee92c 100644 --- a/docs/source/deployment/integrations/kserve.md +++ b/docs/deployment/integrations/kserve.md @@ -1,6 +1,7 @@ -(deployment-kserve)= - -# KServe +--- +title: KServe +--- +[](){ #deployment-kserve } vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving. diff --git a/docs/source/deployment/integrations/kubeai.md b/docs/deployment/integrations/kubeai.md similarity index 93% rename from docs/source/deployment/integrations/kubeai.md rename to docs/deployment/integrations/kubeai.md index 2f5772e075d87..ba0a3c52cca7a 100644 --- a/docs/source/deployment/integrations/kubeai.md +++ b/docs/deployment/integrations/kubeai.md @@ -1,6 +1,7 @@ -(deployment-kubeai)= - -# KubeAI +--- +title: KubeAI +--- +[](){ #deployment-kubeai } [KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies. 
diff --git a/docs/source/deployment/integrations/llamastack.md b/docs/deployment/integrations/llamastack.md similarity index 94% rename from docs/source/deployment/integrations/llamastack.md rename to docs/deployment/integrations/llamastack.md index a6c3569637abf..2ae600a423ff9 100644 --- a/docs/source/deployment/integrations/llamastack.md +++ b/docs/deployment/integrations/llamastack.md @@ -1,6 +1,7 @@ -(deployment-llamastack)= - -# Llama Stack +--- +title: Llama Stack +--- +[](){ #deployment-llamastack } vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-stack) . diff --git a/docs/source/deployment/integrations/llmaz.md b/docs/deployment/integrations/llmaz.md similarity index 87% rename from docs/source/deployment/integrations/llmaz.md rename to docs/deployment/integrations/llmaz.md index cd4a76353d264..03d284c34769c 100644 --- a/docs/source/deployment/integrations/llmaz.md +++ b/docs/deployment/integrations/llmaz.md @@ -1,6 +1,7 @@ -(deployment-llmaz)= - -# llmaz +--- +title: llmaz +--- +[](){ #deployment-llmaz } [llmaz](https://github.com/InftyAI/llmaz) is an easy-to-use and advanced inference platform for large language models on Kubernetes, aimed for production use. It uses vLLM as the default model serving backend. diff --git a/docs/source/deployment/integrations/production-stack.md b/docs/deployment/integrations/production-stack.md similarity index 98% rename from docs/source/deployment/integrations/production-stack.md rename to docs/deployment/integrations/production-stack.md index 05f1568306cc9..8288a4b6e6be3 100644 --- a/docs/source/deployment/integrations/production-stack.md +++ b/docs/deployment/integrations/production-stack.md @@ -1,6 +1,7 @@ -(deployment-production-stack)= - -# Production stack +--- +title: Production stack +--- +[](){ #deployment-production-stack } Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. 
This guide walks you through deploying vLLM using the [vLLM production stack](https://github.com/vllm-project/production-stack). Born out of a Berkeley-UChicago collaboration, [vLLM production stack](https://github.com/vllm-project/production-stack) is an officially released, production-optimized codebase under the [vLLM project](https://github.com/vllm-project), designed for LLM deployment with: @@ -114,7 +115,7 @@ To remove the deployment, run: sudo helm uninstall vllm ``` ------- +--- ### (Advanced) Configuring vLLM production stack diff --git a/docs/source/deployment/k8s.md b/docs/deployment/k8s.md similarity index 98% rename from docs/source/deployment/k8s.md rename to docs/deployment/k8s.md index 9079cfa8e1b66..6b08c4960d028 100644 --- a/docs/source/deployment/k8s.md +++ b/docs/deployment/k8s.md @@ -1,6 +1,7 @@ -(deployment-k8s)= - -# Using Kubernetes +--- +title: Using Kubernetes +--- +[](){ #deployment-k8s } Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes. @@ -8,6 +9,7 @@ Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine le * [Deployment with GPUs](#deployment-with-gpus) Alternatively, you can deploy vLLM to Kubernetes using any of the following: + * [Helm](frameworks/helm.md) * [InftyAI/llmaz](integrations/llmaz.md) * [KServe](integrations/kserve.md) @@ -19,9 +21,8 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following: ## Deployment with CPUs -:::{note} -The use of CPUs here is for demonstration and testing purposes only and its performance will not be on par with GPUs. -::: +!!! note + The use of CPUs here is for demonstration and testing purposes only and its performance will not be on par with GPUs. 
First, create a Kubernetes PVC and Secret for downloading and storing Hugging Face model: diff --git a/docs/source/deployment/nginx.md b/docs/deployment/nginx.md similarity index 60% rename from docs/source/deployment/nginx.md rename to docs/deployment/nginx.md index bf404f1098c3b..80242919ba5b3 100644 --- a/docs/source/deployment/nginx.md +++ b/docs/deployment/nginx.md @@ -1,20 +1,21 @@ -(nginxloadbalancer)= - -# Using Nginx +--- +title: Using Nginx +--- +[](){ #nginxloadbalancer } This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers. Table of contents: -1. [Build Nginx Container](#nginxloadbalancer-nginx-build) -2. [Create Simple Nginx Config file](#nginxloadbalancer-nginx-conf) -3. [Build vLLM Container](#nginxloadbalancer-nginx-vllm-container) -4. [Create Docker Network](#nginxloadbalancer-nginx-docker-network) -5. [Launch vLLM Containers](#nginxloadbalancer-nginx-launch-container) -6. [Launch Nginx](#nginxloadbalancer-nginx-launch-nginx) -7. [Verify That vLLM Servers Are Ready](#nginxloadbalancer-nginx-verify-nginx) +1. [Build Nginx Container][nginxloadbalancer-nginx-build] +2. [Create Simple Nginx Config file][nginxloadbalancer-nginx-conf] +3. [Build vLLM Container][nginxloadbalancer-nginx-vllm-container] +4. [Create Docker Network][nginxloadbalancer-nginx-docker-network] +5. [Launch vLLM Containers][nginxloadbalancer-nginx-launch-container] +6. [Launch Nginx][nginxloadbalancer-nginx-launch-nginx] +7. [Verify That vLLM Servers Are Ready][nginxloadbalancer-nginx-verify-nginx] -(nginxloadbalancer-nginx-build)= +[](){ #nginxloadbalancer-nginx-build } ## Build Nginx Container @@ -39,7 +40,7 @@ Build the container: docker build . 
-f Dockerfile.nginx --tag nginx-lb ``` -(nginxloadbalancer-nginx-conf)= +[](){ #nginxloadbalancer-nginx-conf } ## Create Simple Nginx Config file @@ -63,7 +64,7 @@ server { } ``` -(nginxloadbalancer-nginx-vllm-container)= +[](){ #nginxloadbalancer-nginx-vllm-container } ## Build vLLM Container @@ -76,10 +77,14 @@ If you are behind proxy, you can pass the proxy settings to the docker build com ```console cd $vllm_root -docker build -f docker/Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy +docker build \ + -f docker/Dockerfile . \ + --tag vllm \ + --build-arg http_proxy=$http_proxy \ + --build-arg https_proxy=$https_proxy ``` -(nginxloadbalancer-nginx-docker-network)= +[](){ #nginxloadbalancer-nginx-docker-network } ## Create Docker Network @@ -87,7 +92,7 @@ docker build -f docker/Dockerfile . --tag vllm --build-arg http_proxy=$http_prox docker network create vllm_nginx ``` -(nginxloadbalancer-nginx-launch-container)= +[](){ #nginxloadbalancer-nginx-launch-container } ## Launch vLLM Containers @@ -101,23 +106,45 @@ Notes: ```console mkdir -p ~/.cache/huggingface/hub/ hf_cache_dir=~/.cache/huggingface/ -docker run -itd --ipc host --network vllm_nginx --gpus device=0 --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8081:8000 --name vllm0 vllm --model meta-llama/Llama-2-7b-chat-hf -docker run -itd --ipc host --network vllm_nginx --gpus device=1 --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf +docker run \ + -itd \ + --ipc host \ + --network vllm_nginx \ + --gpus device=0 \ + --shm-size=10.24gb \ + -v $hf_cache_dir:/root/.cache/huggingface/ \ + -p 8081:8000 \ + --name vllm0 vllm \ + --model meta-llama/Llama-2-7b-chat-hf +docker run \ + -itd \ + --ipc host \ + --network vllm_nginx \ + --gpus device=1 \ + --shm-size=10.24gb \ + -v $hf_cache_dir:/root/.cache/huggingface/ \ + -p 8082:8000 \ + --name vllm1 vllm \ + --model 
meta-llama/Llama-2-7b-chat-hf ``` -:::{note} -If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`. -::: +!!! note + If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`. -(nginxloadbalancer-nginx-launch-nginx)= +[](){ #nginxloadbalancer-nginx-launch-nginx } ## Launch Nginx ```console -docker run -itd -p 8000:80 --network vllm_nginx -v ./nginx_conf/:/etc/nginx/conf.d/ --name nginx-lb nginx-lb:latest +docker run \ + -itd \ + -p 8000:80 \ + --network vllm_nginx \ + -v ./nginx_conf/:/etc/nginx/conf.d/ \ + --name nginx-lb nginx-lb:latest ``` -(nginxloadbalancer-nginx-verify-nginx)= +[](){ #nginxloadbalancer-nginx-verify-nginx } ## Verify That vLLM Servers Are Ready diff --git a/docs/source/design/arch_overview.md b/docs/design/arch_overview.md similarity index 80% rename from docs/source/design/arch_overview.md rename to docs/design/arch_overview.md index 94bda8b5c58d5..14720a392aafb 100644 --- a/docs/source/design/arch_overview.md +++ b/docs/design/arch_overview.md @@ -1,22 +1,18 @@ -(arch-overview)= - -# Architecture Overview +--- +title: Architecture Overview +--- +[](){ #arch-overview } This document provides an overview of the vLLM architecture. -:::{contents} Table of Contents -:depth: 2 -:local: true -::: +[TOC] ## Entrypoints vLLM provides a number of entrypoints for interacting with the system. The following diagram shows the relationship between them. -:::{image} /assets/design/arch_overview/entrypoints.excalidraw.png -:alt: Entrypoints Diagram -::: +![Entrypoints Diagram](../assets/design/arch_overview/entrypoints.excalidraw.png) ### LLM Class @@ -52,8 +48,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -More API details can be found in the [Offline Inference] -(#offline-inference-api) section of the API docs. 
+More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs. The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>. @@ -77,16 +72,14 @@ python -m vllm.entrypoints.openai.api_server --model <model> That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>. -More details on the API server can be found in the [OpenAI-Compatible Server](#openai-compatible-server) document. +More details on the API server can be found in the [OpenAI-Compatible Server][openai-compatible-server] document. ## LLM Engine The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of the vLLM system, handling model inference and asynchronous request processing. -:::{image} /assets/design/arch_overview/llm_engine.excalidraw.png -:alt: LLMEngine Diagram -::: +![LLMEngine Diagram](../assets/design/arch_overview/llm_engine.excalidraw.png) ### LLMEngine @@ -137,18 +130,16 @@ input tensors and capturing cudagraphs. ## Model Every model runner object has one model object, which is the actual -`torch.nn.Module` instance. See [huggingface_integration](#huggingface-integration) for how various +`torch.nn.Module` instance. See [huggingface_integration][huggingface-integration] for how various configurations affect the class we ultimately get. ## Class Hierarchy The following figure shows the class hierarchy of vLLM: -> :::{figure} /assets/design/hierarchy.png -> :align: center -> :alt: query -> :width: 100% -> ::: +>
+> ![](../assets/design/hierarchy.png){ align="center" alt="query" width="100%" } +>
There are several important design choices behind this class hierarchy: @@ -178,44 +169,43 @@ of a vision model and a language model. By making the constructor uniform, we can easily create a vision model and a language model and compose them into a vision-language model. -:::{note} -To support this change, all vLLM models' signatures have been updated to: +!!! note + To support this change, all vLLM models' signatures have been updated to: -```python -def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): -``` - -To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one: - -```python -class MyOldModel(nn.Module): - def __init__( - self, - config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: - ... - -from vllm.config import VllmConfig -class MyNewModel(MyOldModel): + ```python def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config - super().__init__(config, cache_config, quant_config, lora_config, prefix) + ``` -if __version__ >= "0.6.4": - MyModel = MyNewModel -else: - MyModel = MyOldModel -``` + To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. 
For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one: -This way, the model can work with both old and new versions of vLLM. -::: + ```python + class MyOldModel(nn.Module): + def __init__( + self, + config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + prefix: str = "", + ) -> None: + ... + + from vllm.config import VllmConfig + class MyNewModel(MyOldModel): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + super().__init__(config, cache_config, quant_config, lora_config, prefix) + + if __version__ >= "0.6.4": + MyModel = MyNewModel + else: + MyModel = MyOldModel + ``` + + This way, the model can work with both old and new versions of vLLM. 3\. **Sharding and Quantization at Initialization**: Certain features require changing the model weights. For example, tensor parallelism needs to shard the diff --git a/docs/source/design/automatic_prefix_caching.md b/docs/design/automatic_prefix_caching.md similarity index 98% rename from docs/source/design/automatic_prefix_caching.md rename to docs/design/automatic_prefix_caching.md index 3928e0c16568b..80883bb1d90d8 100644 --- a/docs/source/design/automatic_prefix_caching.md +++ b/docs/design/automatic_prefix_caching.md @@ -1,6 +1,7 @@ -(design-automatic-prefix-caching)= - -# Automatic Prefix Caching +--- +title: Automatic Prefix Caching +--- +[](){ #design-automatic-prefix-caching } The core idea of [PagedAttention](https://blog.vllm.ai/2023/06/20/vllm.html) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. 
The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand. diff --git a/docs/source/design/huggingface_integration.md b/docs/design/huggingface_integration.md similarity index 64% rename from docs/source/design/huggingface_integration.md rename to docs/design/huggingface_integration.md index 7d271b1cfb3a0..2d462ccb65350 100644 --- a/docs/source/design/huggingface_integration.md +++ b/docs/design/huggingface_integration.md @@ -1,23 +1,22 @@ -(huggingface-integration)= - -# Integration with HuggingFace +--- +title: Integration with HuggingFace +--- +[](){ #huggingface-integration } This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run `vllm serve`. Let's say we want to serve the popular QWen model by running `vllm serve Qwen/Qwen2-7B`. 1. The `model` argument is `Qwen/Qwen2-7B`. vLLM determines whether this model exists by checking for the corresponding config file `config.json`. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L162-L182) for the implementation. Within this process: - - - If the `model` argument corresponds to an existing local path, vLLM will load the config file directly from this path. - - If the `model` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the `model` argument as the model name and the `--revision` argument as the revision. See [their website](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome) for more information on how the HuggingFace cache works. 
- - If the `model` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to [this function](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91) for the implementation. The input arguments include the `model` argument as the model name, the `--revision` argument as the revision, and the environment variable `HF_TOKEN` as the token to access the model hub. In our case, vLLM will download the [config.json](https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json) file. + - If the `model` argument corresponds to an existing local path, vLLM will load the config file directly from this path. + - If the `model` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the `model` argument as the model name and the `--revision` argument as the revision. See [their website](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome) for more information on how the HuggingFace cache works. + - If the `model` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to [this function](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91) for the implementation. The input arguments include the `model` argument as the model name, the `--revision` argument as the revision, and the environment variable `HF_TOKEN` as the token to access the model hub. In our case, vLLM will download the [config.json](https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json) file. 2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. 
See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L185-L186) for the implementation. 3. Next, vLLM [inspects](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L189) the `model_type` field in the config dictionary to [generate](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L190-L216) the config object to use. There are some `model_type` values that vLLM directly supports; see [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L48) for the list. If the `model_type` is not in the list, vLLM will use [AutoConfig.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoConfig.from_pretrained) to load the config class, with `model`, `--revision`, and `--trust_remote_code` as the arguments. Please note that: - - - HuggingFace also has its own logic to determine the config class to use. It will again use the `model_type` field to search for the class name in the transformers library; see [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models) for the list of supported models. If the `model_type` is not found, HuggingFace will use the `auto_map` field from the config JSON file to determine the class name. Specifically, it is the `AutoConfig` field under `auto_map`. See [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json) for an example. - - The `AutoConfig` field under `auto_map` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the `from_pretrained` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when `--trust_remote_code` is enabled. 
+ - HuggingFace also has its own logic to determine the config class to use. It will again use the `model_type` field to search for the class name in the transformers library; see [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models) for the list of supported models. If the `model_type` is not found, HuggingFace will use the `auto_map` field from the config JSON file to determine the class name. Specifically, it is the `AutoConfig` field under `auto_map`. See [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json) for an example. + - The `AutoConfig` field under `auto_map` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the `from_pretrained` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when `--trust_remote_code` is enabled. 4. Subsequently, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see [here](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/config.py#L244) for the implementation. @@ -28,8 +27,7 @@ Beyond that, there are two more things vLLM depends on HuggingFace for. 1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Please check HuggingFace's documentation for the meaning of these arguments. 
This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [get_cached_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L24). 2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the `model` argument as the model name and the `--revision` argument as the revision. vLLM provides the argument `--load-format` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass `--load-format dummy` to skip downloading the weights. - - - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the [documentation](https://huggingface.co/docs/safetensors/en/index) for more information on the safetensors format. This part of the logic can be found [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385). Please note that: + - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the [documentation](https://huggingface.co/docs/safetensors/en/index) for more information on the safetensors format. This part of the logic can be found [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385). Please note that: This completes the integration between vLLM and HuggingFace. 
diff --git a/docs/design/kernel/paged_attention.md b/docs/design/kernel/paged_attention.md new file mode 100644 index 0000000000000..6ebe1ee48acf1 --- /dev/null +++ b/docs/design/kernel/paged_attention.md @@ -0,0 +1,498 @@ +--- +title: vLLM Paged Attention +--- +[](){ #design-paged-attention } + +Currently, vLLM utilizes its own implementation of a multi-head query +attention kernel (`csrc/attention/attention_kernels.cu`). +This kernel is designed to be compatible with +vLLM's paged KV caches, where the key and value cache are stored in +separate blocks (note that this block concept differs from the GPU +thread block. So in a later document, I will refer to vLLM paged +attention block as "block", while refer to GPU thread block as +"thread block"). + +To achieve high performance, this kernel relies on a specially +designed memory layout and access method, specifically when threads +read data from global memory to shared memory. The purpose of this +document is to provide a high-level explanation of the kernel +implementation step by step, aiding those who wish to learn about the +vLLM multi-head query attention kernel. After going through this +document, users will likely have a better understanding and feel easier +to follow the actual implementation. + +Please note that this document may not cover all details, such as how +to calculate the correct index for the corresponding data or the dot +multiplication implementation. However, after reading this document +and becoming familiar with the high-level logic flow, it should be +easier for you to read the actual code and understand the details. + +## Inputs + +The kernel function takes a list of arguments for the current thread +to perform its assigned work. The three most important arguments are +the input pointers `q`, `k_cache`, and `v_cache`, which point +to query, key, and value data on global memory that need to be read +and processed. 
The output pointer `out` points to global memory +where the result should be written. These four pointers actually +refer to multi-dimensional arrays, but each thread only accesses the +portion of data assigned to it. I have omitted all other runtime +parameters here for simplicity. + +```cpp +template<typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS, int PARTITION_SIZE = 0> +__device__ void paged_attention_kernel( + ... // Other side args. + const scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size] + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] + const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] + ... // Other side args. +) +``` + +There is also a list of template arguments above the function +signature that are determined during compilation time. `scalar_t` +represents the data type of the query, key, and value data elements, +such as FP16. `HEAD_SIZE` indicates the number of elements in each +head. `BLOCK_SIZE` refers to the number of tokens in each block. +`NUM_THREADS` denotes the number of threads in each thread block. +`PARTITION_SIZE` represents the number of tensor parallel GPUs (For +simplicity, we assume this is 0 and tensor parallel is disabled). + +With these arguments, we need to perform a sequence of preparations. +This includes calculating the current head index, block index, and +other necessary variables. However, for now, we can ignore these +preparations and proceed directly to the actual calculations. It will +be easier to understand them once we grasp the entire flow. + +## Concepts + +Just before we dive into the calculation flow, I want to describe a +few concepts that are needed for later sections. However, you may +skip this section and return later if you encounter any confusing +terminologies. + +- **Sequence**: A sequence represents a client request.
For example, + the data pointed to by `q` has a shape of + `[num_seqs, num_heads, head_size]`. That represents there are total + `num_seqs` of query sequence data are pointed by `q`. Since this + kernel is a single query attention kernel, each sequence only has one + query token. Hence, the `num_seqs` equals the total number of tokens + that are processed in the batch. +- **Context**: The context consists of the generated tokens from the + sequence. For instance, `["What", "is", "your"]` are the context + tokens, and the input query token is `"name"`. The model might + generate the token `"?"`. +- **Vec**: The vec is a list of elements that are fetched and + calculated together. For query and key data, the vec size + (`VEC_SIZE`) is determined so that each thread group can fetch and + calculate 16 bytes of data at a time. For value data, the vec size + (`V_VEC_SIZE`) is determined so that each thread can fetch and + calculate 16 bytes of data at a time. For example, if the + `scalar_t` is FP16 (2 bytes) and `THREAD_GROUP_SIZE` is 2, the + `VEC_SIZE` will be 4, while the `V_VEC_SIZE` will be 8. +- **Thread group**: The thread group is a small group of + threads(`THREAD_GROUP_SIZE`) that fetches and calculates one + query token and one key token at a time. Each thread handles only a + portion of the token data. The total number of elements processed by + one thread group is referred as `x`. For example, if the thread + group contains 2 threads and the head size is 8, then thread 0 + handles the query and key elements at index 0, 2, 4, 6, while thread + 1 handles the elements at index 1, 3, 5, 7. +- **Block**: The key and value cache data in vLLM are split into + blocks. Each block stores data for a fixed number(`BLOCK_SIZE`) + of tokens at one head. Each block may contain only a portion of the + whole context tokens. For example, if the block size is 16 and the + head size is 128, then for one head, one block can store 16 * 128 = + 2048 elements. 
+- **Warp**: A warp is a group of 32 threads(`WARP_SIZE`) that + execute simultaneously on a stream multiprocessor (SM). In this + kernel, each warp processes the calculation between one query token + and key tokens of one entire block at a time (it may process multiple + blocks in multiple iterations). For example, if there are 4 warps and + 6 blocks for one context, the assignment would be like warp 0 handles + the 0th, 4th blocks, warp 1 handles the 1st, 5th blocks, warp 2 + handles the 2nd block and warp 3 handles the 3rd block. +- **Thread block**: A thread block is a group of + threads(`NUM_THREADS`) that can access the same shared memory. + Each thread block contains multiple warps(`NUM_WARPS`), and in + this kernel, each thread block processes the calculation between one + query token and key tokens of a whole context. +- **Grid**: A grid is a collection of thread blocks and defines the + shape of the collection. In this kernel, the shape is + `(num_heads, num_seqs, max_num_partitions)`. Therefore, each thread + block only handles the calculation for one head, one sequence, and + one partition. + +## Query + +This section will introduce how query data is stored in memory and +fetched by each thread. As mentioned above, each thread group fetches +one query token data, while each thread itself only handles a part of +one query token data. Within each warp, every thread group will fetch +the same query token data, but will multiply it with different key +token data. + +```cpp +const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; +``` + +
+ ![](../../assets/kernel/query.png){ align="center" alt="query" width="70%" } +
+ +Each thread defines its own `q_ptr` which points to the assigned +query token data on global memory. For example, if `VEC_SIZE` is 4 +and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains +a total of 128 elements divided into 128 / 4 = 32 vecs. + +
+ ![](../../assets/kernel/q_vecs.png){ align="center" alt="q_vecs" width="70%" } +
+ +```cpp +__shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; +``` + +Next, we need to read the global memory data pointed to by `q_ptr` +into shared memory as `q_vecs`. It is important to note that each +vec is assigned to a different row. For example, if the +`THREAD_GROUP_SIZE` is 2, thread 0 will handle the 0th row vecs, +while thread 1 handles the 1st row vecs. By reading the query data in +this way, neighboring threads like thread 0 and thread 1 can read +neighboring memory, achieving memory coalescing to improve +performance. + +## Key + +Similar to the "Query" section, this section introduces memory layout +and assignment for keys. While each thread group only handles one +query token per kernel run, it may handle multiple key tokens across +multiple iterations. Meanwhile, each warp will process multiple blocks +of key tokens in multiple iterations, ensuring that all context +tokens are processed by the entire thread group after the kernel run. +In this context, "handle" refers to performing the dot multiplication +between query data and key data. + +```cpp +const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride + + kv_head_idx * kv_head_stride + + physical_block_offset * x; +``` + +Unlike `q_ptr`, `k_ptr` in each thread will point to a different +key token at different iterations. As shown above, `k_ptr` +points to key token data based on `k_cache` at the assigned block, +assigned head and assigned token. + +
+ ![](../../assets/kernel/key.png){ align="center" alt="key" width="70%" } +
+ +The diagram above illustrates the memory layout for key data. It +assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is +8, `THREAD_GROUP_SIZE` is 2, and there are a total of 4 warps. Each +rectangle represents all the elements for one key token at one head, +which will be processed by one thread group. The left half shows the +total 16 blocks of key token data for warp 0, while the right half +represents the remaining key token data for other warps or +iterations. Inside each rectangle, there are a total of 32 vecs (128 +elements for one token) that will be processed by 2 threads (one +thread group) separately. + +
+ ![](../../assets/kernel/k_vecs.png){ align="center" alt="k_vecs" width="70%" } +
+ +```cpp +K_vec k_vecs[NUM_VECS_PER_THREAD] +``` + +Next, we need to read the key token data from `k_ptr` and store +them on register memory as `k_vecs`. We use register memory for +`k_vecs` because it will only be accessed by one thread once, +whereas `q_vecs` will be accessed by multiple threads multiple +times. Each `k_vecs` will contain multiple vectors for later +calculation. Each vec will be set at each inner iteration. The +assignment of vecs allows neighboring threads in a warp to read +neighboring memory together, which again promotes the memory +coalescing. For instance, thread 0 will read vec 0, while thread 1 +will read vec 1. In the next inner loop, thread 0 will read vec 2, +while thread 1 will read vec 3, and so on. + +You may still be a little confused about the overall flow. Don't +worry, please keep reading the next "QK" section. It will illustrate +the query and key calculation flow in a clearer and higher-level +manner. + +## QK + +As shown in the pseudo code below, before the entire for loop block, we +fetch the query data for one token and store it in `q_vecs`. Then, +in the outer for loop, we iterate through different `k_ptrs` that +point to different tokens and prepare the `k_vecs` in the inner for +loop. Finally, we perform the dot multiplication between the +`q_vecs` and each `k_vecs`. + +```cpp +q_vecs = ... +for ... { + k_ptr = ... + for ... { + k_vecs[i] = ... + } + ... + float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs); +} +``` + +As mentioned before, for each thread, it only fetches part of the +query and key token data at a time. However, a cross +thread group reduction happens in `Qk_dot<>::dot`. So `qk` +returned here is not just between part of the query and key token dot +multiplication, but actually a full result between entire query and +key token data. + +For example, if the value of `HEAD_SIZE` is 128 and +`THREAD_GROUP_SIZE` is 2, each thread's `k_vecs` will contain +a total of 64 elements.
However, the returned `qk` is actually the +result of dot multiplication between 128 query elements and 128 key +elements. If you want to learn more about the details of the dot +multiplication and reduction, you may refer to the implementation of +`Qk_dot<>::dot`. However, for the sake of simplicity, I will not +cover it in this document. + +## Softmax + +Next, we need to calculate the normalized softmax for all `qk`s, +as shown above, where each $x$ represents a `qk`. To do this, +we must obtain the reduced value of `qk_max`($m(x)$) and +the `exp_sum`($\ell(x)$) of all `qk`s. The reduction +should be performed across the entire thread block, encompassing +results between the query token and all context key tokens. + +$$ +\begin{gather*} +m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\ +\quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)} +\end{gather*} +$$ + +### `qk_max` and `logits` + +Just right after we get the `qk` result, we can set the temporary +`logits` result with `qk` (In the end, the `logits` should +store the normalized softmax result). Also we can compare and collect +the `qk_max` for all `qk`s that are calculated by current +thread group. + +```cpp +if (thread_group_offset == 0) { + const bool mask = token_idx >= context_len; + logits[token_idx - start_token_idx] = mask ? 0.f : qk; + qk_max = mask ? qk_max : fmaxf(qk_max, qk); +} +``` + +Please note that the `logits` here is on shared memory, so each +thread group will set the fields for its own assigned context tokens. +Overall, the size of logits should be number of context tokens. + +```cpp +for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) { + qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); +} + +if (lane == 0) { + red_smem[warp_idx] = qk_max; +} +``` + +Then we need to get the reduced `qk_max` across each warp. 
The main +idea is to make threads in warp to communicate with each other and +get the final max `qk` . + +```cpp +for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); +} +qk_max = VLLM_SHFL_SYNC(qk_max, 0); +``` + +Finally, we can get the reduced `qk_max` from whole thread block by +compare the `qk_max` from all warps in this thread block. Then we +need to broadcast the final result to each thread. + +### `exp_sum` + +Similar to `qk_max`, we need to get the reduced sum value from the +entire thread block too. + +```cpp +for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { + float val = __expf(logits[i] - qk_max); + logits[i] = val; + exp_sum += val; +} +... +exp_sum = block_sum(&red_smem[NUM_WARPS], exp_sum); +``` + +Firstly, sum all exp values from each thread group, and meanwhile, +convert each entry of `logits` from `qk` to `exp(qk - qk_max)`. +Please note, the `qk_max` here is already the max `qk` across the +whole thread block. And then we can do reduction for `exp_sum` +across whole thread block just like the `qk_max`. + +```cpp +const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f); +for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { + logits[i] *= inv_sum; +} +``` + +Finally, with the reduced `qk_max` and `exp_sum`, we can obtain +the final normalized softmax result as `logits`. This `logits` +variable will be used for dot multiplication with the value data in +later steps. Now, it should store the normalized softmax result of +`qk` for all assigned context tokens. + +## Value + +
+ ![](../../assets/kernel/value.png){ align="center" alt="value" width="70%" } +
+ +
+ ![](../../assets/kernel/logits_vec.png){ align="center" alt="logits_vec" width="50%" } +
+ +
+ ![](../../assets/kernel/v_vec.png){ align="center" alt="v_vec" width="70%" } +
+ +Now we need to retrieve the value data and perform dot multiplication +with `logits`. Unlike query and key, there is no thread group +concept for value data. As shown in the diagram, unlike the key token +memory layout, elements from the same column correspond to the same +value token. For one block of value data, there are `HEAD_SIZE` of +rows and `BLOCK_SIZE` of columns that are split into multiple +`v_vecs`. + +Each thread always fetches `V_VEC_SIZE` elements from the same +`V_VEC_SIZE` of tokens at a time. As a result, a single thread +retrieves multiple `v_vec`s from different rows and the same +columns through multiple inner iterations. For each `v_vec`, it +needs to be dot multiplied with the corresponding `logits_vec`, +which is also `V_VEC_SIZE` elements from `logits`. Overall, with +multiple inner iterations, each warp will process one block of value +tokens. And with multiple outer iterations, the whole context value +tokens are processed. + +```cpp +float accs[NUM_ROWS_PER_THREAD]; +for ... { // Iteration over different blocks. + logits_vec = ... + for ... { // Iteration over different rows. + v_vec = ... + ... + accs[i] += dot(logits_vec, v_vec); + } +} +``` + +As shown in the above pseudo code, in the outer loop, similar to +`k_ptr`, `logits_vec` iterates over different blocks and reads +`V_VEC_SIZE` elements from `logits`. In the inner loop, each +thread reads `V_VEC_SIZE` elements from the same tokens as a +`v_vec` and performs dot multiplication. It is important to note +that in each inner iteration, the thread fetches different head +position elements for the same tokens. The dot result is then +accumulated in `accs`. Therefore, each entry of `accs` is mapped +to a head position assigned to the current thread. + +For example, if `BLOCK_SIZE` is 16 and `V_VEC_SIZE` is 8, each +thread fetches 8 value elements for 8 tokens at a time. Each element +is from different tokens at the same head position.
If `HEAD_SIZE` +is 128 and `WARP_SIZE` is 32, for each inner loop, a warp needs to +fetch `WARP_SIZE * V_VEC_SIZE = 256` elements. This means there are +a total of 128 * 16 / 256 = 8 inner iterations for a warp to handle +a whole block of value tokens. And each `accs` in each thread +contains 8 elements that are accumulated at 8 different head positions. +For thread 0, the `accs` variable will have 8 elements, which +are 0th, 32nd … 224th elements of a value head that are accumulated +from all assigned 8 tokens. + +## LV + +Now, we need to perform reduction for `accs` within each warp. This +process allows each thread to accumulate the `accs` for the +assigned head positions of all tokens in one block. + +```cpp +for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + float acc = accs[i]; + for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) { + acc += VLLM_SHFL_XOR_SYNC(acc, mask); + } + accs[i] = acc; +} +``` + +Next, we perform reduction for `accs` across all warps, allowing +each thread to have the accumulation of `accs` for the assigned +head positions of all context tokens. Please note that each `accs` +in every thread only stores the accumulation for a portion of +elements of the entire head for all context tokens. However, overall, +all results for output have been calculated but are just stored in +different thread register memory. + +```cpp +float* out_smem = reinterpret_cast<float*>(shared_mem); +for (int i = NUM_WARPS; i > 1; i /= 2) { + // Upper warps write to shared memory. + ... + float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE]; + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + ... + dst[row_idx] = accs[i]; + } + + // Lower warps update the output. + const float* src = &out_smem[warp_idx * HEAD_SIZE]; + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + ... + accs[i] += src[row_idx]; + } + + // Write out the accs. +} +``` + +## Output + +Now we can write all of the calculated results from local register memory +to the final output global memory.
+ +```cpp +scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + + head_idx * max_num_partitions * HEAD_SIZE + + partition_idx * HEAD_SIZE; +``` + +First, we need to define the `out_ptr` variable, which points to +the start address of the assigned sequence and assigned head. + +```cpp +for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; + if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) { + from_float(*(out_ptr + row_idx), accs[i]); + } +} +``` + +Finally, we need to iterate over different assigned head positions +and write out the corresponding accumulated result based on the +`out_ptr`. diff --git a/docs/source/design/mm_processing.md b/docs/design/mm_processing.md similarity index 61% rename from docs/source/design/mm_processing.md rename to docs/design/mm_processing.md index dc92a3c2c511e..f3685ce76a4bd 100644 --- a/docs/source/design/mm_processing.md +++ b/docs/design/mm_processing.md @@ -1,10 +1,11 @@ -(mm-processing)= +--- +title: Multi-Modal Data Processing +--- +[](){ #mm-processing } -# Multi-Modal Data Processing +To enable various optimizations in vLLM such as [chunked prefill][chunked-prefill] and [prefix caching][automatic-prefix-caching], we use [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] to provide the correspondence between placeholder feature tokens (e.g. ``) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor. -To enable various optimizations in vLLM such as [chunked prefill](#chunked-prefill) and [prefix caching](#automatic-prefix-caching), we use {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` to provide the correspondence between placeholder feature tokens (e.g. ``) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor. 
- -Here are the main features of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor`: +Here are the main features of [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor]: ## Prompt Update Detection @@ -15,7 +16,7 @@ One of the main responsibilities of HF processor is to update the prompt with pl The information about which tokens have been updated is key to finding the correspondence between placeholder feature tokens and multi-modal inputs. -In vLLM, this information is specified using {class}`~vllm.multimodal.processing.PromptUpdate` in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates`. We can automatically detect whether HF has updated the prompt by checking the existence of the updated tokens. +In vLLM, this information is specified using [PromptUpdate][vllm.multimodal.processing.PromptUpdate] in [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates]. We can automatically detect whether HF has updated the prompt by checking the existence of the updated tokens. ## Tokenized Prompt Inputs @@ -43,22 +44,22 @@ While HF processors support text + multi-modal inputs natively, this is not so f Moreover, since the tokenized text has not passed through the HF processor, we have to apply Step 3 by ourselves to keep the output tokens and multi-modal data consistent with each other. -(mm-dummy-text)= +[](){ #mm-dummy-text } ### Dummy text -We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text`. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data. 
+We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via [get_dummy_text][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text]. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data. -(mm-automatic-prompt-updating)= +[](){ #mm-automatic-prompt-updating } ### Automatic prompt updating We address the second issue by implementing model-agnostic code in -{meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_prompt_updates` to automatically update the prompt with feature placeholder tokens based on the specification outputted by {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates`. +[_apply_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._apply_prompt_updates] to automatically update the prompt with feature placeholder tokens based on the specification outputted by [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates]. ### Summary -With the help of dummy text and automatic prompt updating, our multi-modal processor can finally accept both text and token prompts with multi-modal data. The detailed logic is shown in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_main`. +With the help of dummy text and automatic prompt updating, our multi-modal processor can finally accept both text and token prompts with multi-modal data. The detailed logic is shown in [_apply_hf_processor_main][vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_main]. ## Processor Output Caching @@ -66,4 +67,4 @@ Some HF processors, such as the one for Qwen2-VL, are [very slow](gh-issue:9238) When new data is passed in, we first check which items are in the cache, and which ones are missing. 
The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache. -Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text](#mm-dummy-text) to avoid HF errors. Since this skips HF's prompt updating code, we apply [automatic prompt updating](#mm-automatic-prompt-updating) afterwards to keep the output tokens and multi-modal data consistent with each other. +Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text][mm-dummy-text] to avoid HF errors. Since this skips HF's prompt updating code, we apply [automatic prompt updating][mm-automatic-prompt-updating] afterwards to keep the output tokens and multi-modal data consistent with each other. diff --git a/docs/source/design/multiprocessing.md b/docs/design/multiprocessing.md similarity index 93% rename from docs/source/design/multiprocessing.md rename to docs/design/multiprocessing.md index 43fe5fe2e5e94..4d58fae20f06c 100644 --- a/docs/source/design/multiprocessing.md +++ b/docs/design/multiprocessing.md @@ -2,14 +2,13 @@ ## Debugging -Please see the [Troubleshooting](#troubleshooting-python-multiprocessing) +Please see the [Troubleshooting][troubleshooting-python-multiprocessing] page for information on known issues and how to solve them. ## Introduction -:::{important} -The source code references are to the state of the code at the time of writing in December, 2024. -::: +!!! 
warning + The source code references are to the state of the code at the time of writing in December, 2024. The use of Python multiprocessing in vLLM is complicated by: @@ -23,13 +22,13 @@ This document describes how vLLM deals with these challenges. [Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include: -- `spawn` - spawn a new Python process. This will be the default as of Python - 3.14. In macOS, this is already the default. +- `spawn` - spawn a new Python process. The default on Windows and macOS. -- `fork` - Use `os.fork()` to fork the Python interpreter. This is the default - in Python versions prior to 3.14. +- `fork` - Use `os.fork()` to fork the Python interpreter. The default on + Linux for Python versions prior to 3.14. - `forkserver` - Spawn a server process that will fork a new process on request. + The default on Linux for Python version 3.14 and newer. ### Tradeoffs @@ -124,7 +123,7 @@ what is happening. First, a log message from vLLM: WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously initialized. We must use the `spawn` multiprocessing start method. Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See - https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing + https://docs.vllm.ai/en/latest/usage/debugging.html#python-multiprocessing for more information. ``` diff --git a/docs/source/design/plugin_system.md b/docs/design/plugin_system.md similarity index 83% rename from docs/source/design/plugin_system.md rename to docs/design/plugin_system.md index 225030885f629..0764dfb6501bc 100644 --- a/docs/source/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -1,12 +1,13 @@ -(plugin-system)= - -# vLLM's Plugin System +--- +title: vLLM's Plugin System +--- +[](){ #plugin-system } The community frequently requests the ability to extend vLLM with custom features. 
To facilitate this, vLLM includes a plugin system that allows users to add custom features without modifying the vLLM codebase. This document explains how plugins work in vLLM and how to create a plugin for vLLM. ## How Plugins Work in vLLM -Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see [](#arch-overview)), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the [load_general_plugins](https://github.com/vllm-project/vllm/blob/c76ac49d266e27aa3fea84ef2df1f813d24c91c7/vllm/plugins/__init__.py#L16) function in the `vllm.plugins` module. This function is called for every process created by vLLM before it starts any work. +Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see [Arch Overview][arch-overview]), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the [load_general_plugins](https://github.com/vllm-project/vllm/blob/c76ac49d266e27aa3fea84ef2df1f813d24c91c7/vllm/plugins/__init__.py#L16) function in the `vllm.plugins` module. This function is called for every process created by vLLM before it starts any work. ## How vLLM Discovers Plugins @@ -29,8 +30,10 @@ def register(): from vllm import ModelRegistry if "MyLlava" not in ModelRegistry.get_supported_archs(): - ModelRegistry.register_model("MyLlava", - "vllm_add_dummy_model.my_llava:MyLlava") + ModelRegistry.register_model( + "MyLlava", + "vllm_add_dummy_model.my_llava:MyLlava", + ) ``` For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html). 
diff --git a/docs/source/design/v1/metrics.md b/docs/design/v1/metrics.md similarity index 97% rename from docs/source/design/v1/metrics.md rename to docs/design/v1/metrics.md index de80226553728..7156ee9dd3ecb 100644 --- a/docs/source/design/v1/metrics.md +++ b/docs/design/v1/metrics.md @@ -57,11 +57,11 @@ In v0, the following metrics are exposed via a Prometheus-compatible `/metrics` - `vllm:spec_decode_num_draft_tokens_total` (Counter) - `vllm:spec_decode_num_emitted_tokens_total` (Counter) -These are documented under [Inferencing and Serving -> Production Metrics](project:../../serving/metrics.md). +These are documented under [Inferencing and Serving -> Production Metrics](../../usage/metrics.md). ### Grafana Dashboard -vLLM also provides [a reference example](https://docs.vllm.ai/en/latest/getting_started/examples/prometheus_grafana.html) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard. +vLLM also provides [a reference example](https://docs.vllm.ai/en/latest/examples/prometheus_grafana.html) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard. The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important: @@ -222,9 +222,7 @@ And the calculated intervals are: Put another way: -:::{image} /assets/design/v1/metrics/intervals-1.png -:alt: Interval calculations - common case -::: +![Interval calculations - common case](../../assets/design/v1/metrics/intervals-1.png) We explored the possibility of having the frontend calculate these intervals using the timing of events visible by the frontend. However, @@ -239,17 +237,13 @@ When a preemption occurs during decode, since any already generated tokens are reused, we consider the preemption as affecting the inter-token, decode, and inference intervals. 
-:::{image} /assets/design/v1/metrics/intervals-2.png -:alt: Interval calculations - preempted decode -::: +![Interval calculations - preempted decode](../../assets/design/v1/metrics/intervals-2.png) When a preemption occurs during prefill (assuming such an event is possible), we consider the preemption as affecting the time-to-first-token and prefill intervals. -:::{image} /assets/design/v1/metrics/intervals-3.png -:alt: Interval calculations - preempted prefill -::: +![Interval calculations - preempted prefill](../../assets/design/v1/metrics/intervals-3.png) ### Frontend Stats Collection @@ -467,7 +461,7 @@ In general: hatch](https://kubernetes.io/docs/concepts/cluster-administration/system-metrics/#show-hidden-metrics) for some time before deleting them. -See the [deprecation policy](project:../../contributing/deprecation_policy.md) for +See the [deprecation policy](../../contributing/deprecation_policy.md) for the project-wide deprecation policy. ### Unimplemented - `vllm:tokens_total` @@ -679,7 +673,7 @@ v0 has support for OpenTelemetry tracing: - [OpenTelemetry blog post](https://opentelemetry.io/blog/2024/llm-observability/) - [User-facing - docs](https://docs.vllm.ai/en/latest/getting_started/examples/opentelemetry.html) + docs](https://docs.vllm.ai/en/latest/examples/opentelemetry.html) - [Blog post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f) - [IBM product diff --git a/docs/source/design/v1/prefix_caching.md b/docs/design/v1/prefix_caching.md similarity index 94% rename from docs/source/design/v1/prefix_caching.md rename to docs/design/v1/prefix_caching.md index 0f7475777797b..ad041b0059f58 100644 --- a/docs/source/design/v1/prefix_caching.md +++ b/docs/design/v1/prefix_caching.md @@ -122,9 +122,7 @@ There are two design points to highlight: As a result, we will have the following components when the KV cache manager is initialized: -:::{image} 
/assets/design/v1/prefix_caching/overview.png -:alt: Component Overview -::: +![Component Overview](../../assets/design/v1/prefix_caching/overview.png) * Block Pool: A list of KVCacheBlock. * Free Block Queue: Only store the pointers of head and tail blocks for manipulations. @@ -194,9 +192,7 @@ As can be seen, block 3 is a new full block and is cached. However, it is redund When a request is finished, we free all its blocks if no other requests are using them (reference count = 0). In this example, we free request 1 and block 2, 3, 4, 8 associated with it. We can see that the freed blocks are added to the tail of the free queue in the *reverse* order. This is because the last block of a request must hash more tokens and is less likely to be reused by other requests. As a result, it should be evicted first. -:::{image} /assets/design/v1/prefix_caching/free.png -:alt: Free Queue after Free a Request -::: +![Free queue after a request is freed](../../assets/design/v1/prefix_caching/free.png) ### Eviction (LRU) @@ -212,36 +208,24 @@ In this example, we assume the block size is 4 (each block can cache 4 tokens), **Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 3 of 4 tokens. -:::{image} /assets/design/v1/prefix_caching/example-time-1.png -:alt: Example Time 1 -::: +![Example Time 1](../../assets/design/v1/prefix_caching/example-time-1.png) **Time 3: Request 0 makes the block 3 full and asks for a new block to keep decoding.** We cache block 3 and allocate block 4. -:::{image} /assets/design/v1/prefix_caching/example-time-3.png -:alt: Example Time 3 -::: +![Example Time 3](../../assets/design/v1/prefix_caching/example-time-3.png) **Time 4: Request 1 comes in with the 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens. 
-:::{image} /assets/design/v1/prefix_caching/example-time-4.png -:alt: Example Time 4 -::: +![Example Time 4](../../assets/design/v1/prefix_caching/example-time-4.png) **Time 5: Request 0 is finished and free.** Blocks 2, 3 and 4 are added to the free queue in the reverse order (but block 2 and 3 are still cached). Block 0 and 1 are not added to the free queue because they are being used by Request 1. -:::{image} /assets/design/v1/prefix_caching/example-time-5.png -:alt: Example Time 5 -::: +![Example Time 5](../../assets/design/v1/prefix_caching/example-time-5.png) **Time 6: Request 1 is finished and free.** -:::{image} /assets/design/v1/prefix_caching/example-time-6.png -:alt: Example Time 6 -::: +![Example Time 6](../../assets/design/v1/prefix_caching/example-time-6.png) **Time 7: Request 2 comes in with the 29 prompt tokens, where the first 12 tokens are the same as request 0\.** Note that even the block order in the free queue was `7 - 8 - 9 - 4 - 3 - 2 - 6 - 5 - 1 - 0`, the cache hit blocks (i.e., 0, 1, 2) are touched and removed from the queue before allocation, so the free queue becomes `7 - 8 - 9 - 4 - 3 - 6 - 5`. As a result, the allocated blocks are 0 (cached), 1 (cached), 2 (cached), 7, 8, 9, 4, 3 (evicted). -:::{image} /assets/design/v1/prefix_caching/example-time-7.png -:alt: Example Time 7 -::: +![Example Time 7](../../assets/design/v1/prefix_caching/example-time-7.png) diff --git a/docs/source/design/v1/torch_compile.md b/docs/design/v1/torch_compile.md similarity index 98% rename from docs/source/design/v1/torch_compile.md rename to docs/design/v1/torch_compile.md index 4d8ce0fd9227f..64b6f0cc0a9b6 100644 --- a/docs/source/design/v1/torch_compile.md +++ b/docs/design/v1/torch_compile.md @@ -99,7 +99,9 @@ This time, Inductor compilation is completely bypassed, and we will load from di The above example just uses Inductor to compile for a general shape (i.e. symbolic shape). 
We can also use Inductor to compile for some of the specific shapes, for example: -`vllm serve meta-llama/Llama-3.2-1B --compilation_config "{'compile_sizes': [1, 2, 4, 8]}"` +``` +vllm serve meta-llama/Llama-3.2-1B --compilation_config '{"compile_sizes": [1, 2, 4, 8]}' +``` Then it will also compile a specific kernel just for batch size `1, 2, 4, 8`. At this time, all of the shapes in the computation graph are static and known, and we will turn on auto-tuning to tune for max performance. This can be slow when you run it for the first time, but the next time you run it, we can directly bypass the tuning and run the tuned kernel. @@ -134,12 +136,14 @@ The cudagraphs are captured and managed by the compiler backend, and replayed wh By default, vLLM will try to determine a set of sizes to capture cudagraph. You can also override it using the config `cudagraph_capture_sizes`: -`vllm serve meta-llama/Llama-3.2-1B --compilation-config "{'cudagraph_capture_sizes': [1, 2, 4, 8]}"` +``` +vllm serve meta-llama/Llama-3.2-1B --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}' +``` Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture. ### Full Cudagraph capture -It is possible to include attention as part of the cudagraph if using an attention backend that is cudagraph compatible. This can improve performance in some cases such as decode speed for smaller models. Enable this using `--compilation-config "{'full_cuda_graph': True}"` +It is possible to include attention as part of the cudagraph if using an attention backend that is cudagraph compatible. This can improve performance in some cases such as decode speed for smaller models. Enable this using `--compilation-config '{"full_cuda_graph": true}'`. Currently only FlashAttention 3 is compatible, and only when cascade attention is disabled. 
diff --git a/docs/features/automatic_prefix_caching.md b/docs/features/automatic_prefix_caching.md new file mode 100644 index 0000000000000..5e92796ddda7e --- /dev/null +++ b/docs/features/automatic_prefix_caching.md @@ -0,0 +1,28 @@ +--- +title: Automatic Prefix Caching +--- +[](){ #automatic-prefix-caching } + +## Introduction + +Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part. + +!!! note + Technical details on how vLLM implements APC can be found [here][design-automatic-prefix-caching]. + +## Enabling APC in vLLM + +Set `enable_prefix_caching=True` in vLLM engine to enable APC. Here is an example: + + + +## Example workloads + +We describe two example workloads, where APC can provide huge performance benefit: + +- Long document query, where the user repeatedly queries the same long document (e.g. software manual or annual report) with different queries. In this case, instead of processing the long document again and again, APC allows vLLM to process this long document *only once*, and all future requests can avoid recomputing this long document by reusing its KV cache. This allows vLLM to serve future requests with much higher throughput and much lower latency. +- Multi-round conversation, where the user may chat with the application multiple times in the same chatting session. In this case, instead of processing the whole chatting history again and again, APC allows vLLM to reuse the processing results of the chat history across all future rounds of conversation, allowing vLLM to serve future requests with much higher throughput and much lower latency. + +## Limits + +APC in general does not reduce the performance of vLLM. 
With that being said, APC only reduces the time of processing the queries (the prefilling phase) and does not reduce the time of generating new tokens (the decoding phase). So APC does not bring performance gain when vLLM spends most of the time generating answers to the queries (e.g. when the length of the answer is long), or new queries do not share the same prefix with any of existing queries (so that the computation cannot be reused). diff --git a/docs/features/compatibility_matrix.md b/docs/features/compatibility_matrix.md new file mode 100644 index 0000000000000..5d448eb5c03d8 --- /dev/null +++ b/docs/features/compatibility_matrix.md @@ -0,0 +1,81 @@ +--- +title: Compatibility Matrix +--- +[](){ #compatibility-matrix } + +The tables below show mutually exclusive features and the support on some hardware. + +The symbols used have the following meanings: + +- ✅ = Full compatibility +- 🟠 = Partial compatibility +- ❌ = No compatibility +- ❔ = Unknown or TBD + +!!! note + Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/hardware combination. 
+ +## Feature x Feature + + + +| Feature | [CP][chunked-prefill] | [APC][automatic-prefix-caching] | [LoRA][lora-adapter] | prmpt adptr | [SD][spec-decode] | CUDA graph | pooling | enc-dec | logP | prmpt logP | async output | multi-step | mm | best-of | beam-search | +|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---| +| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | | +| [APC][automatic-prefix-caching] | ✅ | ✅ | | | | | | | | | | | | | | +| [LoRA][lora-adapter] | ✅ | ✅ | ✅ | | | | | | | | | | | | | +| prmpt adptr | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | | +| [SD][spec-decode] | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | | +| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | +| pooling | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | | | | | | | | | +| enc-dec | ❌ | [❌](gh-issue:7366) | ❌ | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | +| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | +| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | +| async output | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | +| multi-step | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | +| mm | ✅ | [🟠](gh-pr:8348) | [🟠](gh-pr:4194) | ❔ | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | +| best-of | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | +| beam-search | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | + +[](){ #feature-x-hardware } + +## Feature x Hardware + +| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | +|-----------------------------------------------------------|--------------------|----------|----------|-------|----------|--------------------|-------| +| [CP][chunked-prefill] | [❌](gh-issue:2729) | ✅ 
| ✅ | ✅ | ✅ | ✅ | ✅ | +| [APC][automatic-prefix-caching] | [❌](gh-issue:3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [LoRA][lora-adapter] | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| prmpt adptr | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8475) | ✅ | +| [SD][spec-decode] | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | +| pooling | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | +| enc-dec | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | +| mm | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| async output | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | +| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8477) | ✅ | +| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | + +!!! note + Please refer to [Feature support through NxD Inference backend][feature-support-through-nxd-inference-backend] for features supported on AWS Neuron hardware diff --git a/docs/source/features/disagg_prefill.md b/docs/features/disagg_prefill.md similarity index 87% rename from docs/source/features/disagg_prefill.md rename to docs/features/disagg_prefill.md index 2fa20140c086d..54be05647d940 100644 --- a/docs/source/features/disagg_prefill.md +++ b/docs/features/disagg_prefill.md @@ -1,12 +1,12 @@ -(disagg-prefill)= - -# Disaggregated Prefilling (experimental) +--- +title: Disaggregated Prefilling (experimental) +--- +[](){ #disagg-prefill } This page introduces you the disaggregated prefilling feature in vLLM. -:::{note} -This feature is experimental and subject to change. -::: +!!! note + This feature is experimental and subject to change. ## Why disaggregated prefilling? @@ -15,9 +15,8 @@ Two main reasons: - **Tuning time-to-first-token (TTFT) and inter-token-latency (ITL) separately**. 
Disaggregated prefilling put prefill and decode phase of LLM inference inside different vLLM instances. This gives you the flexibility to assign different parallel strategies (e.g. `tp` and `pp`) to tune TTFT without affecting ITL, or to tune ITL without affecting TTFT. - **Controlling tail ITL**. Without disaggregated prefilling, vLLM may insert some prefill jobs during the decoding of one request. This results in higher tail latency. Disaggregated prefilling helps you solve this issue and control tail ITL. Chunked prefill with a proper chunk size also can achieve the same goal, but in practice it's hard to figure out the correct chunk size value. So disaggregated prefilling is a much more reliable way to control tail ITL. -:::{note} -Disaggregated prefill DOES NOT improve throughput. -::: +!!! note + Disaggregated prefill DOES NOT improve throughput. ## Usage example @@ -39,21 +38,16 @@ Key abstractions for disaggregated prefilling: - **LookupBuffer**: LookupBuffer provides two API: `insert` KV cache and `drop_select` KV cache. The semantics of `insert` and `drop_select` are similar to SQL, where `insert` inserts a KV cache into the buffer, and `drop_select` returns the KV cache that matches the given condition and drop it from the buffer. - **Pipe**: A single-direction FIFO pipe for tensor transmission. It supports `send_tensor` and `recv_tensor`. -:::{note} -`insert` is non-blocking operation but `drop_select` is blocking operation. -::: +!!! note + `insert` is non-blocking operation but `drop_select` is blocking operation. 
Here is a figure illustrating how the above 3 abstractions are organized: -:::{image} /assets/features/disagg_prefill/abstraction.jpg -:alt: Disaggregated prefilling abstractions -::: +![Disaggregated prefilling abstractions](../assets/features/disagg_prefill/abstraction.jpg) The workflow of disaggregated prefilling is as follows: -:::{image} /assets/features/disagg_prefill/overview.jpg -:alt: Disaggregated prefilling workflow -::: +![Disaggregated prefilling workflow](../assets/features/disagg_prefill/overview.jpg) The `buffer` corresponds to `insert` API in LookupBuffer, and the `drop_select` corresponds to `drop_select` API in LookupBuffer. diff --git a/docs/source/features/lora.md b/docs/features/lora.md similarity index 96% rename from docs/source/features/lora.md rename to docs/features/lora.md index 5a3ce0c01f3fa..04e92dbc45924 100644 --- a/docs/source/features/lora.md +++ b/docs/features/lora.md @@ -1,10 +1,11 @@ -(lora-adapter)= - -# LoRA Adapters +--- +title: LoRA Adapters +--- +[](){ #lora-adapter } This document shows you how to use [LoRA adapters](https://arxiv.org/abs/2106.09685) with vLLM on top of a base model. -LoRA adapters can be used with any vLLM model that implements {class}`~vllm.model_executor.models.interfaces.SupportsLoRA`. +LoRA adapters can be used with any vLLM model that implements [SupportsLoRA][vllm.model_executor.models.interfaces.SupportsLoRA]. Adapters can be efficiently served on a per request basis with minimal overhead. First we download the adapter(s) and save them locally with @@ -60,9 +61,8 @@ vllm serve meta-llama/Llama-2-7b-hf \ --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ ``` -:::{note} -The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one. -::: +!!! 
note + The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one. The server entrypoint accepts all other LoRA configuration parameters (`max_loras`, `max_lora_rank`, `max_cpu_loras`, etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along @@ -165,6 +165,7 @@ it will first look in the local directory for a directory `foobar`, and attempt that adapter will then be available for normal use on the server. Alternatively, follow these example steps to implement your own plugin: + 1. Implement the LoRAResolver interface. Example of a simple S3 LoRAResolver implementation: @@ -198,9 +199,9 @@ Alternatively, follow these example steps to implement your own plugin: return lora_request ``` -2. Register LoRAResolver plugin. +2. Register `LoRAResolver` plugin. - ```python + ```python from vllm.lora.resolver import LoRAResolverRegistry s3_resolver = S3LoRAResolver() diff --git a/docs/source/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md similarity index 84% rename from docs/source/features/multimodal_inputs.md rename to docs/features/multimodal_inputs.md index bb2997f008ed5..19b6681729028 100644 --- a/docs/source/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -1,20 +1,20 @@ -(multimodal-inputs)= +--- +title: Multimodal Inputs +--- +[](){ #multimodal-inputs } -# Multimodal Inputs +This page teaches you how to pass multi-modal inputs to [multi-modal models][supported-mm-models] in vLLM. -This page teaches you how to pass multi-modal inputs to [multi-modal models](#supported-mm-models) in vLLM. - -:::{note} -We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes, -and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests. -::: +!!! 
note + We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes, + and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests. ## Offline Inference -To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType`: +To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]: - `prompt`: The prompt should follow the format that is documented on HuggingFace. -- `multi_modal_data`: This is a dictionary that follows the schema defined in {class}`vllm.multimodal.inputs.MultiModalDataDict`. +- `multi_modal_data`: This is a dictionary that follows the schema defined in [vllm.multimodal.inputs.MultiModalDataDict][]. ### Image Inputs @@ -211,16 +211,15 @@ for o in outputs: Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). -:::{important} -A chat template is **required** to use Chat Completions API. -For HF format models, the default chat template is defined inside `chat_template.json` or `tokenizer_config.json`. +!!! warning + A chat template is **required** to use Chat Completions API. + For HF format models, the default chat template is defined inside `chat_template.json` or `tokenizer_config.json`. -If no default chat template is available, we will first look for a built-in fallback in . -If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument. + If no default chat template is available, we will first look for a built-in fallback in . + If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument. -For certain models, we provide alternative chat templates inside . -For example, VLM2Vec uses which is different from the default one for Phi-3-Vision. 
-::: + For certain models, we provide alternative chat templates inside . + For example, VLM2Vec uses which is different from the default one for Phi-3-Vision. ### Image Inputs @@ -284,25 +283,21 @@ print("Chat completion output:", chat_response.choices[0].message.content) Full example: -:::{tip} -Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine, -and pass the file path as `url` in the API request. -::: +!!! tip + Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine, + and pass the file path as `url` in the API request. -:::{tip} -There is no need to place image placeholders in the text content of the API request - they are already represented by the image content. -In fact, you can place image placeholders in the middle of the text by interleaving text and image content. -::: +!!! tip + There is no need to place image placeholders in the text content of the API request - they are already represented by the image content. + In fact, you can place image placeholders in the middle of the text by interleaving text and image content. -:::{note} -By default, the timeout for fetching images through HTTP URL is `5` seconds. -You can override this by setting the environment variable: +!!! note + By default, the timeout for fetching images through HTTP URL is `5` seconds. + You can override this by setting the environment variable: -```console -export VLLM_IMAGE_FETCH_TIMEOUT= -``` - -::: + ```console + export VLLM_IMAGE_FETCH_TIMEOUT= + ``` ### Video Inputs @@ -357,15 +352,13 @@ print("Chat completion output from image url:", result) Full example: -:::{note} -By default, the timeout for fetching videos through HTTP URL is `30` seconds. -You can override this by setting the environment variable: +!!! 
note + By default, the timeout for fetching videos through HTTP URL is `30` seconds. + You can override this by setting the environment variable: -```console -export VLLM_VIDEO_FETCH_TIMEOUT= -``` - -::: + ```console + export VLLM_VIDEO_FETCH_TIMEOUT= + ``` ### Audio Inputs @@ -461,15 +454,13 @@ print("Chat completion output from audio url:", result) Full example: -:::{note} -By default, the timeout for fetching audios through HTTP URL is `10` seconds. -You can override this by setting the environment variable: +!!! note + By default, the timeout for fetching audios through HTTP URL is `10` seconds. + You can override this by setting the environment variable: -```console -export VLLM_AUDIO_FETCH_TIMEOUT= -``` - -::: + ```console + export VLLM_AUDIO_FETCH_TIMEOUT= + ``` ### Embedding Inputs @@ -535,7 +526,6 @@ chat_completion = client.chat.completions.create( ) ``` -:::{note} -Only one message can contain `{"type": "image_embeds"}`. -If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. `image_grid_thw`, `image_sizes`, etc. -::: +!!! note + Only one message can contain `{"type": "image_embeds"}`. + If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. `image_grid_thw`, `image_sizes`, etc. diff --git a/docs/source/features/prompt_embeds.md b/docs/features/prompt_embeds.md similarity index 92% rename from docs/source/features/prompt_embeds.md rename to docs/features/prompt_embeds.md index 9d7b242bbe51d..6f5616e05d8c1 100644 --- a/docs/source/features/prompt_embeds.md +++ b/docs/features/prompt_embeds.md @@ -6,13 +6,12 @@ This page teaches you how to pass prompt embedding inputs to vLLM. The traditional flow of text data for a Large Language Model goes from text to token ids (via a tokenizer) then from token ids to prompt embeddings. 
For a traditional decoder-only model (such as meta-llama/Llama-3.1-8B-Instruct), this step of converting token ids to prompt embeddings happens via a look-up from a learned embedding matrix, but the model is not limited to processing only the embeddings corresponding to its token vocabulary. -:::{note} -Prompt embeddings are currently only supported in the v0 engine. -::: +!!! note + Prompt embeddings are currently only supported in the v0 engine. ## Offline Inference -To input multi-modal data, follow this schema in {class}`vllm.inputs.EmbedsPrompt`: +To input multi-modal data, follow this schema in [vllm.inputs.EmbedsPrompt][]: - `prompt_embeds`: A torch tensor representing a sequence of prompt/token embeddings. This has the shape (sequence_length, hidden_size), where sequence length is the number of tokens embeddings and hidden_size is the hidden size (embedding size) of the model. diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md new file mode 100644 index 0000000000000..71f62065f63d2 --- /dev/null +++ b/docs/features/quantization/README.md @@ -0,0 +1,22 @@ +--- +title: Quantization +--- +[](){ #quantization-index } + +Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices. 
+ +Contents: + +- [Supported_Hardware](supported_hardware.md) +- [Auto_Awq](auto_awq.md) +- [Bnb](bnb.md) +- [Bitblas](bitblas.md) +- [Gguf](gguf.md) +- [Gptqmodel](gptqmodel.md) +- [Int4](int4.md) +- [Int8](int8.md) +- [Fp8](fp8.md) +- [Modelopt](modelopt.md) +- [Quark](quark.md) +- [Quantized_Kvcache](quantized_kvcache.md) +- [Torchao](torchao.md) diff --git a/docs/source/features/quantization/auto_awq.md b/docs/features/quantization/auto_awq.md similarity index 93% rename from docs/source/features/quantization/auto_awq.md rename to docs/features/quantization/auto_awq.md index b4ac597f5a79c..4366a080f52cf 100644 --- a/docs/source/features/quantization/auto_awq.md +++ b/docs/features/quantization/auto_awq.md @@ -1,6 +1,7 @@ -(auto-awq)= - -# AutoAWQ +--- +title: AutoAWQ +--- +[](){ #auto-awq } To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ). Quantization reduces the model's precision from BF16/FP16 to INT4 which effectively reduces the total model memory footprint. 
@@ -41,7 +42,9 @@ print(f'Model is quantized and saved at "{quant_path}"') To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: ```console -python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq +python examples/offline_inference/llm_engine_example.py \ + --model TheBloke/Llama-2-7b-Chat-AWQ \ + --quantization awq ``` AWQ models are also supported directly through the LLM entrypoint: diff --git a/docs/source/features/quantization/bitblas.md b/docs/features/quantization/bitblas.md similarity index 62% rename from docs/source/features/quantization/bitblas.md rename to docs/features/quantization/bitblas.md index d0b2bf858c9b6..9001725d9c02d 100644 --- a/docs/source/features/quantization/bitblas.md +++ b/docs/features/quantization/bitblas.md @@ -1,14 +1,14 @@ -(bitblas)= - -# BitBLAS +--- +title: BitBLAS +--- +[](){ #bitblas } vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more efficient and flexible model inference. Compared to other quantization frameworks, BitBLAS provides more precision combinations. -:::{note} -Ensure your hardware supports the selected `dtype` (`torch.bfloat16` or `torch.float16`). -Most recent NVIDIA GPUs support `float16`, while `bfloat16` is more common on newer architectures like Ampere or Hopper. -For details see [supported hardware](https://docs.vllm.ai/en/latest/features/quantization/supported_hardware.html). -::: +!!! note + Ensure your hardware supports the selected `dtype` (`torch.bfloat16` or `torch.float16`). + Most recent NVIDIA GPUs support `float16`, while `bfloat16` is more common on newer architectures like Ampere or Hopper. + For details see [supported hardware](https://docs.vllm.ai/en/latest/features/quantization/supported_hardware.html). Below are the steps to utilize BitBLAS with vLLM. 
@@ -33,7 +33,12 @@ import torch # "hxbgsyxh/llama-13b-4bit-g-1-bitblas" is a pre-quantized checkpoint. model_id = "hxbgsyxh/llama-13b-4bit-g-1-bitblas" -llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, quantization="bitblas") +llm = LLM( + model=model_id, + dtype=torch.bfloat16, + trust_remote_code=True, + quantization="bitblas" +) ``` ## Read gptq format checkpoint @@ -44,5 +49,11 @@ import torch # "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint. model_id = "hxbgsyxh/llama-13b-4bit-g-1" -llm = LLM(model=model_id, dtype=torch.float16, trust_remote_code=True, quantization="bitblas", max_model_len=1024) +llm = LLM( + model=model_id, + dtype=torch.float16, + trust_remote_code=True, + quantization="bitblas", + max_model_len=1024 +) ``` diff --git a/docs/source/features/quantization/bnb.md b/docs/features/quantization/bnb.md similarity index 79% rename from docs/source/features/quantization/bnb.md rename to docs/features/quantization/bnb.md index 1843a33a3dfdd..a8dc2476f30aa 100644 --- a/docs/source/features/quantization/bnb.md +++ b/docs/features/quantization/bnb.md @@ -1,6 +1,7 @@ -(bits-and-bytes)= - -# BitsAndBytes +--- +title: BitsAndBytes +--- +[](){ #bits-and-bytes } vLLM now supports [BitsAndBytes](https://github.com/TimDettmers/bitsandbytes) for more efficient model inference. BitsAndBytes quantizes models to reduce memory usage and enhance performance without significantly sacrificing accuracy. @@ -14,7 +15,7 @@ pip install bitsandbytes>=0.45.3 vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint. -You can find bitsandbytes quantized models on . +You can find bitsandbytes quantized models on [Hugging Face](https://huggingface.co/models?search=bitsandbytes). And usually, these repositories have a config.json file that includes a quantization_config section. 
## Read quantized checkpoint @@ -26,7 +27,11 @@ from vllm import LLM import torch # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint. model_id = "unsloth/tinyllama-bnb-4bit" -llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True) +llm = LLM( + model=model_id, + dtype=torch.bfloat16, + trust_remote_code=True +) ``` ## Inflight quantization: load as 4bit quantization @@ -37,8 +42,12 @@ For inflight 4bit quantization with BitsAndBytes, you need to explicitly specify from vllm import LLM import torch model_id = "huggyllama/llama-7b" -llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ -quantization="bitsandbytes") +llm = LLM( + model=model_id, + dtype=torch.bfloat16, + trust_remote_code=True, + quantization="bitsandbytes" +) ``` ## OpenAI Compatible Server diff --git a/docs/source/features/quantization/fp8.md b/docs/features/quantization/fp8.md similarity index 88% rename from docs/source/features/quantization/fp8.md rename to docs/features/quantization/fp8.md index cb304d54726c8..01d5d9da046de 100644 --- a/docs/source/features/quantization/fp8.md +++ b/docs/features/quantization/fp8.md @@ -1,6 +1,7 @@ -(fp8)= - -# FP8 W8A8 +--- +title: FP8 W8A8 +--- +[](){ #fp8 } vLLM supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. Currently, only Hopper and Ada Lovelace GPUs are officially supported for W8A8. @@ -14,10 +15,9 @@ The FP8 types typically supported in hardware have two distinct representations, - **E4M3**: Consists of 1 sign bit, 4 exponent bits, and 3 bits of mantissa. It can store values up to +/-448 and `nan`. - **E5M2**: Consists of 1 sign bit, 5 exponent bits, and 2 bits of mantissa. It can store values up to +/-57344, +/- `inf`, and `nan`. The tradeoff for the increased dynamic range is lower precision of the stored values. 
-:::{note} -FP8 computation is supported on NVIDIA GPUs with compute capability > 8.9 (Ada Lovelace, Hopper). -FP8 models will run on compute capability > 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin. -::: +!!! note + FP8 computation is supported on NVIDIA GPUs with compute capability > 8.9 (Ada Lovelace, Hopper). + FP8 models will run on compute capability > 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin. ## Installation @@ -94,9 +94,8 @@ print(result[0].outputs[0].text) Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`): -:::{note} -Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations. -::: +!!! note + Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations. ```console $ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic @@ -133,6 +132,5 @@ result = model.generate("Hello, my name is") print(result[0].outputs[0].text) ``` -:::{warning} -Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. -::: +!!! warning + Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. diff --git a/docs/source/features/quantization/gguf.md b/docs/features/quantization/gguf.md similarity index 64% rename from docs/source/features/quantization/gguf.md rename to docs/features/quantization/gguf.md index e93e4dcd3b578..72f758f653a8f 100644 --- a/docs/source/features/quantization/gguf.md +++ b/docs/features/quantization/gguf.md @@ -1,39 +1,42 @@ -(gguf)= +--- +title: GGUF +--- +[](){ #gguf } -# GGUF +!!! 
warning + Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team. -:::{warning} -Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team. -::: - -:::{warning} -Currently, vllm only supports loading single-file GGUF models. If you have a multi-files GGUF model, you can use [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge them to a single-file model. -::: +!!! warning + Currently, vllm only supports loading single-file GGUF models. If you have a multi-files GGUF model, you can use [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge them to a single-file model. To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command: ```console wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. -vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 +vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 ``` You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs: ```console # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. 
-vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 +vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --tensor-parallel-size 2 ``` -:::{warning} -We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size. -::: +!!! warning + We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size. GGUF assumes that huggingface can convert the metadata to a config file. In case huggingface doesn't support your model you can manually create a config and pass it as hf-config-path ```console # If you model is not supported by huggingface you can manually provide a huggingface compatible config path -vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --hf-config-path Tinyllama/TInyLlama-1.1B-Chat-v1.0 +vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --hf-config-path Tinyllama/TInyLlama-1.1B-Chat-v1.0 ``` You can also use the GGUF model directly through the LLM entrypoint: diff --git a/docs/source/features/quantization/gptqmodel.md b/docs/features/quantization/gptqmodel.md similarity index 95% rename from docs/source/features/quantization/gptqmodel.md rename to docs/features/quantization/gptqmodel.md index 9771d5a4fe9ee..53e938d2cbd7d 100644 --- a/docs/source/features/quantization/gptqmodel.md +++ b/docs/features/quantization/gptqmodel.md @@ -1,6 +1,7 @@ -(gptqmodel)= - -# GPTQModel +--- +title: GPTQModel +--- +[](){ #gptqmodel } To create a new 4-bit or 8-bit GPTQ quantized model, you can leverage [GPTQModel](https://github.com/ModelCloud/GPTQModel) from ModelCloud.AI. 
@@ -58,7 +59,8 @@ model.save(quant_path) To run an GPTQModel quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command: ```console -python examples/offline_inference/llm_engine_example.py --model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2 +python examples/offline_inference/llm_engine_example.py \ + --model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2 ``` ## Using GPTQModel with vLLM's Python API diff --git a/docs/source/features/quantization/int4.md b/docs/features/quantization/int4.md similarity index 94% rename from docs/source/features/quantization/int4.md rename to docs/features/quantization/int4.md index 7a0ab4ad229e6..b7d09206365ff 100644 --- a/docs/source/features/quantization/int4.md +++ b/docs/features/quantization/int4.md @@ -1,14 +1,14 @@ -(int4)= - -# INT4 W4A16 +--- +title: INT4 W4A16 +--- +[](){ #int4 } vLLM supports quantizing weights to INT4 for memory savings and inference acceleration. This quantization method is particularly useful for reducing model size and maintaining low latency in workloads with low queries per second (QPS). Please visit the HF collection of [quantized INT4 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int4-llms-for-vllm-668ec34bf3c9fa45f857df2c). -:::{note} -INT4 computation is supported on NVIDIA GPUs with compute capability > 8.0 (Ampere, Ada Lovelace, Hopper, Blackwell). -::: +!!! note + INT4 computation is supported on NVIDIA GPUs with compute capability > 8.0 (Ampere, Ada Lovelace, Hopper, Blackwell). ## Prerequisites @@ -121,9 +121,8 @@ $ lm_eval --model vllm \ --batch_size 'auto' ``` -:::{note} -Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations. -::: +!!! 
note + Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations. ## Best Practices diff --git a/docs/source/features/quantization/int8.md b/docs/features/quantization/int8.md similarity index 92% rename from docs/source/features/quantization/int8.md rename to docs/features/quantization/int8.md index 1e4b01d35575c..1d9fba9dc87f1 100644 --- a/docs/source/features/quantization/int8.md +++ b/docs/features/quantization/int8.md @@ -1,15 +1,15 @@ -(int8)= - -# INT8 W8A8 +--- +title: INT8 W8A8 +--- +[](){ #int8 } vLLM supports quantizing weights and activations to INT8 for memory savings and inference acceleration. This quantization method is particularly useful for reducing model size while maintaining good performance. Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415). -:::{note} -INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper, Blackwell). -::: +!!! note + INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper, Blackwell). ## Prerequisites @@ -125,9 +125,8 @@ $ lm_eval --model vllm \ --batch_size 'auto' ``` -:::{note} -Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations. -::: +!!! note + Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations. 
## Best Practices diff --git a/docs/source/features/quantization/modelopt.md b/docs/features/quantization/modelopt.md similarity index 100% rename from docs/source/features/quantization/modelopt.md rename to docs/features/quantization/modelopt.md diff --git a/docs/source/features/quantization/quantized_kvcache.md b/docs/features/quantization/quantized_kvcache.md similarity index 98% rename from docs/source/features/quantization/quantized_kvcache.md rename to docs/features/quantization/quantized_kvcache.md index 86e6354ec82e0..e3ebd024bab3c 100644 --- a/docs/source/features/quantization/quantized_kvcache.md +++ b/docs/features/quantization/quantized_kvcache.md @@ -1,6 +1,7 @@ -(quantized-kvcache)= - -# Quantized KV Cache +--- +title: Quantized KV Cache +--- +[](){ #quantized-kvcache } ## FP8 KV Cache diff --git a/docs/source/features/quantization/quark.md b/docs/features/quantization/quark.md similarity index 94% rename from docs/source/features/quantization/quark.md rename to docs/features/quantization/quark.md index 955890dbc75ba..51da98cc09d3f 100644 --- a/docs/source/features/quantization/quark.md +++ b/docs/features/quantization/quark.md @@ -1,6 +1,7 @@ -(quark)= - -# AMD QUARK +--- +title: AMD QUARK +--- +[](){ #quark } Quantization can effectively reduce memory and bandwidth usage, accelerate computation and improve throughput while with minimal accuracy loss. vLLM can leverage [Quark](https://quark.docs.amd.com/latest/), @@ -86,13 +87,12 @@ We need to set the quantization configuration, you can check for further details. Here we use FP8 per-tensor quantization on weight, activation, kv-cache and the quantization algorithm is AutoSmoothQuant. -:::{note} -Note the quantization algorithm needs a JSON config file and the config file is located in -[Quark Pytorch examples](https://quark.docs.amd.com/latest/pytorch/pytorch_examples.html), -under the directory `examples/torch/language_modeling/llm_ptq/models`. 
For example, -AutoSmoothQuant config file for Llama is -`examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`. -::: +!!! note + Note the quantization algorithm needs a JSON config file and the config file is located in + [Quark Pytorch examples](https://quark.docs.amd.com/latest/pytorch/pytorch_examples.html), + under the directory `examples/torch/language_modeling/llm_ptq/models`. For example, + AutoSmoothQuant config file for Llama is + `examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`. ```python from quark.torch.quantization import (Config, QuantizationConfig, diff --git a/docs/features/quantization/supported_hardware.md b/docs/features/quantization/supported_hardware.md new file mode 100644 index 0000000000000..6a585b1ccb2ca --- /dev/null +++ b/docs/features/quantization/supported_hardware.md @@ -0,0 +1,28 @@ +--- +title: Supported Hardware +--- +[](){ #quantization-supported-hardware } + +The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: + +| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | x86 CPU | AWS Neuron | Google TPU | +|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-----------|------------------|--------------| +| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | ❌ | ❌ | +| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | ❌ | ❌ | +| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | +| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | ❌ | +| BitBLAS (GPTQ) | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| AQLM | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | 
❌ | +| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | + +- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. +- ✅︎ indicates that the quantization method is supported on the specified hardware. +- ❌ indicates that the quantization method is not supported on the specified hardware. + +!!! note + This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. + + For the most up-to-date information on hardware support and quantization methods, please refer to or consult with the vLLM development team. diff --git a/docs/source/features/quantization/torchao.md b/docs/features/quantization/torchao.md similarity index 86% rename from docs/source/features/quantization/torchao.md rename to docs/features/quantization/torchao.md index 82100c6ddcac0..a7a517af85aa9 100644 --- a/docs/source/features/quantization/torchao.md +++ b/docs/features/quantization/torchao.md @@ -7,7 +7,9 @@ We recommend installing the latest torchao nightly with ```console # Install the latest TorchAO nightly build # Choose the CUDA version that matches your system (cu126, cu128, etc.) 
-pip install --pre torchao>=10.0.0 --index-url https://download.pytorch.org/whl/nightly/cu126 +pip install \ + --pre torchao>=10.0.0 \ + --index-url https://download.pytorch.org/whl/nightly/cu126 ``` ## Quantizing HuggingFace Models @@ -20,7 +22,12 @@ from torchao.quantization import Int8WeightOnlyConfig model_name = "meta-llama/Meta-Llama-3-8B" quantization_config = TorchAoConfig(Int8WeightOnlyConfig()) -quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config) +quantized_model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype="auto", + device_map="auto", + quantization_config=quantization_config +) tokenizer = AutoTokenizer.from_pretrained(model_name) input_text = "What are we having for dinner?" input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") diff --git a/docs/source/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md similarity index 96% rename from docs/source/features/reasoning_outputs.md rename to docs/features/reasoning_outputs.md index bf4f8901a11a8..cbcb246912f4c 100644 --- a/docs/source/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -1,6 +1,7 @@ -(reasoning-outputs)= - -# Reasoning Outputs +--- +title: Reasoning Outputs +--- +[](){ #reasoning-outputs } vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions. 
@@ -17,17 +18,17 @@ vLLM currently supports the following reasoning models: | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ | | [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ | -:::{note} -IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. -The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`. -::: +!!! note + IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. + The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`. ## Quickstart To use reasoning models, you need to specify the `--reasoning-parser` flags when making a request to the chat completion endpoint. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output. ```bash -vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1 +vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ + --reasoning-parser deepseek_r1 ``` Next, make a request to the model that should return the reasoning content in the response. 
@@ -167,12 +168,10 @@ client = OpenAI( models = client.models.list() model = models.data[0].id - class People(BaseModel): name: str age: int - json_schema = People.model_json_schema() prompt = ("Generate a JSON with the name and age of one random person.") diff --git a/docs/source/features/spec_decode.md b/docs/features/spec_decode.md similarity index 91% rename from docs/source/features/spec_decode.md rename to docs/features/spec_decode.md index f16e0d96522da..5080960f72ddb 100644 --- a/docs/source/features/spec_decode.md +++ b/docs/features/spec_decode.md @@ -1,16 +1,15 @@ -(spec-decode)= +--- +title: Speculative Decoding +--- +[](){ #spec-decode } -# Speculative Decoding +!!! warning + Please note that speculative decoding in vLLM is not yet optimized and does + not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. + The work to optimize it is ongoing and can be followed here: -:::{warning} -Please note that speculative decoding in vLLM is not yet optimized and does -not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. -The work to optimize it is ongoing and can be followed here: -::: - -:::{warning} -Currently, speculative decoding in vLLM is not compatible with pipeline parallelism. -::: +!!! warning + Currently, speculative decoding in vLLM is not compatible with pipeline parallelism. This document shows how to use [Speculative Decoding](https://x.com/karpathy/status/1697318534555336961) with vLLM. Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference. 
@@ -46,14 +45,18 @@ for output in outputs: To perform the same with an online mode launch the server: ```bash -python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \ - --seed 42 -tp 1 --gpu_memory_utilization 0.8 \ +python -m vllm.entrypoints.openai.api_server \ + --host 0.0.0.0 \ + --port 8000 \ + --model facebook/opt-6.7b \ + --seed 42 \ + -tp 1 \ + --gpu_memory_utilization 0.8 \ --speculative_config '{"model": "facebook/opt-125m", "num_speculative_tokens": 5}' ``` -:::{warning} -Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately has been deprecated now. -::: +!!! warning + Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately has been deprecated now. Then use a client: @@ -172,7 +175,7 @@ A variety of speculative models of this type are available on HF hub: ## Speculating using EAGLE based draft models The following code configures vLLM to use speculative decoding where proposals are generated by -an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](). +an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py). 
```python from vllm import LLM, SamplingParams @@ -255,7 +258,7 @@ speculative decoding, breaking down the guarantees into three key areas: 3. **vLLM Logprob Stability** \- vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the same request across runs. For more details, see the FAQ section - titled *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq). + titled *Can the output of a prompt vary across runs in vLLM?* in the [FAQs][faq]. While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding can occur due to following factors: @@ -264,7 +267,7 @@ can occur due to following factors: - **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially due to non-deterministic behavior in batched operations or numerical instability. -For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq). +For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the [FAQs][faq]. 
## Resources for vLLM contributors diff --git a/docs/source/features/structured_outputs.md b/docs/features/structured_outputs.md similarity index 96% rename from docs/source/features/structured_outputs.md rename to docs/features/structured_outputs.md index 03119ec7441c9..f96b598cff98d 100644 --- a/docs/source/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -1,6 +1,7 @@ -(structured-outputs)= - -# Structured Outputs +--- +title: Structured Outputs +--- +[](){ #structured-outputs } vLLM supports the generation of structured outputs using [xgrammar](https://github.com/mlc-ai/xgrammar) or @@ -20,7 +21,7 @@ The following parameters are supported, which must be added as extra parameters: - `guided_grammar`: the output will follow the context free grammar. - `structural_tag`: Follow a JSON schema within a set of specified tags within the generated text. -You can see the complete list of supported parameters on the [OpenAI-Compatible Server](#openai-compatible-server) page. +You can see the complete list of supported parameters on the [OpenAI-Compatible Server][openai-compatible-server] page. Structured outputs are supported by default in the OpenAI-Compatible Server. You may choose to specify the backend to use by setting the @@ -83,13 +84,11 @@ class CarType(str, Enum): truck = "Truck" coupe = "Coupe" - class CarDescription(BaseModel): brand: str model: str car_type: CarType - json_schema = CarDescription.model_json_schema() completion = client.chat.completions.create( @@ -105,11 +104,10 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` -:::{tip} -While not strictly necessary, normally it's better to indicate in the prompt the -JSON schema and how the fields should be populated. This can improve the -results notably in most cases. -::: +!!! tip + While not strictly necessary, normally it's better to indicate in the prompt the + JSON schema and how the fields should be populated. 
This can improve the + results notably in most cases. Finally we have the `guided_grammar` option, which is probably the most difficult to use, but it's really powerful. It allows us to define complete @@ -160,12 +158,10 @@ Here is a simple example demonstrating how to get structured output using Pydant from pydantic import BaseModel from openai import OpenAI - class Info(BaseModel): name: str age: int - client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") completion = client.beta.chat.completions.parse( model="meta-llama/Llama-3.1-8B-Instruct", @@ -199,17 +195,14 @@ from typing import List from pydantic import BaseModel from openai import OpenAI - class Step(BaseModel): explanation: str output: str - class MathResponse(BaseModel): steps: list[Step] final_answer: str - client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") completion = client.beta.chat.completions.parse( model="meta-llama/Llama-3.1-8B-Instruct", diff --git a/docs/source/features/tool_calling.md b/docs/features/tool_calling.md similarity index 95% rename from docs/source/features/tool_calling.md rename to docs/features/tool_calling.md index 2795b769345ee..6ee1060dd050a 100644 --- a/docs/source/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -93,7 +93,7 @@ specify the `name` of one of the tools in the `tool_choice` parameter of the cha ## Required Function Calling -vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The required guided decoding features (JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](https://docs.vllm.ai/en/latest/getting_started/v1_user_guide.html#feature-model) for the V1 engine. 
+vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The required guided decoding features (JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends is on the [roadmap](https://docs.vllm.ai/en/latest/usage/v1_guide.html#feature-model) for the V1 engine. When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter. @@ -158,13 +158,13 @@ All Llama 3.1, 3.2 and 4 models should be supported. * `meta-llama/Llama-3.2-*` * `meta-llama/Llama-4-*` -The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below. +The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below. As for Llama 4 models, it is recommended to use the `llama4_pythonic` tool parser. Other tool calling formats like the built in python tool calling or custom tool calling are not supported. Known issues: -1. Parallel tool calls are not supported. +1. 
Parallel tool calls are not supported for Llama 3, but they are supported for Llama 4 models. 2. The model can generate parameters with a wrong format, such as generating an array serialized as string instead of an array. @@ -177,11 +177,10 @@ images. Recommended flags: `--tool-call-parser llama3_json --chat-template {see_above}` -VLLM also provides a JSON based chat template for Llama 4: -* - this is based on the "official" chat template for the Llama 4 -models, but tweaked so that it works better with vLLM. +vLLM also provides a pythonic and JSON based chat template for Llama 4, but pythonic tool calling is recommended: +* [examples/tool_chat_template_llama4_pythonic.jinja](gh-file:examples/tool_chat_template_llama4_pythonic.jinja) - this is based on the [official chat template](https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/) for the Llama 4 models. -For Llama 4 use `--tool-call-parser llama4_json examples/tool_chat_template_llama4_json.jinja`. +For Llama 4 models, use `--tool-call-parser llama4_pythonic --chat-template examples/tool_chat_template_llama4_pythonic.jinja`. #### IBM Granite @@ -323,7 +322,6 @@ class ExampleToolParser(ToolParser): tool_calls=[], content=text) - ``` Then you can use this plugin in the command line like this. 
diff --git a/docs/getting_started/installation/.nav.yml b/docs/getting_started/installation/.nav.yml new file mode 100644 index 0000000000000..7acfc015ff508 --- /dev/null +++ b/docs/getting_started/installation/.nav.yml @@ -0,0 +1,5 @@ +nav: + - README.md + - gpu.md + - cpu.md + - ai_accelerator.md \ No newline at end of file diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md new file mode 100644 index 0000000000000..36bb16cc02249 --- /dev/null +++ b/docs/getting_started/installation/README.md @@ -0,0 +1,20 @@ +--- +title: Installation +--- +[](){ #installation-index } + +vLLM supports the following hardware platforms: + +- [GPU](gpu.md) + - [NVIDIA CUDA](gpu.md#nvidia-cuda) + - [AMD ROCm](gpu.md#amd-rocm) + - [Intel XPU](gpu.md#intel-xpu) +- [CPU](cpu.md) + - [Intel/AMD x86](cpu.md#intelamd-x86) + - [ARM AArch64](cpu.md#arm-aarch64) + - [Apple silicon](cpu.md#apple-silicon) + - [IBM Z (S390X)](cpu.md#ibm-z-s390x) +- [Other AI accelerators](ai_accelerator.md) + - [Google TPU](ai_accelerator.md#google-tpu) + - [Intel Gaudi](ai_accelerator.md#intel-gaudi) + - [AWS Neuron](ai_accelerator.md#aws-neuron) diff --git a/docs/getting_started/installation/ai_accelerator.md b/docs/getting_started/installation/ai_accelerator.md new file mode 100644 index 0000000000000..a4f136a172fed --- /dev/null +++ b/docs/getting_started/installation/ai_accelerator.md @@ -0,0 +1,117 @@ +# Other AI accelerators + +vLLM is a Python library that supports the following AI accelerators. 
Select your AI accelerator type to see vendor specific instructions: + +=== "Google TPU" + + --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:installation" + +=== "Intel Gaudi" + + --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:installation" + +=== "AWS Neuron" + + --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:installation" + +## Requirements + +=== "Google TPU" + + --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:requirements" + +=== "Intel Gaudi" + + --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:requirements" + +=== "AWS Neuron" + + --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:requirements" + +## Configure a new environment + +=== "Google TPU" + + --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:configure-a-new-environment" + +=== "Intel Gaudi" + + --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:configure-a-new-environment" + +=== "AWS Neuron" + + --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:configure-a-new-environment" + +## Set up using Python + +### Pre-built wheels + +=== "Google TPU" + + --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:pre-built-wheels" + +=== "Intel Gaudi" + + --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:pre-built-wheels" + +=== "AWS Neuron" + + --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:pre-built-wheels" + +### Build wheel from source + +=== "Google TPU" + + --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:build-wheel-from-source" + +=== "Intel Gaudi" + + --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:build-wheel-from-source" + +=== "AWS Neuron" + + --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:build-wheel-from-source" + +## Set up using Docker + +### Pre-built images + +=== "Google 
TPU" + + --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:pre-built-images" + +=== "Intel Gaudi" + + --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:pre-built-images" + +=== "AWS Neuron" + + --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:pre-built-images" + +### Build image from source + +=== "Google TPU" + + --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:build-image-from-source" + +=== "Intel Gaudi" + + --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:build-image-from-source" + +=== "AWS Neuron" + + --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:build-image-from-source" + +## Extra information + +=== "Google TPU" + + --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:extra-information" + +=== "Intel Gaudi" + + --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:extra-information" + +=== "AWS Neuron" + + --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:extra-information" diff --git a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md b/docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md similarity index 83% rename from docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md rename to docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md index 78938de317c48..00935a37417e5 100644 --- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md +++ b/docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md @@ -1,12 +1,12 @@ -# Installation +# --8<-- [start:installation] This tab provides instructions on running vLLM with Intel Gaudi devices. -:::{attention} -There are no pre-built wheels or images for this device, so you must build vLLM from source. -::: +!!! warning + There are no pre-built wheels or images for this device, so you must build vLLM from source. 
-## Requirements +# --8<-- [end:installation] +# --8<-- [start:requirements] - OS: Ubuntu 22.04 LTS - Python: 3.10 @@ -45,16 +45,27 @@ Use the following commands to run a Docker image: ```console docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest -docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +docker run \ + -it \ + --runtime=habana \ + -e HABANA_VISIBLE_DEVICES=all \ + -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ + --cap-add=sys_nice \ + --net=host \ + --ipc=host \ + vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest ``` -## Set up using Python +# --8<-- [end:requirements] +# --8<-- [start:set-up-using-python] -### Pre-built wheels +# --8<-- [end:set-up-using-python] +# --8<-- [start:pre-built-wheels] Currently, there are no pre-built Intel Gaudi wheels. -### Build wheel from source +# --8<-- [end:pre-built-wheels] +# --8<-- [start:build-wheel-from-source] To build and install vLLM from source, run: @@ -75,29 +86,39 @@ pip install -r requirements/hpu.txt python setup.py develop ``` -## Set up using Docker +# --8<-- [end:build-wheel-from-source] +# --8<-- [start:set-up-using-docker] -### Pre-built images +# --8<-- [end:set-up-using-docker] +# --8<-- [start:pre-built-images] Currently, there are no pre-built Intel Gaudi images. -### Build image from source +# --8<-- [end:pre-built-images] +# --8<-- [start:build-image-from-source] ```console docker build -f docker/Dockerfile.hpu -t vllm-hpu-env . 
-docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env +docker run \ + -it \ + --runtime=habana \ + -e HABANA_VISIBLE_DEVICES=all \ + -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ + --cap-add=sys_nice \ + --net=host \ + --rm vllm-hpu-env ``` -:::{tip} -If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered. -::: +!!! tip + If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered. -## Extra information +# --8<-- [end:build-image-from-source] +# --8<-- [start:extra-information] ## Supported features -- [Offline inference](#offline-inference) -- Online serving via [OpenAI-Compatible Server](#openai-compatible-server) +- [Offline inference][offline-inference] +- Online serving via [OpenAI-Compatible Server][openai-compatible-server] - HPU autodetection - no need to manually select device within vLLM - Paged KV cache with algorithms enabled for Intel Gaudi accelerators - Custom Intel Gaudi implementations of Paged Attention, KV cache ops, @@ -157,41 +178,25 @@ Gaudi2 devices. Configurations that are not listed may or may not work. 
Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag. -:::{list-table} vLLM execution modes -:widths: 25 25 50 -:header-rows: 1 +| `PT_HPU_LAZY_MODE` | `enforce_eager` | execution mode | +|----------------------|-------------------|--------------------| +| 0 | 0 | torch.compile | +| 0 | 1 | PyTorch eager mode | +| 1 | 0 | HPU Graphs | +| 1 | 1 | PyTorch lazy mode | +
<figure markdown="span"><figcaption>vLLM execution modes</figcaption></figure>
-- * `PT_HPU_LAZY_MODE` - * `enforce_eager` - * execution mode -- * 0 - * 0 - * torch.compile -- * 0 - * 1 - * PyTorch eager mode -- * 1 - * 0 - * HPU Graphs -- * 1 - * 1 - * PyTorch lazy mode -::: +!!! warning + In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. -:::{warning} -In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. -::: - -(gaudi-bucketing-mechanism)= +[](){ #gaudi-bucketing-mechanism } ### Bucketing mechanism Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - `batch_size` and `sequence_length`. 
-:::{note} -Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. -::: +!!! note + Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: @@ -224,15 +229,13 @@ min = 128, step = 128, max = 512 In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. -:::{warning} -If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario. -::: +!!! warning + If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario. 
As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded and executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket. -:::{note} -Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. -::: +!!! note + Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. ### Warmup @@ -252,11 +255,10 @@ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB ``` -This example uses the same buckets as in the [Bucketing Mechanism](#gaudi-bucketing-mechanism) section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. +This example uses the same buckets as in the [Bucketing Mechanism][gaudi-bucketing-mechanism] section. Each output line corresponds to execution of a single bucket. When a bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. 
-:::{tip} -Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. -::: +!!! tip + Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. ### HPU Graph capture @@ -271,9 +273,8 @@ With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory wil Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages have equal memory constraints. Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. -:::{note} -`gpu_memory_utilization` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory. -::: +!!! note + `gpu_memory_utilization` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. 
If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory. User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented: @@ -282,9 +283,8 @@ User can also configure the strategy for capturing HPU Graphs for prompt and dec When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in `min_tokens` strategy. -:::{note} -`VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below. -::: +!!! 
note + `VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt to do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior of that mechanism can be observed in the example below. Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): @@ -401,3 +401,4 @@ the below: higher batches. You can do that by adding `--enforce-eager` flag to server (for online serving), or by passing `enforce_eager=True` argument to LLM constructor (for offline inference). +# --8<-- [end:extra-information] diff --git a/docs/getting_started/installation/ai_accelerator/neuron.inc.md b/docs/getting_started/installation/ai_accelerator/neuron.inc.md new file mode 100644 index 0000000000000..86c12472fb360 --- /dev/null +++ b/docs/getting_started/installation/ai_accelerator/neuron.inc.md @@ -0,0 +1,154 @@ +# --8<-- [start:installation] + +[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/) is the software development kit (SDK) used to run deep learning and + generative AI workloads on AWS Inferentia and AWS Trainium powered Amazon EC2 instances and UltraServers (Inf1, Inf2, Trn1, Trn2, + and Trn2 UltraServer). Both Trainium and Inferentia are powered by fully-independent heterogeneous compute-units called NeuronCores. + This tab describes how to set up your environment to run vLLM on Neuron. + +!!! warning + There are no pre-built wheels or images for this device, so you must build vLLM from source. 
+ +# --8<-- [end:installation] +# --8<-- [start:requirements] + +- OS: Linux +- Python: 3.9 or newer +- Pytorch 2.5/2.6 +- Accelerator: NeuronCore-v2 (in trn1/inf2 chips) or NeuronCore-v3 (in trn2 chips) +- AWS Neuron SDK 2.23 + +## Configure a new environment + +### Launch a Trn1/Trn2/Inf2 instance and verify Neuron dependencies + +The easiest way to launch a Trainium or Inferentia instance with pre-installed Neuron dependencies is to follow this +[quick start guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/multiframework/multi-framework-ubuntu22-neuron-dlami.html#setup-ubuntu22-multi-framework-dlami) using the Neuron Deep Learning AMI (Amazon machine image). + +- After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance +- Once inside your instance, activate the pre-installed virtual environment for inference by running +```console +source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate +``` + +Refer to the [NxD Inference Setup Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/nxdi-setup.html) +for alternative setup instructions including using Docker and manually installing dependencies. + +!!! note + NxD Inference is the default recommended backend to run inference on Neuron. If you are looking to use the legacy [transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx) + library, refer to [Transformers NeuronX Setup](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/transformers-neuronx/setup/index.html). + +# --8<-- [end:requirements] +# --8<-- [start:set-up-using-python] + +# --8<-- [end:set-up-using-python] +# --8<-- [start:pre-built-wheels] + +Currently, there are no pre-built Neuron wheels. 
+ +# --8<-- [end:pre-built-wheels] +# --8<-- [start:build-wheel-from-source] + +#### Install vLLM from source + +Install vLLM as follows: + +```console +git clone https://github.com/vllm-project/vllm.git +cd vllm +pip install -U -r requirements/neuron.txt +VLLM_TARGET_DEVICE="neuron" pip install -e . +``` + +AWS Neuron maintains a [GitHub fork of vLLM](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2) at + [https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2), which contains several features in addition to what's + available on vLLM V0. Please utilize the AWS Fork for the following features: + +- Llama-3.2 multi-modal support +- Multi-node distributed inference + +Refer to [vLLM User Guide for NxD Inference](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/vllm-user-guide.html) + for more details and usage examples. + +To install the AWS Neuron fork, run the following: + +```console +git clone -b neuron-2.23-vllm-v0.7.2 https://github.com/aws-neuron/upstreaming-to-vllm.git +cd upstreaming-to-vllm +pip install -r requirements/neuron.txt +VLLM_TARGET_DEVICE="neuron" pip install -e . +``` + +Note that the AWS Neuron fork is only intended to support Neuron hardware; compatibility with other hardware is not tested. + +# --8<-- [end:build-wheel-from-source] +# --8<-- [start:set-up-using-docker] + +# --8<-- [end:set-up-using-docker] +# --8<-- [start:pre-built-images] + +Currently, there are no pre-built Neuron images. + +# --8<-- [end:pre-built-images] +# --8<-- [start:build-image-from-source] + +See [deployment-docker-build-image-from-source][deployment-docker-build-image-from-source] for instructions on building the Docker image. + +Make sure to use `docker/Dockerfile.neuron` in place of the default Dockerfile.
+ +# --8<-- [end:build-image-from-source] +# --8<-- [start:extra-information] + +[](){ #feature-support-through-nxd-inference-backend } +### Feature support through NxD Inference backend + +The current vLLM and Neuron integration relies on either the `neuronx-distributed-inference` (preferred) or `transformers-neuronx` backend + to perform most of the heavy lifting which includes PyTorch model initialization, compilation, and runtime execution. Therefore, most + [features supported on Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html) are also available via the vLLM integration. + +To configure NxD Inference features through the vLLM entrypoint, use the `override_neuron_config` setting. Provide the configs you want to override +as a dictionary (or JSON object when starting vLLM from the CLI). For example, to disable auto bucketing, include +```console +override_neuron_config={ + "enable_bucketing":False, +} +``` +or when launching vLLM from the CLI, pass +```console +--override-neuron-config "{\"enable_bucketing\":false}" +``` + +Alternatively, users can directly call the NxDI library to trace and compile your model, then load the pre-compiled artifacts +(via `NEURON_COMPILED_ARTIFACTS` environment variable) in vLLM to run inference workloads. + +### Known limitations + +- EAGLE speculative decoding: NxD Inference requires the EAGLE draft checkpoint to include the LM head weights from the target model. Refer to this + [guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html#eagle-checkpoint-compatibility) + for how to convert pretrained EAGLE model checkpoints to be compatible for NxDI. +- Quantization: the native quantization flow in vLLM is not well supported on NxD Inference. 
It is recommended to follow this + [Neuron quantization guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/custom-quantization.html) + to quantize and compile your model using NxD Inference, and then load the compiled artifacts into vLLM. +- Multi-LoRA serving: NxD Inference only supports loading of LoRA adapters at server startup. Dynamic loading of LoRA adapters at + runtime is not currently supported. Refer to [multi-lora example](https://github.com/aws-neuron/upstreaming-to-vllm/blob/neuron-2.23-vllm-v0.7.2/examples/offline_inference/neuron_multi_lora.py) +- Multi-modal support: multi-modal support is only available through the AWS Neuron fork. This feature has not been upstreamed + to vLLM main because NxD Inference currently relies on certain adaptations to the core vLLM logic to support this feature. +- Multi-node support: distributed inference across multiple Trainium/Inferentia instances is only supported on the AWS Neuron fork. Refer + to this [multi-node example](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2/examples/neuron/multi_node) + to run. Note that tensor parallelism (distributed inference across NeuronCores) is available in vLLM main. +- Known edge case bug in speculative decoding: An edge case failure may occur in speculative decoding when sequence length approaches + max model length (e.g. when requesting max tokens up to the max model length and ignoring eos). In this scenario, vLLM may attempt + to allocate an additional block to ensure there is enough memory for number of lookahead slots, but since we do not have good support + for paged attention, there isn't another Neuron block for vLLM to allocate. A workaround fix (to terminate 1 iteration early) is + implemented in the AWS Neuron fork but is not upstreamed to vLLM main as it modifies core vLLM logic. 
+ + +### Environment variables +- `NEURON_COMPILED_ARTIFACTS`: set this environment variable to point to your pre-compiled model artifacts directory to avoid + compilation time upon server initialization. If this variable is not set, the Neuron module will perform compilation and save the + artifacts under `neuron-compiled-artifacts/{unique_hash}/` sub-directory in the model path. If this environment variable is set, + but the directory does not exist, or the contents are invalid, Neuron will also fallback to a new compilation and store the artifacts + under this specified path. +- `NEURON_CONTEXT_LENGTH_BUCKETS`: Bucket sizes for context encoding. (Only applicable to `transformers-neuronx` backend). +- `NEURON_TOKEN_GEN_BUCKETS`: Bucket sizes for token generation. (Only applicable to `transformers-neuronx` backend). + +# --8<-- [end:extra-information] diff --git a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md b/docs/getting_started/installation/ai_accelerator/tpu.inc.md similarity index 55% rename from docs/source/getting_started/installation/ai_accelerator/tpu.inc.md rename to docs/getting_started/installation/ai_accelerator/tpu.inc.md index 4459cc61e1cde..d0b1681201376 100644 --- a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md +++ b/docs/getting_started/installation/ai_accelerator/tpu.inc.md @@ -1,4 +1,4 @@ -# Installation +# --8<-- [start:installation] Tensor Processing Units (TPUs) are Google's custom-developed application-specific integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs @@ -30,11 +30,11 @@ For TPU pricing information, see [Cloud TPU pricing](https://cloud.google.com/tp You may need additional persistent storage for your TPU VMs. For more information, see [Storage options for Cloud TPU data](https://cloud.devsite.corp.google.com/tpu/docs/storage-options). 
-:::{attention} -There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source. -::: +!!! warning + There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source. -## Requirements +# --8<-- [end:installation] +# --8<-- [start:requirements] - Google Cloud TPU VM - TPU versions: v6e, v5e, v5p, v4 @@ -51,10 +51,9 @@ When you request queued resources, the request is added to a queue maintained by the Cloud TPU service. When the requested resource becomes available, it's assigned to your Google Cloud project for your immediate exclusive use. -:::{note} -In all of the following commands, replace the ALL CAPS parameter names with -appropriate values. See the parameter descriptions table for more information. -::: +!!! note + In all of the following commands, replace the ALL CAPS parameter names with + appropriate values. See the parameter descriptions table for more information. ### Provision Cloud TPUs with GKE @@ -79,33 +78,15 @@ gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ --service-account SERVICE_ACCOUNT ``` -:::{list-table} Parameter descriptions -:header-rows: 1 - -- * Parameter name - * Description -- * QUEUED_RESOURCE_ID - * The user-assigned ID of the queued resource request. -- * TPU_NAME - * The user-assigned name of the TPU which is created when the queued - resource request is allocated. -- * PROJECT_ID - * Your Google Cloud project -- * ZONE - * The GCP zone where you want to create your Cloud TPU. The value you use - depends on the version of TPUs you are using. For more information, see - `TPU regions and zones `_ -- * ACCELERATOR_TYPE - * The TPU version you want to use. Specify the TPU version, for example - `v5litepod-4` specifies a v5e TPU with 4 cores, `v6e-1` specifies a v6e TPU with 1 core. 
For more information, - see [TPU versions](https://cloud.devsite.corp.google.com/tpu/docs/system-architecture-tpu-vm#versions). -- * RUNTIME_VERSION - * The TPU VM runtime version to use. For example, use `v2-alpha-tpuv6e` for a VM loaded with one or more v6e TPU(s). For more information see [TPU VM images](https://cloud.google.com/tpu/docs/runtimes). -- * SERVICE_ACCOUNT - * The email address for your service account. You can find it in the IAM - Cloud Console under *Service Accounts*. For example: - `tpu-service-account@.iam.gserviceaccount.com` -::: +| Parameter name | Description | +|--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| QUEUED_RESOURCE_ID | The user-assigned ID of the queued resource request. | +| TPU_NAME | The user-assigned name of the TPU which is created when the queued resource request is allocated. | +| PROJECT_ID | Your Google Cloud project | +| ZONE | The GCP zone where you want to create your Cloud TPU. The value you use depends on the version of TPUs you are using. For more information, see [TPU regions and zones](https://cloud.google.com/tpu/docs/regions-zones). | +| ACCELERATOR_TYPE | The TPU version you want to use. Specify the TPU version, for example `v5litepod-4` specifies a v5e TPU with 4 cores, `v6e-1` specifies a v6e TPU with 1 core. For more information, see [TPU versions](https://cloud.devsite.corp.google.com/tpu/docs/system-architecture-tpu-vm#versions). | +| RUNTIME_VERSION | The TPU VM runtime version to use. For example, use `v2-alpha-tpuv6e` for a VM loaded with one or more v6e TPU(s). For more information see [TPU VM images](https://cloud.google.com/tpu/docs/runtimes). | +| SERVICE_ACCOUNT | The email address for your service account. You can find it in the IAM Cloud Console under *Service Accounts*. For example: `tpu-service-account@PROJECT_ID.iam.gserviceaccount.com` | +
Parameter descriptions
Connect to your TPU using SSH: @@ -113,13 +94,16 @@ Connect to your TPU using SSH: gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE ``` -## Set up using Python +# --8<-- [end:requirements] +# --8<-- [start:set-up-using-python] -### Pre-built wheels +# --8<-- [end:set-up-using-python] +# --8<-- [start:pre-built-wheels] Currently, there are no pre-built TPU wheels. -### Build wheel from source +# --8<-- [end:pre-built-wheels] +# --8<-- [start:build-wheel-from-source] Install Miniconda: @@ -161,13 +145,16 @@ Run the setup script: VLLM_TARGET_DEVICE="tpu" python -m pip install -e . ``` -## Set up using Docker +# --8<-- [end:build-wheel-from-source] +# --8<-- [start:set-up-using-docker] -### Pre-built images +# --8<-- [end:set-up-using-docker] +# --8<-- [start:pre-built-images] -See for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`. +See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`. -### Build image from source +# --8<-- [end:pre-built-images] +# --8<-- [start:build-image-from-source] You can use `docker/Dockerfile.tpu` to build a Docker image with TPU support. @@ -182,31 +169,30 @@ Run the Docker image with the following command: docker run --privileged --net host --shm-size=16G -it vllm-tpu ``` -:::{note} -Since TPU relies on XLA which requires static shapes, vLLM bucketizes the -possible input shapes and compiles an XLA graph for each shape. The -compilation time may take 20~30 minutes in the first run. However, the -compilation time reduces to ~5 minutes afterwards because the XLA graphs are -cached in the disk (in {code}`VLLM_XLA_CACHE_PATH` or {code}`~/.cache/vllm/xla_cache` by default). -::: +!!! note + Since TPU relies on XLA which requires static shapes, vLLM bucketizes the + possible input shapes and compiles an XLA graph for each shape.
The + compilation time may take 20~30 minutes in the first run. However, the + compilation time reduces to ~5 minutes afterwards because the XLA graphs are + cached in the disk (in `VLLM_XLA_CACHE_PATH` or `~/.cache/vllm/xla_cache` by default). -:::{tip} -If you encounter the following error: +!!! tip + If you encounter the following error: -```console -from torch._C import * # noqa: F403 -ImportError: libopenblas.so.0: cannot open shared object file: No such -file or directory -``` + ```console + from torch._C import * # noqa: F403 + ImportError: libopenblas.so.0: cannot open shared object file: No such + file or directory + ``` -Install OpenBLAS with the following command: + Install OpenBLAS with the following command: -```console -sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev -``` + ```console + sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev + ``` -::: - -## Extra information +# --8<-- [end:build-image-from-source] +# --8<-- [start:extra-information] There is no extra information for this device. +# --8<-- [end:extra-information] diff --git a/docs/source/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md similarity index 74% rename from docs/source/getting_started/installation/cpu.md rename to docs/getting_started/installation/cpu.md index 2c0ec60d7100f..18c96b264ad82 100644 --- a/docs/source/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -2,107 +2,47 @@ vLLM is a Python library that supports the following CPU variants. 
Select your CPU type to see vendor specific instructions: -:::::{tab-set} -:sync-group: device +=== "Intel/AMD x86" -::::{tab-item} Intel/AMD x86 -:selected: -:sync: x86 + --8<-- "docs/getting_started/installation/cpu/x86.inc.md:installation" -:::{include} cpu/x86.inc.md -:start-after: "# Installation" -:end-before: "## Requirements" -::: +=== "ARM AArch64" -:::: + --8<-- "docs/getting_started/installation/cpu/arm.inc.md:installation" -::::{tab-item} ARM AArch64 -:sync: arm +=== "Apple silicon" -:::{include} cpu/arm.inc.md -:start-after: "# Installation" -:end-before: "## Requirements" -::: + --8<-- "docs/getting_started/installation/cpu/apple.inc.md:installation" -:::: +=== "IBM Z (S390X)" -::::{tab-item} Apple silicon -:sync: apple - -:::{include} cpu/apple.inc.md -:start-after: "# Installation" -:end-before: "## Requirements" -::: - -:::: - -::::{tab-item} IBM Z (S390X) -:sync: s390x - -:::{include} cpu/s390x.inc.md -:start-after: "# Installation" -:end-before: "## Requirements" -::: - -:::: - -::::: + --8<-- "docs/getting_started/installation/cpu/s390x.inc.md:installation" ## Requirements - Python: 3.9 -- 3.12 -:::::{tab-set} -:sync-group: device +=== "Intel/AMD x86" -::::{tab-item} Intel/AMD x86 -:sync: x86 + --8<-- "docs/getting_started/installation/cpu/x86.inc.md:requirements" -:::{include} cpu/x86.inc.md -:start-after: "## Requirements" -:end-before: "## Set up using Python" -::: +=== "ARM AArch64" -:::: + --8<-- "docs/getting_started/installation/cpu/arm.inc.md:requirements" -::::{tab-item} ARM AArch64 -:sync: arm +=== "Apple silicon" -:::{include} cpu/arm.inc.md -:start-after: "## Requirements" -:end-before: "## Set up using Python" -::: + --8<-- "docs/getting_started/installation/cpu/apple.inc.md:requirements" -:::: +=== "IBM Z (S390X)" -::::{tab-item} Apple silicon -:sync: apple - -:::{include} cpu/apple.inc.md -:start-after: "## Requirements" -:end-before: "## Set up using Python" -::: - -:::: - -::::{tab-item} IBM Z (S390X) -:sync: s390x - 
-:::{include} cpu/s390x.inc.md -:start-after: "## Requirements" -:end-before: "## Set up using Python" -::: - -:::: - -::::: + --8<-- "docs/getting_started/installation/cpu/s390x.inc.md:requirements" ## Set up using Python ### Create a new Python environment -:::{include} python_env_setup.inc.md -::: +--8<-- "docs/getting_started/installation/python_env_setup.inc.md" ### Pre-built wheels @@ -110,69 +50,29 @@ Currently, there are no pre-built CPU wheels. ### Build wheel from source -:::::{tab-set} -:sync-group: device +=== "Intel/AMD x86" -::::{tab-item} Intel/AMD x86 -:sync: x86 + --8<-- "docs/getting_started/installation/cpu/x86.inc.md:build-wheel-from-source" -:::{include} cpu/x86.inc.md -:start-after: "### Build wheel from source" -:end-before: "## Set up using Docker" -::: +=== "ARM AArch64" -:::: + --8<-- "docs/getting_started/installation/cpu/arm.inc.md:build-wheel-from-source" -::::{tab-item} ARM AArch64 -:sync: arm +=== "Apple silicon" -:::{include} cpu/arm.inc.md -:start-after: "### Build wheel from source" -:end-before: "## Set up using Docker" -::: + --8<-- "docs/getting_started/installation/cpu/apple.inc.md:build-wheel-from-source" -:::: +=== "IBM Z (s390x)" -::::{tab-item} Apple silicon -:sync: apple - -:::{include} cpu/apple.inc.md -:start-after: "### Build wheel from source" -:end-before: "## Set up using Docker" -::: - -:::: - -::::{tab-item} IBM Z (s390x) -:sync: s390x - -:::{include} cpu/s390x.inc.md -:start-after: "### Build wheel from source" -:end-before: "## Set up using Docker" -::: - -:::: - -::::: + --8<-- "docs/getting_started/installation/cpu/s390x.inc.md:build-wheel-from-source" ## Set up using Docker ### Pre-built images -:::::{tab-set} -:sync-group: device +=== "Intel/AMD x86" -::::{tab-item} Intel/AMD x86 -:sync: x86 - -:::{include} cpu/x86.inc.md -:start-after: "### Pre-built images" -:end-before: "### Build image from source" -::: - -:::: - -::::: + --8<-- "docs/getting_started/installation/cpu/x86.inc.md:pre-built-images" ### Build 
image from source @@ -192,13 +92,11 @@ $ docker run --rm \ other vLLM OpenAI server arguments ``` -::::{tip} -For ARM or Apple silicon, use `docker/Dockerfile.arm` -:::: +!!! tip + For ARM or Apple silicon, use `docker/Dockerfile.arm` -::::{tip} -For IBM Z (s390x), use `docker/Dockerfile.s390x` and in `docker run` use flag `--dtype float` -:::: +!!! tip + For IBM Z (s390x), use `docker/Dockerfile.s390x` and in `docker run` use flag `--dtype float` ## Supported features diff --git a/docs/source/getting_started/installation/cpu/apple.inc.md b/docs/getting_started/installation/cpu/apple.inc.md similarity index 58% rename from docs/source/getting_started/installation/cpu/apple.inc.md rename to docs/getting_started/installation/cpu/apple.inc.md index 7bc9e85ecd964..7a91e3ce5e5bc 100644 --- a/docs/source/getting_started/installation/cpu/apple.inc.md +++ b/docs/getting_started/installation/cpu/apple.inc.md @@ -1,24 +1,27 @@ -# Installation +# --8<-- [start:installation] vLLM has experimental support for macOS with Apple silicon. For now, users shall build from the source vLLM to natively run on macOS. Currently the CPU implementation for macOS supports FP32 and FP16 datatypes. -:::{attention} -There are no pre-built wheels or images for this device, so you must build vLLM from source. -::: +!!! warning + There are no pre-built wheels or images for this device, so you must build vLLM from source. 
-## Requirements +# --8<-- [end:installation] +# --8<-- [start:requirements] - OS: `macOS Sonoma` or later - SDK: `XCode 15.4` or later with Command Line Tools - Compiler: `Apple Clang >= 15.0.0` -## Set up using Python +# --8<-- [end:requirements] +# --8<-- [start:set-up-using-python] -### Pre-built wheels +# --8<-- [end:set-up-using-python] +# --8<-- [start:pre-built-wheels] -### Build wheel from source +# --8<-- [end:pre-built-wheels] +# --8<-- [start:build-wheel-from-source] After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source. @@ -29,9 +32,8 @@ pip install -r requirements/cpu.txt pip install -e . ``` -:::{note} -On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device. -::: +!!! note + On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device. #### Troubleshooting @@ -51,10 +53,15 @@ If the build has error like the following snippet where standard C++ headers can 1 error generated. ``` -## Set up using Docker +# --8<-- [end:build-wheel-from-source] +# --8<-- [start:set-up-using-docker] -### Pre-built images +# --8<-- [end:set-up-using-docker] +# --8<-- [start:pre-built-images] -### Build image from source +# --8<-- [end:pre-built-images] +# --8<-- [start:build-image-from-source] -## Extra information +# --8<-- [end:build-image-from-source] +# --8<-- [start:extra-information] +# --8<-- [end:extra-information] diff --git a/docs/getting_started/installation/cpu/arm.inc.md b/docs/getting_started/installation/cpu/arm.inc.md new file mode 100644 index 0000000000000..59b71dcaf911a --- /dev/null +++ b/docs/getting_started/installation/cpu/arm.inc.md @@ -0,0 +1,41 @@ +# --8<-- [start:installation] + +vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. 
+ +ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. + +!!! warning + There are no pre-built wheels or images for this device, so you must build vLLM from source. + +# --8<-- [end:installation] +# --8<-- [start:requirements] + +- OS: Linux +- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended) +- Instruction Set Architecture (ISA): NEON support is required + +# --8<-- [end:requirements] +# --8<-- [start:set-up-using-python] + +# --8<-- [end:set-up-using-python] +# --8<-- [start:pre-built-wheels] + +# --8<-- [end:pre-built-wheels] +# --8<-- [start:build-wheel-from-source] + +--8<-- "docs/getting_started/installation/cpu/build.inc.md" + +Testing has been conducted on AWS Graviton3 instances for compatibility. + +# --8<-- [end:build-wheel-from-source] +# --8<-- [start:set-up-using-docker] + +# --8<-- [end:set-up-using-docker] +# --8<-- [start:pre-built-images] + +# --8<-- [end:pre-built-images] +# --8<-- [start:build-image-from-source] + +# --8<-- [end:build-image-from-source] +# --8<-- [start:extra-information] +# --8<-- [end:extra-information] diff --git a/docs/source/getting_started/installation/cpu/build.inc.md b/docs/getting_started/installation/cpu/build.inc.md similarity index 96% rename from docs/source/getting_started/installation/cpu/build.inc.md rename to docs/getting_started/installation/cpu/build.inc.md index f385f3d5b1984..7d6472afa7ea7 100644 --- a/docs/source/getting_started/installation/cpu/build.inc.md +++ b/docs/getting_started/installation/cpu/build.inc.md @@ -32,3 +32,5 @@ If you want to develop vllm, install it in editable mode instead.
```console VLLM_TARGET_DEVICE=cpu python setup.py develop ``` + +# --8<-- [end:extra-information] diff --git a/docs/source/getting_started/installation/cpu/s390x.inc.md b/docs/getting_started/installation/cpu/s390x.inc.md similarity index 64% rename from docs/source/getting_started/installation/cpu/s390x.inc.md rename to docs/getting_started/installation/cpu/s390x.inc.md index 9b41173b44cee..670485feefb65 100644 --- a/docs/source/getting_started/installation/cpu/s390x.inc.md +++ b/docs/getting_started/installation/cpu/s390x.inc.md @@ -1,25 +1,28 @@ -# Installation +# --8<-- [start:installation] vLLM has experimental support for s390x architecture on IBM Z platform. For now, users shall build from the vLLM source to natively run on IBM Z platform. Currently the CPU implementation for s390x architecture supports FP32 datatype only. -:::{attention} -There are no pre-built wheels or images for this device, so you must build vLLM from source. -::: +!!! warning + There are no pre-built wheels or images for this device, so you must build vLLM from source. -## Requirements +# --8<-- [end:installation] +# --8<-- [start:requirements] - OS: `Linux` - SDK: `gcc/g++ >= 12.3.0` or later with Command Line Tools - Instruction Set Architecture (ISA): VXE support is required. Works with Z14 and above. - Build install python packages: `pyarrow`, `torch` and `torchvision` -## Set up using Python +# --8<-- [end:requirements] +# --8<-- [start:set-up-using-python] -### Pre-built wheels +# --8<-- [end:set-up-using-python] +# --8<-- [start:pre-built-wheels] -### Build wheel from source +# --8<-- [end:pre-built-wheels] +# --8<-- [start:build-wheel-from-source] Install the following packages from the package manager before building the vLLM. For example on RHEL 9.4: @@ -39,9 +42,8 @@ curl https://sh.rustup.rs -sSf | sh -s -- -y && \ Execute the following commands to build and install vLLM from the source. 
-::::{tip} -Please build the following dependencies, `torchvision`, `pyarrow` from the source before building vLLM. -:::: +!!! tip + Please build the following dependencies, `torchvision`, `pyarrow` from the source before building vLLM. ```console sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds @@ -53,10 +55,15 @@ Please build the following dependencies, `torchvision`, `pyarrow` from the sourc pip install dist/*.whl ``` -## Set up using Docker +# --8<-- [end:build-wheel-from-source] +# --8<-- [start:set-up-using-docker] -### Pre-built images +# --8<-- [end:set-up-using-docker] +# --8<-- [start:pre-built-images] -### Build image from source +# --8<-- [end:pre-built-images] +# --8<-- [start:build-image-from-source] -## Extra information +# --8<-- [end:build-image-from-source] +# --8<-- [start:extra-information] +# --8<-- [end:extra-information] diff --git a/docs/getting_started/installation/cpu/x86.inc.md b/docs/getting_started/installation/cpu/x86.inc.md new file mode 100644 index 0000000000000..9434eeea8b4a1 --- /dev/null +++ b/docs/getting_started/installation/cpu/x86.inc.md @@ -0,0 +1,46 @@ +# --8<-- [start:installation] + +vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. + +!!! warning + There are no pre-built wheels or images for this device, so you must build vLLM from source. + +# --8<-- [end:installation] +# --8<-- [start:requirements] + +- OS: Linux +- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended) +- Instruction Set Architecture (ISA): AVX512 (optional, recommended) + +!!! tip + [Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware. 
+ +# --8<-- [end:requirements] +# --8<-- [start:set-up-using-python] + +# --8<-- [end:set-up-using-python] +# --8<-- [start:pre-built-wheels] + +# --8<-- [end:pre-built-wheels] +# --8<-- [start:build-wheel-from-source] + +--8<-- "docs/getting_started/installation/cpu/build.inc.md" + +!!! note + - AVX512_BF16 is an extension ISA that provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. + - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before building. + +# --8<-- [end:build-wheel-from-source] +# --8<-- [start:set-up-using-docker] + +# --8<-- [end:set-up-using-docker] +# --8<-- [start:pre-built-images] + +See [https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo) + +# --8<-- [end:pre-built-images] +# --8<-- [start:build-image-from-source] + +# --8<-- [end:build-image-from-source] +# --8<-- [start:extra-information] +# --8<-- [end:extra-information] diff --git a/docs/source/getting_started/installation/device.template.md b/docs/getting_started/installation/device.template.md similarity index 100% rename from docs/source/getting_started/installation/device.template.md rename to docs/getting_started/installation/device.template.md diff --git a/docs/getting_started/installation/gpu.md b/docs/getting_started/installation/gpu.md new file mode 100644 index 0000000000000..3c983f600673d --- /dev/null +++ b/docs/getting_started/installation/gpu.md @@ -0,0 +1,124 @@ +# GPU + +vLLM is a Python library that supports the following GPU variants.
Select your GPU type to see vendor specific instructions: + +=== "NVIDIA CUDA" + + --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:installation" + +=== "AMD ROCm" + + --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:installation" + +=== "Intel XPU" + + --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:installation" + +## Requirements + +- OS: Linux +- Python: 3.9 -- 3.12 + +=== "NVIDIA CUDA" + + --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:requirements" + +=== "AMD ROCm" + + --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:requirements" + +=== "Intel XPU" + + --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:requirements" + +## Set up using Python + +### Create a new Python environment + +--8<-- "docs/getting_started/installation/python_env_setup.inc.md" + +=== "NVIDIA CUDA" + + --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:create-a-new-python-environment" + +=== "AMD ROCm" + + There is no extra information on creating a new Python environment for this device. + +=== "Intel XPU" + + There is no extra information on creating a new Python environment for this device. 
+ +### Pre-built wheels + +=== "NVIDIA CUDA" + + --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:pre-built-wheels" + +=== "AMD ROCm" + + --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:pre-built-wheels" + +=== "Intel XPU" + + --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:pre-built-wheels" + +[](){ #build-from-source } + +### Build wheel from source + +=== "NVIDIA CUDA" + + --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:build-wheel-from-source" + +=== "AMD ROCm" + + --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:build-wheel-from-source" + +=== "Intel XPU" + + --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:build-wheel-from-source" + +## Set up using Docker + +### Pre-built images + +=== "NVIDIA CUDA" + + --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:pre-built-images" + +=== "AMD ROCm" + + --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:pre-built-images" + +=== "Intel XPU" + + --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:pre-built-images" + +### Build image from source + +=== "NVIDIA CUDA" + + --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:build-image-from-source" + +=== "AMD ROCm" + + --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:build-image-from-source" + +=== "Intel XPU" + + --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:build-image-from-source" + +## Supported features + +=== "NVIDIA CUDA" + + --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:supported-features" + +=== "AMD ROCm" + + --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:supported-features" + +=== "Intel XPU" + + --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:supported-features" diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md similarity index 62% rename from docs/source/getting_started/installation/gpu/cuda.inc.md rename to docs/getting_started/installation/gpu/cuda.inc.md 
index 06915f09dd517..64dccef63d73d 100644 --- a/docs/source/getting_started/installation/gpu/cuda.inc.md +++ b/docs/getting_started/installation/gpu/cuda.inc.md @@ -1,43 +1,52 @@ -# Installation +# --8<-- [start:installation] -vLLM contains pre-compiled C++ and CUDA (12.6) binaries. +vLLM contains pre-compiled C++ and CUDA (12.8) binaries. -## Requirements +# --8<-- [end:installation] +# --8<-- [start:requirements] - GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) -## Set up using Python +# --8<-- [end:requirements] +# --8<-- [start:set-up-using-python] ### Create a new Python environment -:::{note} -PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See for more details. -::: +!!! note + PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See for more details. In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. -Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below](#build-from-source) for more details. +Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below][build-from-source] for more details. -### Pre-built wheels +# --8<-- [end:set-up-using-python] +# --8<-- [start:pre-built-wheels] You can install vLLM using either `pip` or `uv pip`: ```console -# Install vLLM with CUDA 12.6. -pip install vllm # If you are using pip. -uv pip install vllm # If you are using uv. 
+# Install vLLM with CUDA 12.8. +# If you are using pip. +pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128 +# If you are using uv. +uv pip install vllm --torch-backend=auto ``` -As of now, vLLM's binaries are compiled with CUDA 12.6 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.8, 11.8, and public PyTorch release versions: +We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`). If this doesn't work, try running `uv self update` to update `uv` first. + +!!! note + NVIDIA Blackwell GPUs (B200, GB200) require a minimum of CUDA 12.8, so make sure you are installing PyTorch wheels with at least that version. PyTorch itself offers a [dedicated interface](https://pytorch.org/get-started/locally/) to determine the appropriate pip command to run for a given target configuration. + +As of now, vLLM's binaries are compiled with CUDA 12.8 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.6, 11.8, and public PyTorch release versions: ```console # Install vLLM with CUDA 11.8. 
export VLLM_VERSION=0.6.1.post1 -export PYTHON_VERSION=310 -pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +export PYTHON_VERSION=312 +uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 ``` -(install-the-latest-code)= +[](){ #install-the-latest-code } #### Install the latest code @@ -46,11 +55,23 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe ##### Install the latest code using `pip` ```console -pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly +pip install -U vllm \ + --pre \ + --extra-index-url https://wheels.vllm.ai/nightly ``` `--pre` is required for `pip` to consider pre-released versions. +Another way to install the latest code is to use `uv`: + +```console +uv pip install -U vllm \ + --torch-backend=auto \ + --extra-index-url https://wheels.vllm.ai/nightly +``` + +##### Install specific revisions using `pip` + If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL: ```console @@ -60,26 +81,21 @@ pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manyl Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. 
The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. -##### Install the latest code using `uv` - -Another way to install the latest code is to use `uv`: - -```console -uv pip install -U vllm --extra-index-url https://wheels.vllm.ai/nightly -``` - ##### Install specific revisions using `uv` If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL: ```console export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch -uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} +uv pip install vllm \ + --torch-backend=auto \ + --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} ``` The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. 
-### Build wheel from source +# --8<-- [end:pre-built-wheels] +# --8<-- [start:build-wheel-from-source] #### Set up using Python-only build (without compilation) @@ -92,15 +108,15 @@ VLLM_USE_PRECOMPILED=1 pip install --editable . ``` This command will do the following: -1. Look for the current branch in your vLLM clone. -2. Identify the corresponding base commit in the main branch. -3. Download the pre-built wheel of the base commit. -4. Use its compiled libraries in the installation. -:::{note} -1. If you change C++ or kernel code, you cannot use Python-only build; otherwise you will see an import error about library not found or undefined symbol. -2. If you rebase your dev branch, it is recommended to uninstall vllm and re-run the above command to make sure your libraries are up to date. -::: +1. Look for the current branch in your vLLM clone. +1. Identify the corresponding base commit in the main branch. +1. Download the pre-built wheel of the base commit. +1. Use its compiled libraries in the installation. + +!!! note + 1. If you change C++ or kernel code, you cannot use Python-only build; otherwise you will see an import error about library not found or undefined symbol. + 2. If you rebase your dev branch, it is recommended to uninstall vllm and re-run the above command to make sure your libraries are up to date. In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the main branch was just merged and the wheel is being built. In this case, you can wait for around an hour to try again, or manually assign the previous commit in the installation using the `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable. @@ -110,12 +126,11 @@ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vll pip install --editable . ``` -You can find more information about vLLM's wheels in . 
+You can find more information about vLLM's wheels in [install-the-latest-code][install-the-latest-code]. -:::{note} -There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. -It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to for instructions on how to install a specified wheel. -::: +!!! note + There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. + It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [install-the-latest-code][install-the-latest-code] for instructions on how to install a specified wheel. #### Full build (with compilation) @@ -127,17 +142,16 @@ cd vllm pip install -e . ``` -:::{tip} -Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. +!!! tip + Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. -For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` . -As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. + For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` . + As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. -When using `ccache` with `pip install -e .`, you should run `CCACHE_NOHASHDIR="true" pip install --no-build-isolation -e .`. 
This is because `pip` creates a new folder with a random name for each build, preventing `ccache` from recognizing that the same files are being built. + When using `ccache` with `pip install -e .`, you should run `CCACHE_NOHASHDIR="true" pip install --no-build-isolation -e .`. This is because `pip` creates a new folder with a random name for each build, preventing `ccache` from recognizing that the same files are being built. -[sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments. -The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`. -::: + [sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments. + The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`. ##### Use an existing PyTorch installation @@ -184,7 +198,11 @@ Additionally, if you have trouble building vLLM, we recommend using the NVIDIA P ```console # Use `--ipc=host` to make sure the shared memory is large enough. -docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 +docker run \ + --gpus all \ + -it \ + --rm \ + --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 ``` If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from [the official website](https://developer.nvidia.com/cuda-toolkit-archive). 
After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.: @@ -212,11 +230,13 @@ export VLLM_TARGET_DEVICE=empty pip install -e . ``` -## Set up using Docker +# --8<-- [end:build-wheel-from-source] +# --8<-- [start:set-up-using-docker] -### Pre-built images +# --8<-- [end:set-up-using-docker] +# --8<-- [start:pre-built-images] -See for instructions on using the official Docker image. +See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for instructions on using the official Docker image. Another way to access the latest code is to use the docker images: @@ -229,10 +249,12 @@ These docker images are used for CI and testing only, and they are not intended The latest code can contain bugs and may not be stable. Please use it with caution. -### Build image from source +# --8<-- [end:pre-built-images] +# --8<-- [start:build-image-from-source] -See for instructions on building the Docker image. +See [deployment-docker-build-image-from-source][deployment-docker-build-image-from-source] for instructions on building the Docker image. ## Supported features -See compatibility matrix for feature support information. +See [feature-x-hardware][feature-x-hardware] compatibility matrix for feature support information. +# --8<-- [end:extra-information] diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md similarity index 66% rename from docs/source/getting_started/installation/gpu/rocm.inc.md rename to docs/getting_started/installation/gpu/rocm.inc.md index dc74368fe2c96..0029b3a244968 100644 --- a/docs/source/getting_started/installation/gpu/rocm.inc.md +++ b/docs/getting_started/installation/gpu/rocm.inc.md @@ -1,28 +1,31 @@ -# Installation +# --8<-- [start:installation] vLLM supports AMD GPUs with ROCm 6.3. 
-:::{attention} -There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source. -::: +!!! warning + There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source. -## Requirements +# --8<-- [end:installation] +# --8<-- [start:requirements] - GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100/1101), Radeon RX 9000 series (gfx1200/1201) - ROCm 6.3 -## Set up using Python +# --8<-- [end:requirements] +# --8<-- [start:set-up-using-python] -### Pre-built wheels +# --8<-- [end:set-up-using-python] +# --8<-- [start:pre-built-wheels] Currently, there are no pre-built ROCm wheels. -### Build wheel from source +# --8<-- [end:pre-built-wheels] +# --8<-- [start:build-wheel-from-source] 0. Install prerequisites (skip if you are already in an environment/docker with the following installed): -- [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) -- [PyTorch](https://pytorch.org/) + - [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) + - [PyTorch](https://pytorch.org/) For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.3_ubuntu24.04_py3.12_pytorch_release_2.4.0`, `rocm/pytorch-nightly`. If you are using docker image, you can skip to Step 3. @@ -49,9 +52,8 @@ Currently, there are no pre-built ROCm wheels. cd ../.. ``` - :::{note} - If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. - ::: + !!! note + If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. 2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention) @@ -69,9 +71,8 @@ Currently, there are no pre-built ROCm wheels. cd .. 
``` - :::{note} - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) - ::: + !!! note + You might need to downgrade the "ninja" version to 1.10 as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) 3. If you choose to build AITER yourself to use a certain branch or commit, you can build AITER using the following steps: @@ -84,55 +85,56 @@ Currently, there are no pre-built ROCm wheels. python3 setup.py develop ``` - :::{note} - You will need to config the `$AITER_BRANCH_OR_COMMIT` for your purpose. - ::: + !!! note + You will need to config the `$AITER_BRANCH_OR_COMMIT` for your purpose. 4. Build vLLM. For example, vLLM on ROCM 6.3 can be built with the following steps: ```bash - $ pip install --upgrade pip + pip install --upgrade pip # Build & install AMD SMI - $ pip install /opt/rocm/share/amd_smi + pip install /opt/rocm/share/amd_smi # Install dependencies - $ pip install --upgrade numba scipy huggingface-hub[cli,hf_transfer] setuptools_scm - $ pip install "numpy<2" - $ pip install -r requirements/rocm.txt + pip install --upgrade numba \ + scipy \ + huggingface-hub[cli,hf_transfer] \ + setuptools_scm + pip install "numpy<2" + pip install -r requirements/rocm.txt # Build vLLM for MI210/MI250/MI300. - $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942" - $ python3 setup.py develop + export PYTORCH_ROCM_ARCH="gfx90a;gfx942" + python3 setup.py develop ``` This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation. - :::{tip} - - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. - - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. 
- - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. - - The ROCm version of PyTorch, ideally, should match the ROCm driver version. - ::: + !!! tip + - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. + - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. + - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. + - The ROCm version of PyTorch, ideally, should match the ROCm driver version. -:::{tip} -- For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level. - For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization). -::: +!!! tip + - For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level. + For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization). ## Set up using Docker (Recommended) -### Pre-built images +# --8<-- [end:set-up-using-docker] +# --8<-- [start:pre-built-images] The [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator.
-:::{tip} -Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html) -for instructions on how to use this prebuilt docker image. -::: +!!! tip + Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html) + for instructions on how to use this prebuilt docker image. -### Build image from source +# --8<-- [end:pre-built-images] +# --8<-- [start:build-image-from-source] Building the Docker image from source is the recommended way to use vLLM with ROCm. @@ -155,7 +157,9 @@ It is important that the user kicks off the docker build using buildkit. Either To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default: ```console -DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm_base -t rocm/vllm-dev:base . +DOCKER_BUILDKIT=1 docker build \ + -f docker/Dockerfile.rocm_base \ + -t rocm/vllm-dev:base . ``` #### Build an image with vLLM @@ -190,7 +194,11 @@ DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm . To build vllm on ROCm 6.3 for Radeon RX7900 series (gfx1100), you should pick the alternative base image: ```console -DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f docker/Dockerfile.rocm -t vllm-rocm . +DOCKER_BUILDKIT=1 docker build \ + --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" \ + -f docker/Dockerfile.rocm \ + -t vllm-rocm \ + . ``` To run the above docker image `vllm-rocm`, use the below command: @@ -213,4 +221,5 @@ Where the `` is the location where the model is stored, for examp ## Supported features -See compatibility matrix for feature support information. +See [feature-x-hardware][feature-x-hardware] compatibility matrix for feature support information. 
+# --8<-- [end:extra-information] diff --git a/docs/source/getting_started/installation/gpu/xpu.inc.md b/docs/getting_started/installation/gpu/xpu.inc.md similarity index 67% rename from docs/source/getting_started/installation/gpu/xpu.inc.md rename to docs/getting_started/installation/gpu/xpu.inc.md index 4ab41a21c2a15..bee9a7ebb717b 100644 --- a/docs/source/getting_started/installation/gpu/xpu.inc.md +++ b/docs/getting_started/installation/gpu/xpu.inc.md @@ -1,23 +1,26 @@ -# Installation +# --8<-- [start:installation] vLLM initially supports basic model inference and serving on Intel GPU platform. -:::{attention} -There are no pre-built wheels or images for this device, so you must build vLLM from source. -::: +!!! warning + There are no pre-built wheels or images for this device, so you must build vLLM from source. -## Requirements +# --8<-- [end:installation] +# --8<-- [start:requirements] - Supported Hardware: Intel Data Center GPU, Intel ARC GPU - OneAPI requirements: oneAPI 2025.0 -## Set up using Python +# --8<-- [end:requirements] +# --8<-- [start:set-up-using-python] -### Pre-built wheels +# --8<-- [end:set-up-using-python] +# --8<-- [start:pre-built-wheels] Currently, there are no pre-built XPU wheels. -### Build wheel from source +# --8<-- [end:pre-built-wheels] +# --8<-- [start:build-wheel-from-source] - First, install required driver and Intel OneAPI 2025.0 or later. - Second, install Python packages for vLLM XPU backend building: @@ -35,18 +38,20 @@ pip install -v -r requirements/xpu.txt VLLM_TARGET_DEVICE=xpu python setup.py install ``` -:::{note} -- FP16 is the default data type in the current XPU backend. The BF16 data - type is supported on Intel Data Center GPU, not supported on Intel Arc GPU yet. -::: +!!! note + - FP16 is the default data type in the current XPU backend. The BF16 data + type is supported on Intel Data Center GPU, not supported on Intel Arc GPU yet. 
-## Set up using Docker +# --8<-- [end:build-wheel-from-source] +# --8<-- [start:set-up-using-docker] -### Pre-built images +# --8<-- [end:set-up-using-docker] +# --8<-- [start:pre-built-images] Currently, there are no pre-built XPU images. -### Build image from source +# --8<-- [end:pre-built-images] +# --8<-- [start:build-image-from-source] ```console $ docker build -f docker/Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . @@ -66,7 +71,6 @@ XPU platform supports **tensor parallel** inference/serving and also supports ** python -m vllm.entrypoints.openai.api_server \ --model=facebook/opt-13b \ --dtype=bfloat16 \ - --device=xpu \ --max_model_len=1024 \ --distributed-executor-backend=ray \ --pipeline-parallel-size=2 \ @@ -74,3 +78,4 @@ python -m vllm.entrypoints.openai.api_server \ ``` By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. +# --8<-- [end:extra-information] diff --git a/docs/getting_started/installation/python_env_setup.inc.md b/docs/getting_started/installation/python_env_setup.inc.md new file mode 100644 index 0000000000000..911301d683359 --- /dev/null +++ b/docs/getting_started/installation/python_env_setup.inc.md @@ -0,0 +1,6 @@ +It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. 
After installing `uv`, you can create a new Python environment and install vLLM using the following commands: + +```console +uv venv --python 3.12 --seed +source .venv/bin/activate +``` diff --git a/docs/source/getting_started/quickstart.md b/docs/getting_started/quickstart.md similarity index 68% rename from docs/source/getting_started/quickstart.md rename to docs/getting_started/quickstart.md index 42468ff73c2c3..d24e75e8141d8 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -1,11 +1,12 @@ -(quickstart)= - -# Quickstart +--- +title: Quickstart +--- +[](){ #quickstart } This guide will help you quickly get started with vLLM to perform: -- [Offline batched inference](#quickstart-offline) -- [Online serving using OpenAI-compatible server](#quickstart-online) +- [Offline batched inference][quickstart-offline] +- [Online serving using OpenAI-compatible server][quickstart-online] ## Prerequisites @@ -21,48 +22,49 @@ It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python env ```console uv venv --python 3.12 --seed source .venv/bin/activate -uv pip install vllm +uv pip install vllm --torch-backend=auto ``` -Another delightful way is to use `uv run` with `--with [dependency]` option, which allows you to run commands such as `vllm serve` without creating an environment: +`uv` can [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`). 
+ +Another delightful way is to use `uv run` with `--with [dependency]` option, which allows you to run commands such as `vllm serve` without creating any permanent environment: ```console uv run --with vllm vllm --help ``` -You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. +You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. You can install `uv` to the conda environment through `pip` if you want to manage it within the environment. ```console conda create -n myenv python=3.12 -y conda activate myenv -pip install vllm +pip install --upgrade uv +uv pip install vllm --torch-backend=auto ``` -:::{note} -For non-CUDA platforms, please refer [here](#installation-index) for specific instructions on how to install vLLM. -::: +!!! note + For more detail and non-CUDA platforms, please refer [here][installation-index] for specific instructions on how to install vLLM. -(quickstart-offline)= +[](){ #quickstart-offline } ## Offline Batched Inference With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: -The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`: +The first line of this example imports the classes [LLM][vllm.LLM] and [SamplingParams][vllm.SamplingParams]: -- {class}`~vllm.LLM` is the main class for running offline inference with vLLM engine. -- {class}`~vllm.SamplingParams` specifies the parameters for the sampling process. +- [LLM][vllm.LLM] is the main class for running offline inference with vLLM engine. +- [SamplingParams][vllm.SamplingParams] specifies the parameters for the sampling process. ```python from vllm import LLM, SamplingParams ``` -The next section defines a list of input prompts and sampling parameters for text generation. 
The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](#sampling-params). -:::{important} -By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the Hugging Face model repository if it exists. In most cases, this will provide you with the best results by default if {class}`~vllm.SamplingParams` is not specified. +The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here][sampling-params]. +!!! warning + By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the Hugging Face model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified. -However, if vLLM's default sampling parameters are preferred, please set `generation_config="vllm"` when creating the {class}`~vllm.LLM` instance. -::: + However, if vLLM's default sampling parameters are preferred, please set `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance. ```python prompts = [ @@ -74,20 +76,18 @@ prompts = [ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) ``` -The {class}`~vllm.LLM` class initializes vLLM's engine and the [OPT-125M model](https://arxiv.org/abs/2205.01068) for offline inference. The list of supported models can be found [here](#supported-models). 
+The [LLM][vllm.LLM] class initializes vLLM's engine and the [OPT-125M model](https://arxiv.org/abs/2205.01068) for offline inference. The list of supported models can be found [here][supported-models]. ```python llm = LLM(model="facebook/opt-125m") ``` -:::{note} -By default, vLLM downloads models from [Hugging Face](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine. +!!! note + By default, vLLM downloads models from [Hugging Face](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine. -```shell -export VLLM_USE_MODELSCOPE=True -``` - -::: + ```shell + export VLLM_USE_MODELSCOPE=True + ``` Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens. @@ -100,7 +100,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -(quickstart-online)= +[](){ #quickstart-online } ## OpenAI-Compatible Server @@ -113,15 +113,13 @@ Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instru vllm serve Qwen/Qwen2.5-1.5B-Instruct ``` -:::{note} -By default, the server uses a predefined chat template stored in the tokenizer. -You can learn about overriding it [here](#chat-template). -::: -:::{important} -By default, the server applies `generation_config.json` from the huggingface model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator. +!!! 
note + By default, the server uses a predefined chat template stored in the tokenizer. + You can learn about overriding it [here][chat-template]. +!!! warning + By default, the server applies `generation_config.json` from the huggingface model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator. -To disable this behavior, please pass `--generation-config vllm` when launching the server. -::: + To disable this behavior, please pass `--generation-config vllm` when launching the server. This server can be queried in the same format as OpenAI API. For example, to list the models: @@ -212,6 +210,5 @@ Currently, vLLM supports multiple backends for efficient Attention computation a If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`. -```{attention} -There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see for instructions on how to install it. -``` +!!! warning + There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see for instructions on how to install it. diff --git a/docs/make.bat b/docs/make.bat deleted file mode 100644 index 747ffb7b30336..0000000000000 --- a/docs/make.bat +++ /dev/null @@ -1,35 +0,0 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=source -set BUILDDIR=build - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.https://www.sphinx-doc.org/ - exit /b 1 -) - -if "%1" == "" goto help - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py new file mode 100644 index 0000000000000..6f290efe45c2f --- /dev/null +++ b/docs/mkdocs/hooks/generate_examples.py @@ -0,0 +1,162 @@ +# SPDX-License-Identifier: Apache-2.0 +import itertools +from dataclasses import dataclass, field +from pathlib import Path +from typing import Literal + +import regex as re + +ROOT_DIR = Path(__file__).parent.parent.parent.parent +ROOT_DIR_RELATIVE = '../../../../..' +EXAMPLE_DIR = ROOT_DIR / "examples" +EXAMPLE_DOC_DIR = ROOT_DIR / "docs/examples" +print(ROOT_DIR.resolve()) +print(EXAMPLE_DIR.resolve()) +print(EXAMPLE_DOC_DIR.resolve()) + + +def fix_case(text: str) -> str: + subs = { + "api": "API", + "cli": "CLI", + "cpu": "CPU", + "llm": "LLM", + "mae": "MAE", + "tpu": "TPU", + "aqlm": "AQLM", + "gguf": "GGUF", + "lora": "LoRA", + "rlhf": "RLHF", + "vllm": "vLLM", + "openai": "OpenAI", + "lmcache": "LMCache", + "multilora": "MultiLoRA", + "mlpspeculator": "MLPSpeculator", + r"fp\d+": lambda x: x.group(0).upper(), # e.g. fp16, fp32 + r"int\d+": lambda x: x.group(0).upper(), # e.g. int8, int16 + } + for pattern, repl in subs.items(): + text = re.sub(rf'\b{pattern}\b', repl, text, flags=re.IGNORECASE) + return text + + +@dataclass +class Example: + """ + Example class for generating documentation content from a given path. + + Attributes: + path (Path): The path to the main directory or file. 
+ category (str): The category of the document. + main_file (Path): The main file in the directory. + other_files (list[Path]): list of other files in the directory. + title (str): The title of the document. + + Methods: + __post_init__(): Initializes the main_file, other_files, and title attributes. + determine_main_file() -> Path: Determines the main file in the given path. + determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file. + determine_title() -> str: Determines the title of the document. + generate() -> str: Generates the documentation content. + """ # noqa: E501 + path: Path + category: str = None + main_file: Path = field(init=False) + other_files: list[Path] = field(init=False) + title: str = field(init=False) + + def __post_init__(self): + self.main_file = self.determine_main_file() + self.other_files = self.determine_other_files() + self.title = self.determine_title() + + def determine_main_file(self) -> Path: + """ + Determines the main file in the given path. + If the path is a file, it returns the path itself. Otherwise, it searches + for Markdown files (*.md) in the directory and returns the last one found. + Returns: + Path: The main file path, either the original path if it's a file or the last + Markdown file found in the directory. + Raises: + IndexError: If no Markdown files are found in the directory. + """ # noqa: E501 + return self.path if self.path.is_file() else list( + self.path.glob("*.md")).pop() + + def determine_other_files(self) -> list[Path]: + """ + Determine other files in the directory excluding the main file. + + This method checks if the given path is a file. If it is, it returns an empty list. + Otherwise, it recursively searches through the directory and returns a list of all + files that are not the main file. + + Returns: + list[Path]: A list of Path objects representing the other files in the directory.
+ """ # noqa: E501 + if self.path.is_file(): + return [] + is_other_file = lambda file: file.is_file() and file != self.main_file + return [file for file in self.path.rglob("*") if is_other_file(file)] + + def determine_title(self) -> str: + return fix_case(self.path.stem.replace("_", " ").title()) + + def generate(self) -> str: + content = f"---\ntitle: {self.title}\n---\n\n" + content += f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n" + + # Use long code fence to avoid issues with + # included files containing code fences too + code_fence = "``````" + is_code = self.main_file.suffix != ".md" + if is_code: + content += f"{code_fence}{self.main_file.suffix[1:]}\n" + content += f'--8<-- "{self.main_file}"\n' + if is_code: + content += f"{code_fence}\n" + content += "\n" + + if not self.other_files: + return content + + content += "## Example materials\n\n" + for file in sorted(self.other_files): + content += f'??? abstract "{file.relative_to(self.path)}"\n' + if file.suffix != ".md": + content += f" {code_fence}{file.suffix[1:]}\n" + content += f' --8<-- "{file}"\n' + if file.suffix != ".md": + content += f" {code_fence}\n" + + return content + + +def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): + # Create the EXAMPLE_DOC_DIR if it doesn't exist + if not EXAMPLE_DOC_DIR.exists(): + EXAMPLE_DOC_DIR.mkdir(parents=True) + + categories = sorted(p for p in EXAMPLE_DIR.iterdir() if p.is_dir()) + + examples = [] + glob_patterns = ["*.py", "*.md", "*.sh"] + # Find categorised examples + for category in categories: + globs = [category.glob(pattern) for pattern in glob_patterns] + for path in itertools.chain(*globs): + examples.append(Example(path, category.stem)) + # Find examples in subdirectories + for path in category.glob("*/*.md"): + examples.append(Example(path.parent, category.stem)) + + # Generate the example documentation + for example in sorted(examples, key=lambda e: e.path.stem): + example_name = f"{example.path.stem}.md" + doc_path = EXAMPLE_DOC_DIR /
example.category / example_name + print(doc_path) + if not doc_path.parent.exists(): + doc_path.parent.mkdir(parents=True) + with open(doc_path, "w+") as f: + f.write(example.generate()) diff --git a/docs/mkdocs/hooks/remove_announcement.py b/docs/mkdocs/hooks/remove_announcement.py new file mode 100644 index 0000000000000..e5f8549d83837 --- /dev/null +++ b/docs/mkdocs/hooks/remove_announcement.py @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: Apache-2.0 +import os +from typing import Literal + + +def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): + # see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa + if os.getenv('READTHEDOCS_VERSION_TYPE') == "tag": + # remove the warning banner if the version is a tagged release + mkdocs_dir = os.path.dirname(os.path.dirname(__file__)) + announcement_path = os.path.join(mkdocs_dir, + "overrides/main.html") + # The file might be removed already if the build is triggered multiple + # times (readthedocs build both HTML and PDF versions separately) + if os.path.exists(announcement_path): + os.remove(announcement_path) diff --git a/docs/mkdocs/hooks/url_schemes.py b/docs/mkdocs/hooks/url_schemes.py new file mode 100644 index 0000000000000..c738828085ba7 --- /dev/null +++ b/docs/mkdocs/hooks/url_schemes.py @@ -0,0 +1,53 @@ +# SPDX-License-Identifier: Apache-2.0 +import regex as re +from mkdocs.config.defaults import MkDocsConfig +from mkdocs.structure.files import Files +from mkdocs.structure.pages import Page + + +def on_page_markdown(markdown: str, *, page: Page, config: MkDocsConfig, + files: Files): + gh_icon = ":octicons-mark-github-16:" + gh_url = "https://github.com" + repo_url = f"{gh_url}/vllm-project/vllm" + org_url = f"{gh_url}/orgs/vllm-project" + urls = { + "issue": f"{repo_url}/issues", + "pr": f"{repo_url}/pull", + "project": f"{org_url}/projects", + "dir": f"{repo_url}/tree/main", + "file": f"{repo_url}/blob/main", + } + titles = { + "issue": "Issue #", + "pr": "Pull
Request #", + "project": "Project #", + "dir": "", + "file": "", + } + + scheme = r"gh-(?P<type>.+?):(?P<path>.+?)(#(?P<fragment>.+?))?" + inline_link = re.compile(r"\[(?P<title>[^\[]+?)\]\(" + scheme + r"\)") + auto_link = re.compile(f"<{scheme}>") + + def replace_inline_link(match: re.Match) -> str: + url = f'{urls[match.group("type")]}/{match.group("path")}' + if fragment := match.group("fragment"): + url += f"#{fragment}" + + return f'[{gh_icon} {match.group("title")}]({url})' + + def replace_auto_link(match: re.Match) -> str: + type = match.group("type") + path = match.group("path") + title = f"{titles[type]}{path}" + url = f"{urls[type]}/{path}" + if fragment := match.group("fragment"): + url += f"#{fragment}" + + return f"[{gh_icon} {title}]({url})" + + markdown = inline_link.sub(replace_inline_link, markdown) + markdown = auto_link.sub(replace_auto_link, markdown) + + return markdown diff --git a/docs/source/_static/custom.js b/docs/mkdocs/javascript/run_llm_widget.js similarity index 54% rename from docs/source/_static/custom.js rename to docs/mkdocs/javascript/run_llm_widget.js index 58bc2ebb9614b..d0e5560e92b4e 100644 --- a/docs/source/_static/custom.js +++ b/docs/mkdocs/javascript/run_llm_widget.js @@ -17,22 +17,3 @@ document.addEventListener("DOMContentLoaded", function () { script.async = true; document.head.appendChild(script); }); - -// Update URL search params when tab is clicked - document.addEventListener("DOMContentLoaded", function () { - const tabs = document.querySelectorAll(".sd-tab-label"); - - function updateURL(tab) { - const syncGroup = tab.getAttribute("data-sync-group"); - const syncId = tab.getAttribute("data-sync-id"); - if (syncGroup && syncId) { - const url = new URL(window.location); - url.searchParams.set(syncGroup, syncId); - window.history.replaceState(null, "", url); - } - } - - tabs.forEach(tab => { - tab.addEventListener("click", () => updateURL(tab)); - }); -}); diff --git a/docs/mkdocs/overrides/main.html b/docs/mkdocs/overrides/main.html new file
mode 100644 index 0000000000000..bdd62ebc158df --- /dev/null +++ b/docs/mkdocs/overrides/main.html @@ -0,0 +1,5 @@ +{% extends "base.html" %} + +{% block announce %} + <p>You are viewing the latest developer preview docs. <a href="https://docs.vllm.ai/en/stable/">Click here</a> to view docs for the latest stable release.</p> +{% endblock %} diff --git a/docs/mkdocs/stylesheets/extra.css b/docs/mkdocs/stylesheets/extra.css new file mode 100644 index 0000000000000..088143ed59563 --- /dev/null +++ b/docs/mkdocs/stylesheets/extra.css @@ -0,0 +1,36 @@ +/* Warning for latest docs */ +.md-banner { + background-color: var(--md-warning-bg-color); + color: var(--md-warning-fg-color); +} + +/* https://christianoliff.com/blog/styling-external-links-with-an-icon-in-css/ */ +a:not(:has(svg)):not(.md-icon):not(.autorefs-external) { + align-items: center; + + &[href^="//"]::after, + &[href^="http://"]::after, + &[href^="https://"]::after { + content: ""; + width: 12px; + height: 12px; + margin-left: 4px; + background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='16' height='16' stroke='gray' viewBox='0 0 16 16'%3E%3Cpath fill-rule='evenodd' d='M8.636 3.5a.5.5 0 0 0-.5-.5H1.5A1.5 1.5 0 0 0 0 4.5v10A1.5 1.5 0 0 0 1.5 16h10a1.5 1.5 0 0 0 1.5-1.5V7.864a.5.5 0 0 0-1 0V14.5a.5.5 0 0 1-.5.5h-10a.5.5 0 0 1-.5-.5v-10a.5.5 0 0 1 .5-.5h6.636a.5.5 0 0 0 .5-.5z'/%3E%3Cpath fill-rule='evenodd' d='M16 .5a.5.5 0 0 0-.5-.5h-5a.5.5 0 0 0 0 1h3.793L6.146 9.146a.5.5 0 1 0 .708.708L15 1.707V5.5a.5.5 0 0 0 1 0v-5z'/%3E%3C/svg%3E"); + background-position: center; + background-repeat: no-repeat; + background-size: contain; + display: inline-block; + } +} + +/* Light mode: darker section titles */ +body[data-md-color-scheme="default"] .md-nav__item--section > label.md-nav__link .md-ellipsis { + color: rgba(0, 0, 0, 0.7) !important; + font-weight: 700; +} + +/* Dark mode: lighter gray section titles */ +body[data-md-color-scheme="slate"] .md-nav__item--section > 
label.md-nav__link .md-ellipsis { + color: rgba(255, 255, 255, 0.75) !important; + font-weight: 700; +} diff --git a/docs/source/models/extensions/fastsafetensor.md b/docs/models/extensions/fastsafetensor.md similarity index 100% rename from docs/source/models/extensions/fastsafetensor.md rename to docs/models/extensions/fastsafetensor.md diff --git a/docs/source/models/extensions/runai_model_streamer.md b/docs/models/extensions/runai_model_streamer.md similarity index 61% rename from docs/source/models/extensions/runai_model_streamer.md rename to docs/models/extensions/runai_model_streamer.md index e0daa6f86dde4..6755b574ea67b 100644 --- a/docs/source/models/extensions/runai_model_streamer.md +++ b/docs/models/extensions/runai_model_streamer.md @@ -1,6 +1,7 @@ -(runai-model-streamer)= - -# Loading models with Run:ai Model Streamer +--- +title: Loading models with Run:ai Model Streamer +--- +[](){ #runai-model-streamer } Run:ai Model Streamer is a library to read tensors in concurrency, while streaming it to GPU memory. Further reading can be found in [Run:ai Model Streamer Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/README.md). 
@@ -15,19 +16,25 @@ pip3 install vllm[runai] To run it as an OpenAI-compatible server, add the `--load-format runai_streamer` flag: ```console -vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \ + --load-format runai_streamer ``` To run model from AWS S3 object store run: ```console -vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer +vllm serve s3://core-llm/Llama-3-8b \ + --load-format runai_streamer ``` To run model from a S3 compatible object store run: ```console -RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer +RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 \ +AWS_EC2_METADATA_DISABLED=true \ +AWS_ENDPOINT_URL=https://storage.googleapis.com \ +vllm serve s3://core-llm/Llama-3-8b \ + --load-format runai_streamer ``` ## Tunable parameters @@ -38,19 +45,22 @@ You can tune `concurrency` that controls the level of concurrency and number of For reading from S3, it will be the number of client instances the host is opening to the S3 server. ```console -vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \ + --load-format runai_streamer \ + --model-loader-extra-config '{"concurrency":16}' ``` You can control the size of the CPU Memory buffer to which tensors are read from the file, and limit this size. You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit). 
```console -vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \ + --load-format runai_streamer \ + --model-loader-extra-config '{"memory_limit":5368709120}' ``` -:::{note} -For further instructions about tunable parameters and additional parameters configurable through environment variables, read the [Environment Variables Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md). -::: +!!! note + For further instructions about tunable parameters and additional parameters configurable through environment variables, read the [Environment Variables Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md). ## Sharded Model Loading @@ -63,7 +73,9 @@ vllm serve /path/to/sharded/model --load-format runai_streamer_sharded The sharded loader expects model files to follow the same naming pattern as the regular sharded state loader: `model-rank-{rank}-part-{part}.safetensors`. You can customize this pattern using the `pattern` parameter in `--model-loader-extra-config`: ```console -vllm serve /path/to/sharded/model --load-format runai_streamer_sharded --model-loader-extra-config '{"pattern":"custom-model-rank-{rank}-part-{part}.safetensors"}' +vllm serve /path/to/sharded/model \ + --load-format runai_streamer_sharded \ + --model-loader-extra-config '{"pattern":"custom-model-rank-{rank}-part-{part}.safetensors"}' ``` To create sharded model files, you can use the script provided in <gh-file:examples/offline_inference/save_sharded_state.py>. This script demonstrates how to save a model in the sharded format that is compatible with the Run:ai Model Streamer sharded loader. 
@@ -71,9 +83,10 @@ To create sharded model files, you can use the script provided in <gh-file:examp The sharded loader supports all the same tunable parameters as the regular Run:ai Model Streamer, including `concurrency` and `memory_limit`. These can be configured in the same way: ```console -vllm serve /path/to/sharded/model --load-format runai_streamer_sharded --model-loader-extra-config '{"concurrency":16, "memory_limit":5368709120}' +vllm serve /path/to/sharded/model \ + --load-format runai_streamer_sharded \ + --model-loader-extra-config '{"concurrency":16, "memory_limit":5368709120}' ``` -:::{note} -The sharded loader is particularly efficient for tensor or pipeline parallel models where each worker only needs to read its own shard rather than the entire checkpoint. -::: +!!! note + The sharded loader is particularly efficient for tensor or pipeline parallel models where each worker only needs to read its own shard rather than the entire checkpoint. diff --git a/docs/source/models/extensions/tensorizer.md b/docs/models/extensions/tensorizer.md similarity index 69% rename from docs/source/models/extensions/tensorizer.md rename to docs/models/extensions/tensorizer.md index cd94c81e620a2..b6feb405c6cac 100644 --- a/docs/source/models/extensions/tensorizer.md +++ b/docs/models/extensions/tensorizer.md @@ -1,6 +1,7 @@ -(tensorizer)= - -# Loading models with CoreWeave's Tensorizer +--- +title: Loading models with CoreWeave's Tensorizer +--- +[](){ #tensorizer } vLLM supports loading models with [CoreWeave's Tensorizer](https://docs.coreweave.com/coreweave-machine-learning-and-ai/inference/tensorizer). vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized @@ -9,8 +10,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor For more information on CoreWeave's Tensorizer, please refer to [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). 
For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see -the [vLLM example script](https://docs.vllm.ai/en/latest/getting_started/examples/tensorize_vllm_model.html). +the [vLLM example script](https://docs.vllm.ai/en/latest/examples/tensorize_vllm_model.html). -:::{note} -Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. -::: +!!! note + Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. diff --git a/docs/source/models/generative_models.md b/docs/models/generative_models.md similarity index 63% rename from docs/source/models/generative_models.md rename to docs/models/generative_models.md index dd765e4a97658..566b1c29fca9f 100644 --- a/docs/source/models/generative_models.md +++ b/docs/models/generative_models.md @@ -1,24 +1,25 @@ -(generative-models)= - -# Generative Models +--- +title: Generative Models +--- +[](){ #generative-models } vLLM provides first-class support for generative models, which covers most of LLMs. -In vLLM, generative models implement the {class}`~vllm.model_executor.models.VllmModelForTextGeneration` interface. +In vLLM, generative models implement the [VllmModelForTextGeneration][vllm.model_executor.models.VllmModelForTextGeneration] interface. Based on the final hidden states of the input, these models output log probabilities of the tokens to generate, -which are then passed through {class}`~vllm.model_executor.layers.Sampler` to obtain the final text. +which are then passed through [Sampler][vllm.model_executor.layers.Sampler] to obtain the final text. For generative models, the only supported `--task` option is `"generate"`. Usually, this is automatically inferred so you don't have to specify it. ## Offline Inference -The {class}`~vllm.LLM` class provides various methods for offline inference. 
-See <project:#configuration> for a list of options when initializing the model. +The [LLM][vllm.LLM] class provides various methods for offline inference. +See [configuration][configuration] for a list of options when initializing the model. ### `LLM.generate` -The {class}`~vllm.LLM.generate` method is available to all generative models in vLLM. +The [generate][vllm.LLM.generate] method is available to all generative models in vLLM. It is similar to [its counterpart in HF Transformers](https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate), except that tokenization and detokenization are also performed automatically. @@ -34,7 +35,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -You can optionally control the language generation by passing {class}`~vllm.SamplingParams`. +You can optionally control the language generation by passing [SamplingParams][vllm.SamplingParams]. For example, you can use greedy sampling by setting `temperature=0`: ```python @@ -50,16 +51,15 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -:::{important} -By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the huggingface model repository if it exists. In most cases, this will provide you with the best results by default if {class}`~vllm.SamplingParams` is not specified. +!!! warning + By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the huggingface model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified. -However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the {class}`~vllm.LLM` instance. 
-::: + However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance. A code example can be found here: <gh-file:examples/offline_inference/basic/basic.py> ### `LLM.beam_search` -The {class}`~vllm.LLM.beam_search` method implements [beam search](https://huggingface.co/docs/transformers/en/generation_strategies#beam-search) on top of {class}`~vllm.LLM.generate`. +The [beam_search][vllm.LLM.beam_search] method implements [beam search](https://huggingface.co/docs/transformers/en/generation_strategies#beam-search) on top of [generate][vllm.LLM.generate]. For example, to search using 5 beams and output at most 50 tokens: ```python @@ -77,14 +77,13 @@ for output in outputs: ### `LLM.chat` -The {class}`~vllm.LLM.chat` method implements chat functionality on top of {class}`~vllm.LLM.generate`. +The [chat][vllm.LLM.chat] method implements chat functionality on top of [generate][vllm.LLM.generate]. In particular, it accepts input similar to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat) and automatically applies the model's [chat template](https://huggingface.co/docs/transformers/en/chat_templating) to format the prompt. -:::{important} -In general, only instruction-tuned models have a chat template. -Base models may perform poorly as they are not trained to respond to the chat conversation. -::: +!!! warning + In general, only instruction-tuned models have a chat template. + Base models may perform poorly as they are not trained to respond to the chat conversation. 
```python from vllm import LLM @@ -133,7 +132,7 @@ outputs = llm.chat(conversation, chat_template=custom_template) ## Online Serving -Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs: +Our [OpenAI-Compatible Server][openai-compatible-server] provides endpoints that correspond to the offline APIs: -- [Completions API](#completions-api) is similar to `LLM.generate` but only accepts text. -- [Chat API](#chat-api) is similar to `LLM.chat`, accepting both text and [multi-modal inputs](#multimodal-inputs) for models with a chat template. +- [Completions API][completions-api] is similar to `LLM.generate` but only accepts text. +- [Chat API][chat-api] is similar to `LLM.chat`, accepting both text and [multi-modal inputs][multimodal-inputs] for models with a chat template. diff --git a/docs/source/models/pooling_models.md b/docs/models/pooling_models.md similarity index 62% rename from docs/source/models/pooling_models.md rename to docs/models/pooling_models.md index 3fd35e2e8bd17..89a128915a76c 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -1,70 +1,48 @@ -(pooling-models)= - -# Pooling Models +--- +title: Pooling Models +--- +[](){ #pooling-models } vLLM also supports pooling models, including embedding, reranking and reward models. -In vLLM, pooling models implement the {class}`~vllm.model_executor.models.VllmModelForPooling` interface. -These models use a {class}`~vllm.model_executor.layers.Pooler` to extract the final hidden states of the input +In vLLM, pooling models implement the [VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface. +These models use a [Pooler][vllm.model_executor.layers.Pooler] to extract the final hidden states of the input before returning them. -:::{note} -We currently support pooling models primarily as a matter of convenience. 
-As shown in the [Compatibility Matrix](#compatibility-matrix), most vLLM features are not applicable to -pooling models as they only work on the generation or decode stage, so performance may not improve as much. -::: +!!! note + We currently support pooling models primarily as a matter of convenience. + As shown in the [Compatibility Matrix][compatibility-matrix], most vLLM features are not applicable to + pooling models as they only work on the generation or decode stage, so performance may not improve as much. For pooling models, we support the following `--task` options. The selected option sets the default pooler used to extract the final hidden states: -:::{list-table} -:widths: 50 25 25 25 -:header-rows: 1 - -- * Task - * Pooling Type - * Normalization - * Softmax -- * Embedding (`embed`) - * `LAST` - * ✅︎ - * ❌ -- * Classification (`classify`) - * `LAST` - * ❌ - * ✅︎ -- * Sentence Pair Scoring (`score`) - * \* - * \* - * \* -- * Reward Modeling (`reward`) - * `ALL` - * ❌ - * ❌ -::: +| Task | Pooling Type | Normalization | Softmax | +|---------------------------------|----------------|-----------------|-----------| +| Embedding (`embed`) | `LAST` | ✅︎ | ❌ | +| Classification (`classify`) | `LAST` | ❌ | ✅︎ | +| Sentence Pair Scoring (`score`) | \* | \* | \* | \*The default pooler is always defined by the model. -:::{note} -If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table. -::: +!!! note + If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table. When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, we attempt to override the default pooler based on its Sentence Transformers configuration file (`modules.json`).
-:::{tip} -You can customize the model's pooling method via the `--override-pooler-config` option, -which takes priority over both the model's and Sentence Transformers's defaults. -::: +!!! tip + You can customize the model's pooling method via the `--override-pooler-config` option, + which takes priority over both the model's and Sentence Transformers's defaults. ## Offline Inference -The {class}`~vllm.LLM` class provides various methods for offline inference. -See <project:#configuration> for a list of options when initializing the model. +The [LLM][vllm.LLM] class provides various methods for offline inference. +See [configuration][configuration] for a list of options when initializing the model. ### `LLM.encode` -The {class}`~vllm.LLM.encode` method is available to all pooling models in vLLM. +The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM. It returns the extracted hidden states directly, which is useful for reward models. ```python @@ -79,7 +57,7 @@ print(f"Data: {data!r}") ### `LLM.embed` -The {class}`~vllm.LLM.embed` method outputs an embedding vector for each prompt. +The [embed][vllm.LLM.embed] method outputs an embedding vector for each prompt. It is primarily designed for embedding models. ```python @@ -96,7 +74,7 @@ A code example can be found here: <gh-file:examples/offline_inference/basic/embe ### `LLM.classify` -The {class}`~vllm.LLM.classify` method outputs a probability vector for each prompt. +The [classify][vllm.LLM.classify] method outputs a probability vector for each prompt. It is primarily designed for classification models. ```python @@ -113,13 +91,12 @@ A code example can be found here: <gh-file:examples/offline_inference/basic/clas ### `LLM.score` -The {class}`~vllm.LLM.score` method outputs similarity scores between sentence pairs. +The [score][vllm.LLM.score] method outputs similarity scores between sentence pairs. It is designed for embedding models and cross encoder models. 
Embedding models use cosine similarity, and [cross-encoder models](https://www.sbert.net/examples/applications/cross-encoder/README.html) serve as rerankers between candidate query-document pairs in RAG systems. -:::{note} -vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG. -To handle RAG at a higher level, you should use integration frameworks such as [LangChain](https://github.com/langchain-ai/langchain). -::: +!!! note + vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG. + To handle RAG at a higher level, you should use integration frameworks such as [LangChain](https://github.com/langchain-ai/langchain). ```python from vllm import LLM @@ -136,27 +113,25 @@ A code example can be found here: <gh-file:examples/offline_inference/basic/scor ## Online Serving -Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs: +Our [OpenAI-Compatible Server][openai-compatible-server] provides endpoints that correspond to the offline APIs: -- [Pooling API](#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models. -- [Embeddings API](#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](#multimodal-inputs) for embedding models. -- [Classification API](#classification-api) is similar to `LLM.classify` and is applicable to sequence classification models. -- [Score API](#score-api) is similar to `LLM.score` for cross-encoder models. +- [Pooling API][pooling-api] is similar to `LLM.encode`, being applicable to all types of pooling models. +- [Embeddings API][embeddings-api] is similar to `LLM.embed`, accepting both text and [multi-modal inputs][multimodal-inputs] for embedding models. +- [Classification API][classification-api] is similar to `LLM.classify` and is applicable to sequence classification models. +- [Score API][score-api] is similar to `LLM.score` for cross-encoder models. 
## Matryoshka Embeddings [Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows user to trade off between performance and cost. -:::{warning} -Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings. +!!! warning + Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings. -For example, setting `dimensions` parameter while using the `BAAI/bge-m3` model will result in the following error. + For example, setting `dimensions` parameter while using the `BAAI/bge-m3` model will result in the following error. -```json -{"object":"error","message":"Model \"BAAI/bge-m3\" does not support matryoshka representation, changing output dimensions will lead to poor results.","type":"BadRequestError","param":null,"code":400} -``` - -::: + ```json + {"object":"error","message":"Model \"BAAI/bge-m3\" does not support matryoshka representation, changing output dimensions will lead to poor results.","type":"BadRequestError","param":null,"code":400} + ``` ### Manually enable Matryoshka Embeddings @@ -172,7 +147,7 @@ vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf_overrides '{"matryoshka_ ### Offline Inference -You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter in {class}`~vllm.PoolingParams`. 
+You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter in [PoolingParams][vllm.PoolingParams]. ```python from vllm import LLM, PoolingParams diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md new file mode 100644 index 0000000000000..b60fefdda2793 --- /dev/null +++ b/docs/models/supported_models.md @@ -0,0 +1,690 @@ +--- +title: Supported Models +--- +[](){ #supported-models } + +vLLM supports [generative](./generative_models.md) and [pooling](./pooling_models.md) models across various tasks. +If a model supports more than one task, you can set the task via the `--task` argument. + +For each task, we list the model architectures that have been implemented in vLLM. +Alongside each architecture, we include some popular models that use it. + +## Model Implementation + +### vLLM + +If vLLM natively supports a model, its implementation can be found in <gh-file:vllm/model_executor/models>. + +These models are what we list in [supported-text-models][supported-text-models] and [supported-mm-models][supported-mm-models]. + +[](){ #transformers-backend } + +### Transformers + +vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models are supported, and vision language model support is planned! + +To check if the modeling backend is Transformers, you can simply do this: + +```python +from vllm import LLM +llm = LLM(model=..., task="generate") # Name or path of your model +llm.apply_model(lambda model: print(type(model))) +``` + +If it is `TransformersForCausalLM` then it means it's based on Transformers! + +!!! tip + You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for [offline-inference][offline-inference] or `--model-impl transformers` for the [openai-compatible-server][openai-compatible-server]. + +!!! 
note + vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM. + +#### Custom models + +If a model is supported natively by neither vLLM nor Transformers, it can still be used in vLLM! + +For a model to be compatible with the Transformers backend for vLLM it must: + +- be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)): + * The model directory must have the correct structure (e.g. `config.json` is present). + * `config.json` must contain `auto_map.AutoModel`. +- be a Transformers backend for vLLM compatible model (see [writing-custom-models][writing-custom-models]): + * Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`). + +If the compatible model is: + +- on the Hugging Face Model Hub, simply set `trust_remote_code=True` for [offline-inference][offline-inference] or `--trust-remote-code` for the [openai-compatible-server][openai-compatible-server]. +- in a local directory, simply pass the directory path to `model=<MODEL_DIR>` for [offline-inference][offline-inference] or `vllm serve <MODEL_DIR>` for the [openai-compatible-server][openai-compatible-server]. + +This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM! + +[](){ #writing-custom-models } + +#### Writing custom models + +This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)). + +To make your model compatible with the Transformers backend, it needs: + +1. 
`kwargs` passed down through all modules from `MyModel` to `MyAttention`. +2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention. +3. `MyModel` must contain `_supports_attention_backend = True`. + +```python title="modeling_my_model.py" + +from transformers import PreTrainedModel +from torch import nn + +class MyAttention(nn.Module): + + def forward(self, hidden_states, **kwargs): + ... + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + **kwargs, + ) + ... + +class MyModel(PreTrainedModel): + _supports_attention_backend = True +``` + +Here is what happens in the background when this model is loaded: + +1. The config is loaded. +2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`. +3. `MyModel` is loaded into `TransformersForCausalLM` (see <gh-file:vllm/model_executor/models/transformers.py>) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used. + +That's it! 
+ +For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class: + +```python title="configuration_my_model.py" + +from transformers import PretrainedConfig + +class MyConfig(PretrainedConfig): + base_model_tp_plan = { + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } +``` + +- `base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported). +- `base_model_pp_plan` is a `dict` that maps direct child layer names to `tuple`s of `list`s of `str`s: + * You only need to do this for layers which are not present on all pipeline stages + * vLLM assumes that there will be only one `nn.ModuleList`, which is distributed across the pipeline stages + * The `list` in the first element of the `tuple` contains the names of the input arguments + * The `list` in the last element of the `tuple` contains the names of the variables the layer outputs to in your modeling code + +## Loading a Model + +### Hugging Face Hub + +By default, vLLM loads models from [Hugging Face (HF) Hub](https://huggingface.co/models). To change the download path for models, you can set the `HF_HOME` environment variable; for more details, refer to [their official documentation](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhome). + +To determine whether a given model is natively supported, you can check the `config.json` file inside the HF repository. 
+If the `"architectures"` field contains a model architecture listed below, then it should be natively supported. + +Models do not _need_ to be natively supported to be used in vLLM. +The [Transformers backend][transformers-backend] enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!). + +!!! tip + The easiest way to check if your model is really supported at runtime is to run the program below: + + ```python + from vllm import LLM + + # For generative models (task=generate) only + llm = LLM(model=..., task="generate") # Name or path of your model + output = llm.generate("Hello, my name is") + print(output) + + # For pooling models (task={embed,classify,reward,score}) only + llm = LLM(model=..., task="embed") # Name or path of your model + output = llm.encode("Hello, my name is") + print(output) + ``` + + If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. + +Otherwise, please refer to [Adding a New Model][new-model] for instructions on how to implement your model in vLLM. +Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support. 
+ +#### Download a model + +If you prefer, you can use the Hugging Face CLI to [download a model](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-download) or specific files from a model repository: + +```console +# Download a model +huggingface-cli download HuggingFaceH4/zephyr-7b-beta + +# Specify a custom cache directory +huggingface-cli download HuggingFaceH4/zephyr-7b-beta --cache-dir ./path/to/cache + +# Download a specific file from a model repo +huggingface-cli download HuggingFaceH4/zephyr-7b-beta eval_results.json +``` + +#### List the downloaded models + +Use the Hugging Face CLI to [manage models](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#scan-your-cache) stored in the local cache: + +```console +# List cached models +huggingface-cli scan-cache + +# Show detailed (verbose) output +huggingface-cli scan-cache -v + +# Specify a custom cache directory +huggingface-cli scan-cache --dir ~/.cache/huggingface/hub +``` + +#### Delete a cached model + +Use the Hugging Face CLI to interactively [delete downloaded models](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#clean-your-cache) from the cache: + +```console +# The `delete-cache` command requires extra dependencies to work with the TUI. +# Please run `pip install huggingface_hub[cli]` to install them. + +# Launch the interactive TUI to select models to delete +$ huggingface-cli delete-cache +? Select revisions to delete: 1 revisions selected counting for 438.9M. + ○ None of the following (if selected, nothing will be deleted). +Model BAAI/bge-base-en-v1.5 (438.9M, used 1 week ago) +❯ ◉ a5beb1e3: main # modified 1 week ago + +Model BAAI/bge-large-en-v1.5 (1.3G, used 1 week ago) + ○ d4aa6901: main # modified 1 week ago + +Model BAAI/bge-reranker-base (1.1G, used 4 weeks ago) + ○ 2cfc18c9: main # modified 4 weeks ago + +Press <space> to select, <enter> to validate and <ctrl+c> to quit without modification. + +# Need to confirm after selected +? 
Select revisions to delete: 1 revision(s) selected. +? 1 revisions selected counting for 438.9M. Confirm deletion ? Yes +Start deletion. +Done. Deleted 1 repo(s) and 0 revision(s) for a total of 438.9M. +``` + +#### Using a proxy + +Here are some tips for loading/downloading models from Hugging Face using a proxy: + +- Set the proxy globally for your session (or set it in the profile file): + +```shell +export http_proxy=http://your.proxy.server:port +export https_proxy=http://your.proxy.server:port +``` + +- Set the proxy for just the current command: + +```shell +https_proxy=http://your.proxy.server:port huggingface-cli download <model_name> + +# or use vllm cmd directly +https_proxy=http://your.proxy.server:port vllm serve <model_name> --disable-log-requests +``` + +- Set the proxy in the Python interpreter: + +```python +import os + +os.environ['http_proxy'] = 'http://your.proxy.server:port' +os.environ['https_proxy'] = 'http://your.proxy.server:port' +``` + +### ModelScope + +To use models from [ModelScope](https://www.modelscope.cn) instead of Hugging Face Hub, set an environment variable: + +```shell +export VLLM_USE_MODELSCOPE=True +``` + +And use with `trust_remote_code=True`. + +```python +from vllm import LLM + +llm = LLM(model=..., revision=..., task=..., trust_remote_code=True) + +# For generative models (task=generate) only +output = llm.generate("Hello, my name is") +print(output) + +# For pooling models (task={embed,classify,reward,score}) only +output = llm.encode("Hello, my name is") +print(output) +``` + +[](){ #feature-status-legend } + +## Feature Status Legend + +- ✅︎ indicates that the feature is supported for the model. + +- 🚧 indicates that the feature is planned but not yet supported for the model. + +- ⚠️ indicates that the feature is available but may have known issues or limitations. 
+ +[](){ #supported-text-models } + +## List of Text-only Language Models + +### Generative Models + +See [this page][generative-models] for more information on how to use generative models. + +#### Text Generation + +Specified using `--task generate`. + +| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | +|---------------------------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------| +| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | โœ…๏ธŽ | +| `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | โœ…๏ธŽ | โœ…๏ธŽ | +| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | โœ…๏ธŽ | +| `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | +| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | โœ…๏ธŽ | +| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. 
| โœ…๏ธŽ | โœ…๏ธŽ | +| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc. | | โœ…๏ธŽ | +| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc. | | โœ…๏ธŽ | +| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc. | | โœ…๏ธŽ | +| `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | โœ…๏ธŽ | +| `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | | โœ…๏ธŽ | +| `FalconH1ForCausalLM` | Falcon-H1 | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `GlmForCausalLM` | GLM-4 | `THUDM/glm-4-9b-chat-hf`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `Glm4ForCausalLM` | GLM-4-0414 | `THUDM/GLM-4-32B-0414`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | โœ…๏ธŽ | +| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | โœ…๏ธŽ | +| `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. 
| | โœ…๏ธŽ | +| `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | โœ…๏ธŽ | โœ…๏ธŽ | +| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | โœ…๏ธŽ | โœ…๏ธŽ | +| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | โœ…๏ธŽ | โœ…๏ธŽ | +| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | โœ…๏ธŽ | +| `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | โœ…๏ธŽ | +| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. 
| โœ…๏ธŽ | โœ…๏ธŽ | +| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | โœ…๏ธŽ | +| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | | โœ…๏ธŽ | +| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | | โœ…๏ธŽ | +| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | โœ…๏ธŽ | +| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | | โœ…๏ธŽ | +| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | โœ…๏ธŽ | +| `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `Phi3SmallForCausalLM` | Phi-3-Small | `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. | | โœ…๏ธŽ | +| `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. 
| | โœ…๏ธŽ | +| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | | +| `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | | โœ…๏ธŽ | +| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | | โœ…๏ธŽ | +| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | | +| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | โœ…๏ธŽ | +| `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | +| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | + +!!! note + Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. + +### Pooling Models + +See [this page](./pooling_models.md) for more information on how to use pooling models. + +!!! warning + Since some model architectures support both generative and pooling tasks, + you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. 
+ +#### Text Embedding + +Specified using `--task embed`. + +| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | +|--------------------------------------------------------|---------------------|---------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------| +| `BertModel` | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | +| `Gemma2Model` | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | โœ…๏ธŽ | | +| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | โœ…๏ธŽ | โœ…๏ธŽ | +| `GteModel` | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | ๏ธŽ | | +| `GteNewModel` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | ๏ธŽ | ๏ธŽ | +| `ModernBertModel` | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | ๏ธŽ | ๏ธŽ | +| `NomicBertModel` | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | ๏ธŽ | ๏ธŽ | +| `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `Qwen2Model`, `Qwen2ForCausalLM` | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | โœ…๏ธŽ | โœ…๏ธŽ | +| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | + +!!! note + `ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. + You need to manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`. + +!!! note + For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded. + See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882). + +!!! 
note + `jinaai/jina-embeddings-v3` supports multiple tasks through LoRA, while vLLM currently only supports text-matching tasks by merging LoRA weights. + +!!! note + The second-generation GTE model (mGTE-TRM) is named `NewModel`. The name `NewModel` is too generic, so you should set `--hf-overrides '{"architectures": ["GteNewModel"]}'` to specify the use of the `GteNewModel` architecture. + +If your model is not in the above list, we will try to automatically convert the model using +[as_embedding_model][vllm.model_executor.models.adapters.as_embedding_model]. By default, the embeddings +of the whole prompt are extracted from the normalized hidden state corresponding to the last token. + +#### Reward Modeling + +Specified using `--task reward`. + +| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | +|---------------------------|-----------------|------------------------------------------------------------------------|------------------------|-----------------------------| +| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | +| `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | +| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ | + +If your model is not in the above list, we will try to automatically convert the model using +[as_reward_model][vllm.model_executor.models.adapters.as_reward_model]. By default, we return the hidden states of each token directly. + +!!! warning + For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, + e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. + +#### Classification + +Specified using `--task classify`. 
+ +| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | +|----------------------------------|----------|----------------------------------------|------------------------|-----------------------------| +| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | + +If your model is not in the above list, we will try to automatically convert the model using +[as_classification_model][vllm.model_executor.models.adapters.as_classification_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. + +#### Sentence Pair Scoring + +Specified using `--task score`. + +| Architecture | Models | Example HF Models | +|---------------------------------------|-------------------|----------------------------------------------| +| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | +| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | +| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | + +[](){ #supported-mm-models } + +## List of Multimodal Language Models + +The following modalities are supported depending on the model: + +- **T**ext +- **I**mage +- **V**ideo +- **A**udio + +Any combination of modalities joined by `+` is supported. + +- e.g.: `T + I` means that the model supports text-only, image-only, and text-with-image inputs. + +On the other hand, modalities separated by `/` are mutually exclusive. + +- e.g.: `T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. + +See [this page][multimodal-inputs] on how to pass multi-modal inputs to the model. + +!!! warning + **To enable multiple multi-modal items per text prompt in vLLM V0**, you have to set `limit_mm_per_prompt` (offline inference) + or `--limit-mm-per-prompt` (online serving). 
For example, to enable passing up to 4 images per text prompt: + + Offline inference: + + ```python + from vllm import LLM + + llm = LLM( + model="Qwen/Qwen2-VL-7B-Instruct", + limit_mm_per_prompt={"image": 4}, + ) + ``` + + Online serving: + + ```bash + vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt '{"image":4}' + ``` + + **This is no longer required if you are using vLLM V1.** + +!!! note + vLLM currently only supports adding LoRA to the language backbone of multimodal models. + +### Generative Models + +See [this page][generative-models] for more information on how to use generative models. + +#### Text Generation + +Specified using `--task generate`. + +| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | +|----------------------------------------------|--------------------------------------------------------------------------|-----------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------|-----------------------| +| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | | โœ…๏ธŽ | +| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | โœ…๏ธŽ | โœ…๏ธŽ | +| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | โœ…๏ธŽ | โœ…๏ธŽ | +| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b` etc. | | โœ…๏ธŽ | โœ…๏ธŽ | +| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. 
| | โœ…๏ธŽ | โœ…๏ธŽ | +| `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large` etc. | | | | +| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b` etc. | | โœ…๏ธŽ | โœ…๏ธŽ | +| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | โš ๏ธ | +| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220` etc. | โœ…๏ธŽ | โœ…๏ธŽ | โœ…๏ธŽ | +| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | โœ…๏ธŽ | โœ…๏ธŽ | โœ…๏ธŽ | +| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | โœ…๏ธŽ | โœ…๏ธŽ\* | +| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3` etc. | โœ…๏ธŽ | | โœ…๏ธŽ | +| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | | โœ…๏ธŽ | โœ…๏ธŽ | +| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | โœ…๏ธŽ | +| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | โœ…๏ธŽ | โœ…๏ธŽ | +| `LlavaForConditionalGeneration` | LLaVA-1.5 | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. | | โœ…๏ธŽ | โœ…๏ธŽ | +| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. 
| | โœ…๏ธŽ | โœ…๏ธŽ | +| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | โœ…๏ธŽ | โœ…๏ธŽ | +| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I<sup>+</sup> + V<sup>+</sup> | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | โœ…๏ธŽ | โœ…๏ธŽ | +| `MiniCPMO` | MiniCPM-O | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup> | `openbmb/MiniCPM-o-2_6`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | โœ…๏ธŽ | +| `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. | โœ…๏ธŽ | | โœ…๏ธŽ | +| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | | โœ…๏ธŽ | | +| `Mistral3ForConditionalGeneration` | Mistral3 | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | โœ…๏ธŽ | +| `MllamaForConditionalGeneration` | Llama 3.2 | T + I<sup>+</sup> | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | | +| `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | โœ…๏ธŽ | +| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | โœ…๏ธŽ | โœ…๏ธŽ | +| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | โœ…๏ธŽ | โœ…๏ธŽ | +| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | โœ…๏ธŽ | โš ๏ธ | +| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. 
| | โœ…๏ธŽ | โœ…๏ธŽ | +| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | โœ…๏ธŽ | +| `PixtralForConditionalGeneration` | Pixtral | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc. | | โœ…๏ธŽ | โœ…๏ธŽ | +| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | โœ…๏ธŽ | +| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | โœ…๏ธŽ | โœ…๏ธŽ | +| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | โœ…๏ธŽ | +| `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | โœ…๏ธŽ | โœ…๏ธŽ | โœ…๏ธŽ | +| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-7B` | | โœ…๏ธŽ | โœ…๏ธŽ\* | +| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | โœ…๏ธŽ | โœ…๏ธŽ | +| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | โœ…๏ธŽ | | โœ…๏ธŽ | + +<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM. +    โ€ข For example, to use DeepSeek-VL2 series models: +      `--hf-overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` +<sup>E</sup> Pre-computed embeddings can be inputted for this modality. +<sup>+</sup> Multiple items can be inputted per text prompt for this modality. + +!!! warning + Both V0 and V1 support `Gemma3ForConditionalGeneration` for text-only inputs. 
+ However, there are differences in how they handle text + image inputs: + + V0 correctly implements the model's attention pattern: + - Uses bidirectional attention between the image tokens corresponding to the same image + - Uses causal attention for other tokens + - Implemented via (naive) PyTorch SDPA with masking tensors + - Note: May use significant memory for long prompts with images + + V1 currently uses a simplified attention pattern: + - Uses causal attention for all tokens, including image tokens + - Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": true}` is set + - Will be updated in the future to support the correct behavior + + This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends. + +!!! note + Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B`, etc.) currently supports video inputs. + +!!! note + `h2oai/h2ovl-mississippi-2b` will be available in V1 once we support head size 80. + +!!! note + To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. + +!!! warning + The output quality of `AllenAI/Molmo-7B-D-0924` (especially in object localization tasks) has deteriorated in recent updates.
+ + For the best results, we recommend using the following dependency versions (tested on A10 and L40): + + ```text + # Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40) + torch==2.5.1 + torchvision==0.20.1 + transformers==4.48.1 + tokenizers==0.21.0 + tiktoken==0.7.0 + vllm==0.7.0 + + # Optional but recommended for improved performance and stability + triton==3.1.0 + xformers==0.0.28.post3 + uvloop==0.21.0 + protobuf==5.29.3 + openai==1.60.2 + opencv-python-headless==4.11.0.86 + pillow==10.4.0 + + # Installed FlashAttention (for float16 only) + flash-attn>=2.5.6 # Not used in float32, but should be documented + ``` + + **Note:** Make sure you understand the security implications of using outdated packages. + +!!! note + The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now. + For more details, please see: <gh-pr:4087#issuecomment-2250397630> + +!!! warning + Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1. + +!!! note + To use Qwen2.5-Omni, you have to install the Hugging Face Transformers library from source via + `pip install git+https://github.com/huggingface/transformers.git`. + + Reading audio from video during pre-processing is currently supported on V0 (but not V1), because overlapping modalities are not yet supported in V1. + To enable it, pass `--mm-processor-kwargs '{"use_audio_in_video": true}'`. + +### Pooling Models + +See [this page](./pooling_models.md) for more information on how to use pooling models. + +!!! warning + Since some model architectures support both generative and pooling tasks, + you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. + +#### Text Embedding + +Specified using `--task embed`. + +Any text generation model can be converted into an embedding model by passing `--task embed`. + +!!!
note + To get the best results, you should use pooling models that are specifically trained as such. + +The following table lists those that are tested in vLLM. + +| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | +|-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------| +| `LlavaNextForConditionalGeneration` | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | | +| `Phi3VForCausalLM` | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | 🚧 | ✅︎ | + +#### Transcription + +Specified using `--task transcription`. + +Speech2Text models trained specifically for Automatic Speech Recognition. + +| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | +|----------------|----------|---------------------|------------------------|-----------------------------| + +--- + +## Model Support Policy + +At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support: + +1. **Community-Driven Support**: We encourage community contributions for adding new models. When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. **Call for contribution:** PRs coming directly from model vendors are greatly appreciated! + +2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible.
Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. + + !!! tip + When comparing the output of `model.generate` from Hugging Face Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. + +3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. + +4. **Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use. + +5. **Selective Focus**: Our resources are primarily directed towards models with significant user interest and impact. Models that are less frequently used may receive less attention, and we rely on the community to play a more active role in their upkeep and improvement. 
+ +Through this approach, vLLM fosters a collaborative environment where both the core development team and the broader community contribute to the robustness and diversity of the third-party models supported in our ecosystem. + +Note that, as an inference engine, vLLM does not introduce new models. Therefore, all models supported by vLLM are third-party models in this regard. + +We have the following levels of testing for models: + +1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to [models tests](https://github.com/vllm-project/vllm/blob/main/tests/models) for the models that have passed this test. +2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test. +3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](gh-dir:tests) and [examples](gh-dir:examples) for the models that have passed this test. +4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category. diff --git a/docs/seed_parameter_behavior.md b/docs/seed_parameter_behavior.md deleted file mode 100644 index ff17525cf8e2f..0000000000000 --- a/docs/seed_parameter_behavior.md +++ /dev/null @@ -1,51 +0,0 @@ -# Seed Parameter Behavior in vLLM - -## Overview - -The `seed` parameter in vLLM is used to control the random states for various random number generators. This parameter can affect the behavior of random operations in user code, especially when working with models in vLLM. 
- -## Default Behavior - -By default, the `seed` parameter is set to `None`. When the `seed` parameter is `None`, the global random states for `random`, `np.random`, and `torch.manual_seed` are not set. This means that the random operations will behave as expected, without any fixed random states. - -## Specifying a Seed - -If a specific seed value is provided, the global random states for `random`, `np.random`, and `torch.manual_seed` will be set accordingly. This can be useful for reproducibility, as it ensures that the random operations produce the same results across multiple runs. - -## Example Usage - -### Without Specifying a Seed - -```python -import random -from vllm import LLM - -# Initialize a vLLM model without specifying a seed -model = LLM(model="Qwen/Qwen2.5-0.5B-Instruct") - -# Try generating random numbers -print(random.randint(0, 100)) # Outputs different numbers across runs -``` - -### Specifying a Seed - -```python -import random -from vllm import LLM - -# Initialize a vLLM model with a specific seed -model = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", seed=42) - -# Try generating random numbers -print(random.randint(0, 100)) # Outputs the same number across runs -``` - -## Important Notes - -- If the `seed` parameter is not specified, the behavior of global random states remains unaffected. -- If a specific seed value is provided, the global random states for `random`, `np.random`, and `torch.manual_seed` will be set to that value. -- This behavior can be useful for reproducibility but may lead to non-intuitive behavior if the user is not explicitly aware of it. - -## Conclusion - -Understanding the behavior of the `seed` parameter in vLLM is crucial for ensuring the expected behavior of random operations in your code. By default, the `seed` parameter is set to `None`, which means that the global random states are not affected. However, specifying a seed value can help achieve reproducibility in your experiments. 
diff --git a/docs/source/serving/distributed_serving.md b/docs/serving/distributed_serving.md similarity index 73% rename from docs/source/serving/distributed_serving.md rename to docs/serving/distributed_serving.md index c285ef3e8e1c1..259af5cabcb8f 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/serving/distributed_serving.md @@ -1,6 +1,7 @@ -(distributed-serving)= - -# Distributed Inference and Serving +--- +title: Distributed Inference and Serving +--- +[](){ #distributed-serving } ## How to decide the distributed inference strategy? @@ -14,9 +15,8 @@ In short, you should increase the number of GPUs and the number of nodes until y After adding enough GPUs and nodes to hold the model, you can run vLLM first, which will print some logs like `# GPU blocks: 790`. Multiply the number by `16` (the block size), and you can get roughly the maximum number of tokens that can be served on the current configuration. If this number is not satisfying, e.g. you want higher throughput, you can further increase the number of GPUs or nodes, until the number of blocks is enough. -:::{note} -There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs. -::: +!!! note + There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs. ## Running vLLM on a single node @@ -77,13 +77,11 @@ bash run_cluster.sh \ Then you get a ray cluster of **containers**. 
Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. The IP addresses of each worker node should be specified in the `VLLM_HOST_IP` environment variable, and should be different for each worker node. Please check the network configuration of your cluster to make sure the nodes can communicate with each other through the specified IP addresses. -:::{warning} -It is considered best practice to set `VLLM_HOST_IP` to an address on a private network segment for the vLLM cluster. The traffic sent here is not encrypted. The endpoints are also exchanging data in a format that could be exploited to execute arbitrary code should a malicious party gain access to the network. Please ensure that this network is not reachable by any untrusted parties. -::: +!!! warning + It is considered best practice to set `VLLM_HOST_IP` to an address on a private network segment for the vLLM cluster. The traffic sent here is not encrypted. The endpoints are also exchanging data in a format that could be exploited to execute arbitrary code should a malicious party gain access to the network. Please ensure that this network is not reachable by any untrusted parties. -:::{warning} -Since this is a ray cluster of **containers**, all the following commands should be executed in the **containers**, otherwise you are executing the commands on the host machine, which is not connected to the ray cluster. To enter the container, you can use `docker exec -it node /bin/bash`. -::: +!!! warning + Since this is a ray cluster of **containers**, all the following commands should be executed in the **containers**, otherwise you are executing the commands on the host machine, which is not connected to the ray cluster. 
To enter the container, you can use `docker exec -it node /bin/bash`. Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` and `ray list nodes` to check the status of the Ray cluster. You should see the right number of nodes and GPUs. @@ -104,16 +102,13 @@ vllm serve /path/to/the/model/in/the/container \ To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. -:::{warning} -After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](#troubleshooting-incorrect-hardware-driver) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See <gh-issue:6803> for more information. -::: +!!! 
warning + After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script][troubleshooting-incorrect-hardware-driver] for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See <gh-issue:6803> for more information. -:::{warning} -Please make sure you downloaded the model to all the nodes (with the same path), or the model is downloaded to some distributed file system that is accessible by all nodes. +!!! warning + Please make sure you downloaded the model to all the nodes (with the same path), or the model is downloaded to some distributed file system that is accessible by all nodes. -When you use huggingface repo id to refer to the model, you should append your huggingface token to the `run_cluster.sh` script, e.g. `-e HF_TOKEN=`. The recommended way is to download the model first, and then use the path to refer to the model. -::: + When you use huggingface repo id to refer to the model, you should append your huggingface token to the `run_cluster.sh` script, e.g. `-e HF_TOKEN=`. The recommended way is to download the model first, and then use the path to refer to the model. -:::{warning} -If you keep receiving the error message `Error: No available node types can fulfill resource request` but you have enough GPUs in the cluster, chances are your nodes have multiple IP addresses and vLLM cannot find the right one, especially when you are using multi-node inference. Please make sure vLLM and ray use the same IP address. 
You can set the `VLLM_HOST_IP` environment variable to the right IP address in the `run_cluster.sh` script (different for each node!), and check `ray status` and `ray list nodes` to see the IP address used by Ray. See <gh-issue:7815> for more information. -::: +!!! warning + If you keep receiving the error message `Error: No available node types can fulfill resource request` but you have enough GPUs in the cluster, chances are your nodes have multiple IP addresses and vLLM cannot find the right one, especially when you are using multi-node inference. Please make sure vLLM and ray use the same IP address. You can set the `VLLM_HOST_IP` environment variable to the right IP address in the `run_cluster.sh` script (different for each node!), and check `ray status` and `ray list nodes` to see the IP address used by Ray. See <gh-issue:7815> for more information. diff --git a/docs/source/serving/integrations/langchain.md b/docs/serving/integrations/langchain.md similarity index 93% rename from docs/source/serving/integrations/langchain.md rename to docs/serving/integrations/langchain.md index 03142d23b145a..14ea6a0443415 100644 --- a/docs/source/serving/integrations/langchain.md +++ b/docs/serving/integrations/langchain.md @@ -1,6 +1,7 @@ -(serving-langchain)= - -# LangChain +--- +title: LangChain +--- +[](){ #serving-langchain } vLLM is also available via [LangChain](https://github.com/langchain-ai/langchain) . diff --git a/docs/source/serving/integrations/llamaindex.md b/docs/serving/integrations/llamaindex.md similarity index 91% rename from docs/source/serving/integrations/llamaindex.md rename to docs/serving/integrations/llamaindex.md index 8c72605202cf5..251b7155c5567 100644 --- a/docs/source/serving/integrations/llamaindex.md +++ b/docs/serving/integrations/llamaindex.md @@ -1,6 +1,7 @@ -(serving-llamaindex)= - -# LlamaIndex +--- +title: LlamaIndex +--- +[](){ #serving-llamaindex } vLLM is also available via [LlamaIndex](https://github.com/run-llama/llama_index) . 
diff --git a/docs/serving/offline_inference.md b/docs/serving/offline_inference.md new file mode 100644 index 0000000000000..b238199e41446 --- /dev/null +++ b/docs/serving/offline_inference.md @@ -0,0 +1,29 @@ +--- +title: Offline Inference +--- +[](){ #offline-inference } + +You can run vLLM in your own code on a list of prompts. + +The offline API is based on the [LLM][vllm.LLM] class. +To initialize the vLLM engine, create a new instance of `LLM` and specify the model to run. + +For example, the following code downloads the [`facebook/opt-125m`](https://huggingface.co/facebook/opt-125m) model from HuggingFace +and runs it in vLLM using the default configuration. + +```python +from vllm import LLM + +llm = LLM(model="facebook/opt-125m") +``` + +After initializing the `LLM` instance, you can perform model inference using various APIs. +The available APIs depend on the type of model that is being run: + +- [Generative models][generative-models] output logprobs which are sampled from to obtain the final output text. +- [Pooling models][pooling-models] output their hidden states directly. + +Please refer to the above pages for more details about each API. + +!!! info + [API Reference][offline-inference-api] diff --git a/docs/source/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md similarity index 60% rename from docs/source/serving/openai_compatible_server.md rename to docs/serving/openai_compatible_server.md index 61f7e98bf1088..c2e39d029dd5a 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -1,13 +1,16 @@ -(openai-compatible-server)= - -# OpenAI-Compatible Server +--- +title: OpenAI-Compatible Server +--- +[](){ #openai-compatible-server } vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more! 
This functionality lets you serve models and interact with them using an HTTP client. -In your terminal, you can [install](../getting_started/installation.md) vLLM, then start the server with the [`vllm serve`](#serve-args) command. (You can also use our [Docker](#deployment-docker) image.) +In your terminal, you can [install](../getting_started/installation/README.md) vLLM, then start the server with the [`vllm serve`][serve-args] command. (You can also use our [Docker][deployment-docker] image.) ```bash -vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 +vllm serve NousResearch/Meta-Llama-3-8B-Instruct \ + --dtype auto \ + --api-key token-abc123 ``` To call the server, in your preferred text editor, create a script that uses an HTTP client. Include any messages that you want to send to the model. Then run that script. Below is an example script using the [official OpenAI Python client](https://github.com/openai/openai-python). @@ -20,58 +23,56 @@ client = OpenAI( ) completion = client.chat.completions.create( - model="NousResearch/Meta-Llama-3-8B-Instruct", - messages=[ - {"role": "user", "content": "Hello!"} - ] + model="NousResearch/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "user", "content": "Hello!"} + ] ) print(completion.choices[0].message) ``` -:::{tip} -vLLM supports some parameters that are not supported by OpenAI, `top_k` for example. -You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, i.e. `extra_body={"top_k": 50}` for `top_k`. -::: +!!! tip + vLLM supports some parameters that are not supported by OpenAI, `top_k` for example. + You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, i.e. `extra_body={"top_k": 50}` for `top_k`. -:::{important} -By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. 
This means the default values of certain sampling parameters can be overridden by those recommended by the model creator. +!!! warning + By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator. -To disable this behavior, please pass `--generation-config vllm` when launching the server. -::: + To disable this behavior, please pass `--generation-config vllm` when launching the server. ## Supported APIs We currently support the following OpenAI APIs: -- [Completions API](#completions-api) (`/v1/completions`) - - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`). - - *Note: `suffix` parameter is not supported.* -- [Chat Completions API](#chat-api) (`/v1/chat/completions`) - - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`) with a [chat template](#chat-template). - - *Note: `parallel_tool_calls` and `user` parameters are ignored.* -- [Embeddings API](#embeddings-api) (`/v1/embeddings`) - - Only applicable to [embedding models](../models/pooling_models.md) (`--task embed`). -- [Transcriptions API](#transcriptions-api) (`/v1/audio/transcriptions`) - - Only applicable to Automatic Speech Recognition (ASR) models (OpenAI Whisper) (`--task generate`). +- [Completions API][completions-api] (`/v1/completions`) + - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`). + - *Note: `suffix` parameter is not supported.* +- [Chat Completions API][chat-api] (`/v1/chat/completions`) + - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`) with a [chat template][chat-template]. 
+ - *Note: `parallel_tool_calls` and `user` parameters are ignored.* +- [Embeddings API][embeddings-api] (`/v1/embeddings`) + - Only applicable to [embedding models](../models/pooling_models.md) (`--task embed`). +- [Transcriptions API][transcriptions-api] (`/v1/audio/transcriptions`) + - Only applicable to Automatic Speech Recognition (ASR) models (OpenAI Whisper) (`--task generate`). In addition, we have the following custom APIs: -- [Tokenizer API](#tokenizer-api) (`/tokenize`, `/detokenize`) - - Applicable to any model with a tokenizer. -- [Pooling API](#pooling-api) (`/pooling`) - - Applicable to all [pooling models](../models/pooling_models.md). -- [Classification API](#classification-api) (`/classify`) - - Only applicable to [classification models](../models/pooling_models.md) (`--task classify`). -- [Score API](#score-api) (`/score`) - - Applicable to embedding models and [cross-encoder models](../models/pooling_models.md) (`--task score`). -- [Re-rank API](#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`) - - Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/) - - Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank) - - Jina and Cohere's APIs are very similar; Jina's includes extra information in the rerank endpoint's response. - - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`). +- [Tokenizer API][tokenizer-api] (`/tokenize`, `/detokenize`) + - Applicable to any model with a tokenizer. +- [Pooling API][pooling-api] (`/pooling`) + - Applicable to all [pooling models](../models/pooling_models.md). +- [Classification API][classification-api] (`/classify`) + - Only applicable to [classification models](../models/pooling_models.md) (`--task classify`). +- [Score API][score-api] (`/score`) + - Applicable to embedding models and [cross-encoder models](../models/pooling_models.md) (`--task score`). 
+- [Re-rank API][rerank-api] (`/rerank`, `/v1/rerank`, `/v2/rerank`) + - Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/) + - Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank) + - Jina and Cohere's APIs are very similar; Jina's includes extra information in the rerank endpoint's response. + - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`). -(chat-template)= +[](){ #chat-template } ## Chat Template @@ -97,10 +98,10 @@ both a `type` and a `text` field. An example is provided below: ```python completion = client.chat.completions.create( - model="NousResearch/Meta-Llama-3-8B-Instruct", - messages=[ - {"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]} - ] + model="NousResearch/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]} + ] ) ``` @@ -111,9 +112,9 @@ request. vLLM provides best-effort support to detect this automatically, which i the detected format, which can be one of: - `"string"`: A string. - - Example: `"Hello world"` + - Example: `"Hello world"` - `"openai"`: A list of dictionaries, similar to OpenAI schema. - - Example: `[{"type": "text", "text": "Hello world!"}]` + - Example: `[{"type": "text", "text": "Hello world!"}]` If the result is not what you expect, you can set the `--chat-template-content-format` CLI argument to override which format to use. 
@@ -126,13 +127,13 @@ Or directly merge them into the JSON payload if you are using HTTP call directly ```python completion = client.chat.completions.create( - model="NousResearch/Meta-Llama-3-8B-Instruct", - messages=[ - {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} - ], - extra_body={ - "guided_choice": ["positive", "negative"] - } + model="NousResearch/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + ], + extra_body={ + "guided_choice": ["positive", "negative"] + } ) ``` @@ -148,29 +149,29 @@ with `--enable-request-id-headers`. ```python completion = client.chat.completions.create( - model="NousResearch/Meta-Llama-3-8B-Instruct", - messages=[ - {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} - ], - extra_headers={ - "x-request-id": "sentiment-classification-00001", - } + model="NousResearch/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + ], + extra_headers={ + "x-request-id": "sentiment-classification-00001", + } ) print(completion._request_id) completion = client.completions.create( - model="NousResearch/Meta-Llama-3-8B-Instruct", - prompt="A robot may not injure a human being", - extra_headers={ - "x-request-id": "completion-test", - } + model="NousResearch/Meta-Llama-3-8B-Instruct", + prompt="A robot may not injure a human being", + extra_headers={ + "x-request-id": "completion-test", + } ) print(completion._request_id) ``` ## API Reference -(completions-api)= +[](){ #completions-api } ### Completions API @@ -181,23 +182,19 @@ Code example: <gh-file:examples/online_serving/openai_completion_client.py> #### Extra parameters -The following [sampling parameters](#sampling-params) are supported. +The following [sampling parameters][sampling-params] are supported. 
-:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-completion-sampling-params -:end-before: end-completion-sampling-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:completion-sampling-params" +``` The following extra parameters are supported: -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-completion-extra-params -:end-before: end-completion-extra-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:completion-extra-params" +``` -(chat-api)= +[](){ #chat-api } ### Chat API @@ -206,37 +203,33 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai We support both [Vision](https://platform.openai.com/docs/guides/vision)- and [Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters; -see our [Multimodal Inputs](#multimodal-inputs) guide for more information. +see our [Multimodal Inputs][multimodal-inputs] guide for more information. - *Note: `image_url.detail` parameter is not supported.* Code example: <gh-file:examples/online_serving/openai_chat_completion_client.py> #### Extra parameters -The following [sampling parameters](#sampling-params) are supported. +The following [sampling parameters][sampling-params] are supported. 
-:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-chat-completion-sampling-params -:end-before: end-chat-completion-sampling-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-sampling-params" +``` The following extra parameters are supported: -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-chat-completion-extra-params -:end-before: end-chat-completion-extra-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params" +``` -(embeddings-api)= +[](){ #embeddings-api } ### Embeddings API Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings); you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. -If the model has a [chat template](#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api)) +If the model has a [chat template][chat-template], you can replace `inputs` with a list of `messages` (same schema as [Chat API][chat-api]) which will be treated as a single prompt to the model. Code example: <gh-file:examples/online_serving/openai_embedding_client.py> @@ -246,138 +239,121 @@ Code example: <gh-file:examples/online_serving/openai_embedding_client.py> You can pass multi-modal inputs to embedding models by defining a custom chat template for the server and passing a list of `messages` in the request. Refer to the examples below for illustration. 
-:::::{tab-set} -::::{tab-item} VLM2Vec +=== "VLM2Vec" -To serve the model: + To serve the model: -```bash -vllm serve TIGER-Lab/VLM2Vec-Full --task embed \ - --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja -``` + ```bash + vllm serve TIGER-Lab/VLM2Vec-Full --task embed \ + --trust-remote-code \ + --max-model-len 4096 \ + --chat-template examples/template_vlm2vec.jinja + ``` -:::{important} -Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--task embed` -to run this model in embedding mode instead of text generation mode. + !!! warning + Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--task embed` + to run this model in embedding mode instead of text generation mode. -The custom chat template is completely different from the original one for this model, -and can be found here: <gh-file:examples/template_vlm2vec.jinja> -::: + The custom chat template is completely different from the original one for this model, + and can be found here: <gh-file:examples/template_vlm2vec.jinja> -Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library: + Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library: -```python -import requests + ```python + import requests -image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" -response = requests.post( - "http://localhost:8000/v1/embeddings", - json={ - "model": "TIGER-Lab/VLM2Vec-Full", - "messages": [{ - "role": "user", - "content": [ - {"type": "image_url", "image_url": 
{"url": image_url}}, - {"type": "text", "text": "Represent the given image."}, - ], - }], - "encoding_format": "float", - }, -) -response.raise_for_status() -response_json = response.json() -print("Embedding output:", response_json["data"][0]["embedding"]) -``` + response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": "TIGER-Lab/VLM2Vec-Full", + "messages": [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Represent the given image."}, + ], + }], + "encoding_format": "float", + }, + ) + response.raise_for_status() + response_json = response.json() + print("Embedding output:", response_json["data"][0]["embedding"]) + ``` -:::: +=== "DSE-Qwen2-MRL" -::::{tab-item} DSE-Qwen2-MRL + To serve the model: -To serve the model: + ```bash + vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \ + --trust-remote-code \ + --max-model-len 8192 \ + --chat-template examples/template_dse_qwen2_vl.jinja + ``` -```bash -vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \ - --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja -``` + !!! warning + Like with VLM2Vec, we have to explicitly pass `--task embed`. -:::{important} -Like with VLM2Vec, we have to explicitly pass `--task embed`. + Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled + by a custom chat template: <gh-file:examples/template_dse_qwen2_vl.jinja> -Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled -by a custom chat template: <gh-file:examples/template_dse_qwen2_vl.jinja> -::: - -:::{important} -`MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code -example below for details. -::: - -:::: - -::::: + !!! 
warning + `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code + example below for details. Full example: <gh-file:examples/online_serving/openai_chat_embedding_client_for_multimodal.py> #### Extra parameters -The following [pooling parameters](#pooling-params) are supported. +The following [pooling parameters][pooling-params] are supported. -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-embedding-pooling-params -:end-before: end-embedding-pooling-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:embedding-pooling-params" +``` The following extra parameters are supported by default: -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-embedding-extra-params -:end-before: end-embedding-extra-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:embedding-extra-params" +``` For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead: -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-chat-embedding-extra-params -:end-before: end-chat-embedding-extra-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:chat-embedding-extra-params" +``` -(transcriptions-api)= +[](){ #transcriptions-api } ### Transcriptions API Our Transcriptions API is compatible with [OpenAI's Transcriptions API](https://platform.openai.com/docs/api-reference/audio/createTranscription); you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. -:::{note} -To use the Transcriptions API, please install with extra audio dependencies using `pip install vllm[audio]`. -::: +!!! note + To use the Transcriptions API, please install with extra audio dependencies using `pip install vllm[audio]`. 
Code example: <gh-file:examples/online_serving/openai_transcription_client.py> <!-- TODO: api enforced limits + uploading audios --> #### Extra Parameters -The following [sampling parameters](#sampling-params) are supported. +The following [sampling parameters][sampling-params] are supported. -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-transcription-sampling-params -:end-before: end-transcription-sampling-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params" +``` The following extra parameters are supported: -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-transcription-extra-params -:end-before: end-transcription-extra-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params" +``` -(tokenizer-api)= +[](){ #tokenizer-api } ### Tokenizer API @@ -387,17 +363,17 @@ It consists of two endpoints: - `/tokenize` corresponds to calling `tokenizer.encode()`. - `/detokenize` corresponds to calling `tokenizer.decode()`. -(pooling-api)= +[](){ #pooling-api } ### Pooling API Our Pooling API encodes input prompts using a [pooling model](../models/pooling_models.md) and returns the corresponding hidden states. -The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats. +The input format is the same as [Embeddings API][embeddings-api], but the output data can contain an arbitrary nested list, not just a 1-D list of floats. Code example: <gh-file:examples/online_serving/openai_pooling_client.py> -(classification-api)= +[](){ #classification-api } ### Classification API @@ -505,23 +481,19 @@ Response: #### Extra parameters -The following [pooling parameters](#pooling-params) are supported. +The following [pooling parameters][pooling-params] are supported. 
-:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-classification-pooling-params -:end-before: end-classification-pooling-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:classification-pooling-params" +``` The following extra parameters are supported: -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-classification-extra-params -:end-before: end-classification-extra-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:classification-extra-params" +``` -(score-api)= +[](){ #score-api } ### Score API @@ -668,23 +640,19 @@ Response: #### Extra parameters -The following [pooling parameters](#pooling-params) are supported. +The following [pooling parameters][pooling-params] are supported. -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-score-pooling-params -:end-before: end-score-pooling-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:score-pooling-params" +``` The following extra parameters are supported: -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-score-extra-params -:end-before: end-score-extra-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:score-extra-params" +``` -(rerank-api)= +[](){ #rerank-api } ### Re-rank API @@ -755,18 +723,14 @@ Response: #### Extra parameters -The following [pooling parameters](#pooling-params) are supported. +The following [pooling parameters][pooling-params] are supported. 
-:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-rerank-pooling-params -:end-before: end-rerank-pooling-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:rerank-pooling-params" +``` The following extra parameters are supported: -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-rerank-extra-params -:end-before: end-rerank-extra-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:rerank-extra-params" +``` diff --git a/docs/source/_static/custom.css b/docs/source/_static/custom.css deleted file mode 100644 index 79bd2082b49e8..0000000000000 --- a/docs/source/_static/custom.css +++ /dev/null @@ -1,8 +0,0 @@ -.vertical-table-header th.head:not(.stub) { - writing-mode: sideways-lr; - white-space: nowrap; - max-width: 0; - p { - margin: 0; - } -} diff --git a/docs/source/_templates/sections/header.html b/docs/source/_templates/sections/header.html deleted file mode 100644 index 7174431b10272..0000000000000 --- a/docs/source/_templates/sections/header.html +++ /dev/null @@ -1,39 +0,0 @@ -<style> - .notification-bar { - width: 100vw; - display: flex; - justify-content: center; - align-items: center; - font-size: 16px; - padding: 0 6px 0 6px; - } - .notification-bar p { - margin: 0; - } - .notification-bar a { - font-weight: bold; - text-decoration: none; - } - - /* Light mode styles (default) */ - .notification-bar { - background-color: #fff3cd; - color: #856404; - } - .notification-bar a { - color: #d97706; - } - - /* Dark mode styles */ - html[data-theme=dark] .notification-bar { - background-color: #333; - color: #ddd; - } - html[data-theme=dark] .notification-bar a { - color: #ffa500; /* Brighter color for visibility */ - } -</style> - -<div class="notification-bar"> - <p>You are viewing the latest developer preview docs. 
<a href="https://docs.vllm.ai/en/stable/">Click here</a> to view docs for the latest stable release.</p> -</div> diff --git a/docs/source/api/summary.md b/docs/source/api/summary.md deleted file mode 100644 index 46de545f9ded4..0000000000000 --- a/docs/source/api/summary.md +++ /dev/null @@ -1,133 +0,0 @@ -# Summary - -(configuration)= - -## Configuration - -API documentation for vLLM's configuration classes. - -```{autodoc2-summary} - vllm.config.ModelConfig - vllm.config.CacheConfig - vllm.config.TokenizerPoolConfig - vllm.config.LoadConfig - vllm.config.ParallelConfig - vllm.config.SchedulerConfig - vllm.config.DeviceConfig - vllm.config.SpeculativeConfig - vllm.config.LoRAConfig - vllm.config.PromptAdapterConfig - vllm.config.MultiModalConfig - vllm.config.PoolerConfig - vllm.config.DecodingConfig - vllm.config.ObservabilityConfig - vllm.config.KVTransferConfig - vllm.config.CompilationConfig - vllm.config.VllmConfig -``` - -(offline-inference-api)= - -## Offline Inference - -LLM Class. - -```{autodoc2-summary} - vllm.LLM -``` - -LLM Inputs. - -```{autodoc2-summary} - vllm.inputs.PromptType - vllm.inputs.TextPrompt - vllm.inputs.TokensPrompt -``` - -## vLLM Engines - -Engine classes for offline and online inference. - -```{autodoc2-summary} - vllm.LLMEngine - vllm.AsyncLLMEngine -``` - -## Inference Parameters - -Inference parameters for vLLM APIs. - -(sampling-params)= -(pooling-params)= - -```{autodoc2-summary} - vllm.SamplingParams - vllm.PoolingParams -``` - -(multi-modality)= - -## Multi-Modality - -vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package. - -Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models) -via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. - -Looking to add your own multi-modal model? Please follow the instructions listed [here](#supports-multimodal). 
- -```{autodoc2-summary} - vllm.multimodal.MULTIMODAL_REGISTRY -``` - -### Inputs - -User-facing inputs. - -```{autodoc2-summary} - vllm.multimodal.inputs.MultiModalDataDict -``` - -Internal data structures. - -```{autodoc2-summary} - vllm.multimodal.inputs.PlaceholderRange - vllm.multimodal.inputs.NestedTensors - vllm.multimodal.inputs.MultiModalFieldElem - vllm.multimodal.inputs.MultiModalFieldConfig - vllm.multimodal.inputs.MultiModalKwargsItem - vllm.multimodal.inputs.MultiModalKwargs - vllm.multimodal.inputs.MultiModalInputs -``` - -### Data Parsing - -```{autodoc2-summary} - vllm.multimodal.parse -``` - -### Data Processing - -```{autodoc2-summary} - vllm.multimodal.processing -``` - -### Memory Profiling - -```{autodoc2-summary} - vllm.multimodal.profiling -``` - -### Registry - -```{autodoc2-summary} - vllm.multimodal.registry -``` - -## Model Development - -```{autodoc2-summary} - vllm.model_executor.models.interfaces_base - vllm.model_executor.models.interfaces - vllm.model_executor.models.adapters -``` diff --git a/docs/source/autodoc2_docstring_parser.py b/docs/source/autodoc2_docstring_parser.py deleted file mode 100644 index 41c49ed1c545a..0000000000000 --- a/docs/source/autodoc2_docstring_parser.py +++ /dev/null @@ -1,21 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -from docutils import nodes -from myst_parser.parsers.sphinx_ import MystParser -from sphinx.ext.napoleon import docstring - - -class NapoleonParser(MystParser): - - def parse(self, input_string: str, document: nodes.document) -> None: - # Get the Sphinx configuration - config = document.settings.env.config - - parsed_content = str( - docstring.GoogleDocstring( - str(docstring.NumpyDocstring(input_string, config)), - config, - )) - return super().parse(parsed_content, document) - - -Parser = NapoleonParser diff --git a/docs/source/community/blog.md b/docs/source/community/blog.md deleted file mode 100644 index e8030edfa02ee..0000000000000 --- a/docs/source/community/blog.md +++ /dev/null 
@@ -1,3 +0,0 @@ -# vLLM Blog - -vLLM blog posts are published [here](https://blog.vllm.ai/). diff --git a/docs/source/conf.py b/docs/source/conf.py deleted file mode 100644 index 5620d6de2c59b..0000000000000 --- a/docs/source/conf.py +++ /dev/null @@ -1,263 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. - -import datetime -import logging -import os -import re -import sys -from pathlib import Path - -import requests - -logger = logging.getLogger(__name__) -REPO_ROOT = Path(__file__).resolve().parent.parent.parent -sys.path.append(os.path.abspath(REPO_ROOT)) - -# -- Project information ----------------------------------------------------- - -project = 'vLLM' -copyright = f'{datetime.datetime.now().year}, vLLM Team' -author = 'the vLLM Team' - -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. 
-extensions = [ - "sphinx.ext.napoleon", - "sphinx.ext.linkcode", - "sphinx.ext.intersphinx", - "sphinx_copybutton", - "autodoc2", - "myst_parser", - "sphinxarg.ext", - "sphinx_design", - "sphinx_togglebutton", -] -myst_enable_extensions = [ - "colon_fence", - "fieldlist", -] -autodoc2_packages = [ - { - "path": "../../vllm", - "exclude_dirs": ["__pycache__", "third_party"], - }, -] -autodoc2_output_dir = "api" -autodoc2_render_plugin = "myst" -autodoc2_hidden_objects = ["dunder", "private", "inherited"] -autodoc2_sort_names = True -autodoc2_index_template = None - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns: list[str] = ["**/*.template.md", "**/*.inc.md"] - -# Exclude the prompt "$" when copying code -copybutton_prompt_text = r"\$ " -copybutton_prompt_is_regexp = True - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_title = project -html_theme = 'sphinx_book_theme' -html_logo = 'assets/logos/vllm-logo-text-light.png' -html_favicon = 'assets/logos/vllm-logo-only-light.ico' -html_theme_options = { - 'path_to_docs': 'docs/source', - 'repository_url': 'https://github.com/vllm-project/vllm', - 'use_repository_button': True, - 'use_edit_page_button': True, - # Prevents the full API being added to the left sidebar of every page. - # Reduces build time by 2.5x and reduces build size from ~225MB to ~95MB. - 'collapse_navbar': True, - # Makes API visible in the right sidebar on API reference pages. - 'show_toc_level': 3, -} -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. 
They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ["_static"] -html_js_files = ["custom.js"] -html_css_files = ["custom.css"] - -myst_heading_anchors = 2 -myst_url_schemes = { - 'http': None, - 'https': None, - 'mailto': None, - 'ftp': None, - "gh-issue": { - "url": - "https://github.com/vllm-project/vllm/issues/{{path}}#{{fragment}}", - "title": "Issue #{{path}}", - "classes": ["github"], - }, - "gh-pr": { - "url": - "https://github.com/vllm-project/vllm/pull/{{path}}#{{fragment}}", - "title": "Pull Request #{{path}}", - "classes": ["github"], - }, - "gh-project": { - "url": "https://github.com/orgs/vllm-project/projects/{{path}}", - "title": "Project #{{path}}", - "classes": ["github"], - }, - "gh-dir": { - "url": "https://github.com/vllm-project/vllm/tree/main/{{path}}", - "title": "{{path}}", - "classes": ["github"], - }, - "gh-file": { - "url": "https://github.com/vllm-project/vllm/blob/main/{{path}}", - "title": "{{path}}", - "classes": ["github"], - }, -} - -# see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa -READTHEDOCS_VERSION_TYPE = os.environ.get('READTHEDOCS_VERSION_TYPE') -if READTHEDOCS_VERSION_TYPE == "tag": - # remove the warning banner if the version is a tagged release - header_file = os.path.join(os.path.dirname(__file__), - "_templates/sections/header.html") - # The file might be removed already if the build is triggered multiple times - # (readthedocs build both HTML and PDF versions separately) - if os.path.exists(header_file): - os.remove(header_file) - - -# Generate additional rst documentation here. 
-def setup(app): - from docs.source.generate_examples import generate_examples - generate_examples() - - -_cached_base: str = "" -_cached_branch: str = "" - - -def get_repo_base_and_branch(pr_number): - global _cached_base, _cached_branch - if _cached_base and _cached_branch: - return _cached_base, _cached_branch - - url = f"https://api.github.com/repos/vllm-project/vllm/pulls/{pr_number}" - response = requests.get(url) - if response.status_code == 200: - data = response.json() - _cached_base = data['head']['repo']['full_name'] - _cached_branch = data['head']['ref'] - return _cached_base, _cached_branch - else: - logger.error("Failed to fetch PR details: %s", response) - return None, None - - -def linkcode_resolve(domain, info): - if domain != 'py': - return None - if not info['module']: - return None - - # Get path from module name - file = Path(f"{info['module'].replace('.', '/')}.py") - path = REPO_ROOT / file - if not path.exists(): - path = REPO_ROOT / file.with_suffix("") / "__init__.py" - if not path.exists(): - return None - - # Get the line number of the object - with open(path) as f: - lines = f.readlines() - name = info['fullname'].split(".")[-1] - pattern = fr"^( {{4}})*((def|class) )?{name}\b.*" - for lineno, line in enumerate(lines, 1): - if not line or line.startswith("#"): - continue - if re.match(pattern, line): - break - - # If the line number is not found, return None - if lineno == len(lines): - return None - - # If the line number is found, create the URL - filename = path.relative_to(REPO_ROOT) - if "checkouts" in path.parts: - # a PR build on readthedocs - pr_number = REPO_ROOT.name - base, branch = get_repo_base_and_branch(pr_number) - if base and branch: - return f"https://github.com/{base}/blob/{branch}/{filename}#L{lineno}" - # Otherwise, link to the source file on the main branch - return f"https://github.com/vllm-project/vllm/blob/main/{filename}#L{lineno}" - - -# Mock out external dependencies here, otherwise sphinx-argparse won't 
work. -autodoc_mock_imports = [ - "huggingface_hub", - "pydantic", - "zmq", - "cloudpickle", - "aiohttp", - "starlette", - "blake3", - "cpuinfo", - "transformers", - "psutil", - "vllm._C", - "PIL", - "numpy", - "tqdm", - # The mocks below are required by - # docs/source/serving/openai_compatible_server.md's - # vllm.entrypoints.openai.cli_args - "openai", - "fastapi", - "partial_json_parser", -] - -for mock_target in autodoc_mock_imports: - if mock_target in sys.modules: - logger.info( - "Potentially problematic mock target (%s) found; " - "autodoc_mock_imports cannot mock modules that have already " - "been loaded into sys.modules when the sphinx build starts.", - mock_target) - -intersphinx_mapping = { - "python": ("https://docs.python.org/3", None), - "typing_extensions": - ("https://typing-extensions.readthedocs.io/en/latest", None), - "aiohttp": ("https://docs.aiohttp.org/en/stable", None), - "pillow": ("https://pillow.readthedocs.io/en/stable", None), - "numpy": ("https://numpy.org/doc/stable", None), - "torch": ("https://pytorch.org/docs/stable", None), - "psutil": ("https://psutil.readthedocs.io/en/stable", None), -} - -navigation_with_keys = False diff --git a/docs/source/contributing/model/index.md b/docs/source/contributing/model/index.md deleted file mode 100644 index 721ee3cd2047c..0000000000000 --- a/docs/source/contributing/model/index.md +++ /dev/null @@ -1,27 +0,0 @@ -(new-model)= - -# Adding a New Model - -This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM. - -:::{toctree} -:caption: Contents -:maxdepth: 1 - -basic -registration -tests -multimodal -::: - -:::{note} -The complexity of adding a new model depends heavily on the model's architecture. -The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. -However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. 
-::: - -:::{tip} -If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues) -or ask on our [developer slack](https://slack.vllm.ai). -We will be happy to help you out! -::: diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md deleted file mode 100644 index b42536f054d76..0000000000000 --- a/docs/source/contributing/model/multimodal.md +++ /dev/null @@ -1,834 +0,0 @@ -(supports-multimodal)= - -# Multi-Modal Support - -This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs](#multimodal-inputs). - -## 1. Update the base vLLM model - -It is assumed that you have already implemented the model in vLLM according to [these steps](#new-model-basic). -Further update the model as follows: - -- Reserve a keyword parameter in {meth}`~torch.nn.Module.forward` for each input tensor that corresponds to a multi-modal input, as shown in the following example: - - ```diff - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - + pixel_values: torch.Tensor, - ) -> SamplerOutput: - ``` - - More conveniently, you can simply pass `**kwargs` to the {meth}`~torch.nn.Module.forward` method and retrieve the keyword parameters for multimodal inputs from it. - -- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings` that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs. - - ```python - class YourModelForImage2Seq(nn.Module): - ... 
- - def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor: - - assert self.vision_encoder is not None - image_features = self.vision_encoder(image_input) - return self.multi_modal_projector(image_features) - - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: - - # Validate the multimodal input keyword arguments - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input is None: - return None - - # Run multimodal inputs through encoder and projector - vision_embeddings = self._process_image_input(image_input) - return vision_embeddings - ``` - - :::{important} - The returned `multimodal_embeddings` must be either a **3D {class}`torch.Tensor`** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D {class}`torch.Tensor`'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request. - ::: - -- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings` to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings. - - ```python - from .utils import merge_multimodal_embeddings - - class YourModelForImage2Seq(nn.Module): - ... - - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - - # `get_input_embeddings` should already be implemented for the language - # model as one of the requirements of basic vLLM model implementation. 
- inputs_embeds = self.language_model.get_input_embeddings(input_ids) - - if multimodal_embeddings is not None: - inputs_embeds = merge_multimodal_embeddings( - input_ids=input_ids, - inputs_embeds=inputs_embeds, - multimodal_embeddings=multimodal_embeddings, - placeholder_token_id=self.config.image_token_index) - - return inputs_embeds - ``` - -- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model` getter to provide stable access to the underlying language model. - - ```python - class YourModelForImage2Seq(nn.Module): - ... - - def get_language_model(self) -> torch.nn.Module: - # Change `language_model` according to your implementation. - return self.language_model - ``` - -- Once the above steps are done, update the model class with the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. - - ```diff - + from vllm.model_executor.models.interfaces import SupportsMultiModal - - - class YourModelForImage2Seq(nn.Module): - + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): - ``` - - :::{note} - The model class does not have to be named {code}`*ForCausalLM`. - Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples. - ::: - -## 2. Specify processing information - -Next, create a subclass of {class}`~vllm.multimodal.processing.BaseProcessingInfo` -to provide basic information related to HF processing. - -### Maximum number of input items - -You need to override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_supported_mm_limits` -to return the maximum number of input items for each modality supported by the model. - -For example, if the model supports any number of images but only one video per prompt: - -```python -def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None, "video": 1} -``` - -## 3. 
Specify dummy inputs - -Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for -HF processing as well as memory profiling. - -### For memory profiling - -Override the abstract methods {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text` and {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_mm_data` to construct dummy inputs for memory profiling. These dummy inputs should result in the worst-case memory usage of the model so that vLLM can reserve the correct amount of memory for it. - -Assuming that the memory usage increases with the number of tokens, the dummy inputs can be constructed to maximize the number of output embeddings, which is the same number as placeholder feature tokens. - -::::{tab-set} -:::{tab-item} Basic example: LLaVA -:sync: llava - -Looking at the code of HF's `LlavaForConditionalGeneration`: - -```python -# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544 -n_image_tokens = (input_ids == self.config.image_token_index).sum().item() -n_image_features = image_features.shape[0] * image_features.shape[1] - -if n_image_tokens != n_image_features: - raise ValueError( - f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" - ) -special_image_mask = ( - (input_ids == self.config.image_token_index) - .unsqueeze(-1) - .expand_as(inputs_embeds) - .to(inputs_embeds.device) -) -image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) -inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) -``` - -The number of placeholder feature tokens per image is `image_features.shape[1]`. 
-`image_features` is calculated inside the `get_image_features` method: - -```python -# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300 -image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) - -selected_image_feature = image_outputs.hidden_states[vision_feature_layer] -if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] -elif vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature -else: - raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") -image_features = self.multi_modal_projector(selected_image_feature) -return image_features -``` - -We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower -(`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model). -Moreover, we only need the sequence length (the second dimension of the tensor) to get `image_features.shape[1]`. -The sequence length is determined by the initial hidden states in `CLIPVisionTransformer` since the attention -mechanism doesn't change the sequence length of the output hidden states. 
- -```python -# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L1094-L1102 -hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) -hidden_states = self.pre_layrnorm(hidden_states) - -encoder_outputs = self.encoder( - inputs_embeds=hidden_states, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, -) -``` - -To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`: - -```python -# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257 -target_dtype = self.patch_embedding.weight.dtype -patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] -patch_embeds = patch_embeds.flatten(2).transpose(1, 2) - -class_embeds = self.class_embedding.expand(batch_size, 1, -1) -embeddings = torch.cat([class_embeds, patch_embeds], dim=1) -if interpolate_pos_encoding: - embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) -else: - embeddings = embeddings + self.position_embedding(self.position_ids) -return embeddings -``` - -We can infer that `embeddings.shape[1] == self.num_positions`, where - -```python -# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L195-L196 -self.num_patches = (self.image_size // self.patch_size) ** 2 -self.num_positions = self.num_patches + 1 -``` - -Overall, the number of placeholder feature tokens for an image can be calculated as: - -```python -def get_num_image_tokens( - self, - *, - image_width: int, - image_height: int, -) -> int: - hf_config = self.get_hf_config() - hf_processor = self.get_hf_processor() - - image_size = hf_config.vision_config.image_size - patch_size = hf_config.vision_config.patch_size - - num_image_tokens = (image_size // patch_size) ** 2 + 1 - if 
hf_processor.vision_feature_select_strategy == "default": - num_image_tokens -= 1 - - return num_image_tokens -``` - -Notice that the number of image tokens doesn't depend on the image width and height. -We can simply use a dummy `image_size` to calculate the multimodal profiling data: - -```python -# NOTE: In actuality, this is usually implemented as part of the -# model's subclass of `BaseProcessingInfo`, but we show it as is -# here for simplicity. -def get_image_size_with_most_features(self) -> ImageSize: - hf_config = self.get_hf_config() - width = height = hf_config.image_size - return ImageSize(width=width, height=height) - -def get_dummy_mm_data( - self, - seq_len: int, - mm_counts: Mapping[str, int], -) -> MultiModalDataDict: - num_images = mm_counts.get("image", 0) - - target_width, target_height = \ - self.info.get_image_size_with_most_features() - - return { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) - } -``` - -For the text, we simply expand the multimodal image token from the model config to match the desired number of images. 
- -```python -def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: - num_images = mm_counts.get("image", 0) - - processor = self.info.get_hf_processor() - image_token = processor.image_token - - return image_token * num_images -``` - -::: - -:::{tab-item} No input placeholders: Fuyu -:sync: fuyu - -Looking at the code of HF's `FuyuForCausalLM`: - -```python -# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322 -if image_patches is not None and past_key_values is None: - patch_embeddings = [ - self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype)) - .squeeze(0) - .to(inputs_embeds.device) - for patch in image_patches - ] - inputs_embeds = self.gather_continuous_embeddings( - word_embeddings=inputs_embeds, - continuous_embeddings=patch_embeddings, - image_patch_input_indices=image_patches_indices, - ) -``` - -The number of placeholder feature tokens for the `i`th item in the batch is `patch_embeddings[i].shape[0]`, -which is the same as `image_patches[i].shape[0]`, i.e. `num_total_patches`. - -Unlike LLaVA, Fuyu does not define the number of patches inside the modeling file. Where can we get more information? -Considering that the model input comes from the output of `FuyuProcessor`, let's **look at the preprocessing files**. - -The image outputs are obtained by calling `FuyuImageProcessor.preprocess` and then -`FuyuImageProcessor.preprocess_with_tokenizer_info` inside `FuyuProcessor`. - -In `FuyuImageProcessor.preprocess`, the images are resized and padded to the target `FuyuImageProcessor.size`, -returning the dimensions after resizing (but before padding) as metadata. 
- -```python -# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544 -image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"]) -batch_images = image_encoding["images"] -image_unpadded_heights = image_encoding["image_unpadded_heights"] -image_unpadded_widths = image_encoding["image_unpadded_widths"] - -# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L -if do_resize: - batch_images = [ - [self.resize(image, size=size, input_data_format=input_data_format) for image in images] - for images in batch_images - ] - -image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images] -image_unpadded_heights = [[image_size[0]] for image_size in image_sizes] -image_unpadded_widths = [[image_size[1]] for image_size in image_sizes] - -if do_pad: - batch_images = [ - [ - self.pad_image( - image, - size=size, - mode=padding_mode, - constant_values=padding_value, - input_data_format=input_data_format, - ) - for image in images - ] - for images in batch_images - ] -``` - -In `FuyuImageProcessor.preprocess_with_tokenizer_info`, the images are split into patches based on this metadata: - -```python -# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425 -model_image_input = self.image_processor.preprocess_with_tokenizer_info( - image_input=tensor_batch_images, - image_present=image_present, - image_unpadded_h=image_unpadded_heights, - image_unpadded_w=image_unpadded_widths, - image_placeholder_id=image_placeholder_id, - image_newline_id=image_newline_id, - variable_sized=True, -) - -# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658 -image_height, image_width = image.shape[1], image.shape[2] -if variable_sized: # variable_sized=True - new_h = min( - 
image_height, - math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height, - ) - new_w = min( - image_width, - math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width, - ) - image = image[:, :new_h, :new_w] - image_height, image_width = new_h, new_w - -num_patches = self.get_num_patches(image_height=image_height, image_width=image_width) -tensor_of_image_ids = torch.full( - [num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device -) -patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0) -assert num_patches == patches.shape[0] -``` - -The number of patches is in turn defined by `FuyuImageProcessor.get_num_patches`: - -```python -# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562 -patch_size = patch_size if patch_size is not None else self.patch_size -patch_height, patch_width = self.patch_size["height"], self.patch_size["width"] - -if image_height % patch_height != 0: - raise ValueError(f"{image_height=} must be divisible by {patch_height}") -if image_width % patch_width != 0: - raise ValueError(f"{image_width=} must be divisible by {patch_width}") - -num_patches_per_dim_h = image_height // patch_height -num_patches_per_dim_w = image_width // patch_width -num_patches = num_patches_per_dim_h * num_patches_per_dim_w -``` - -These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized -to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`. 
- -```python -def get_image_size_with_most_features(self) -> ImageSize: - image_processor = self.get_image_processor() - return ImageSize(width=image_processor.size["width"], - height=image_processor.size["height"]) -``` - -Fuyu does not expect image placeholders in the inputs to HF processor, so -the dummy prompt text is empty regardless of the number of images. - -```python -def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: - return "" -``` - -For the multimodal image profiling data, the logic is very similar to LLaVA: - -```python -def get_dummy_mm_data( - self, - seq_len: int, - mm_counts: Mapping[str, int], -) -> MultiModalDataDict: - target_width, target_height = \ - self.info.get_image_size_with_most_features() - num_images = mm_counts.get("image", 0) - - return { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) - } -``` - -::: - -:::: - -## 4. Specify processing details - -Afterwards, create a subclass of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` -to fill in the missing details about HF processing. - -:::{seealso} -[Multi-Modal Data Processing](#mm-processing) -::: - -### Multi-modal fields - -Override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` to -return a schema of the tensors outputted by the HF processor that are related to the input multi-modal items. 
- -:::::{tab-set} -::::{tab-item} Basic example: LLaVA -:sync: llava - -The output of `CLIPImageProcessor` is a simple tensor with shape -`(num_images, num_channels, image_height, image_width)`: - -```python -# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/image_processing_clip.py#L339-L345 -images = [ - to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) - for image in all_images -] - -data = {"pixel_values": images} -return BatchFeature(data=data, tensor_type=return_tensors) -``` - -So, we override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` as follows: - -```python -def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], -) -> Mapping[str, MultiModalFieldConfig]: - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - ) -``` - -:::{note} -Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports -pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument. -::: - -:::: - -::::{tab-item} With postprocessing: Fuyu -:sync: fuyu - -The `image_patches` output of `FuyuImageProcessor.preprocess_with_tokenizer_info` concatenates -the patches from each image belonging to an item in the batch: - -```python -# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L673-L679 - image_input_ids.append(tensor_of_image_ids) - image_patches.append(patches) - else: - image_input_ids.append(torch.tensor([], dtype=torch.int32, device=image_input.device)) - -batch_image_input_ids.append(image_input_ids) -batch_image_patches.append(image_patches) -``` - -The shape of `image_patches` outputted by `FuyuImageProcessor` is therefore -`(1, num_images, num_patches, patch_width * patch_height * num_channels)`. 
- -In order to support the use of {func}`MultiModalFieldConfig.batched` like in LLaVA, -we remove the extra batch dimension by overriding {meth}`BaseMultiModalProcessor._call_hf_processor`: - -```python -def _call_hf_processor( - self, - prompt: str, - mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object], -) -> BatchFeature: - processed_outputs = super()._call_hf_processor( - prompt=prompt, - mm_data=mm_data, - mm_kwargs=mm_kwargs, - ) - - image_patches = processed_outputs.get("image_patches") - if image_patches is not None: - images = mm_data["images"] - assert isinstance(images, list) - - # Original output: (1, num_images, Pn, Px * Py * C) - # New output: (num_images, Pn, Px * Py * C) - assert (isinstance(image_patches, list) - and len(image_patches) == 1) - assert (isinstance(image_patches[0], torch.Tensor) - and len(image_patches[0]) == len(images)) - - processed_outputs["image_patches"] = image_patches[0] - - return processed_outputs -``` - -:::{note} -Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling -for text-only inputs to prevent unnecessary warnings from HF processor. -::: - -This lets us override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` as follows: - -```python -def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], -) -> Mapping[str, MultiModalFieldConfig]: - return dict(image_patches=MultiModalFieldConfig.batched("image")) -``` - -:::: - -::::: - -### Prompt updates - -Override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` to -return a list of {class}`~vllm.multimodal.processing.PromptUpdate` instances. - -Each {class}`~vllm.multimodal.processing.PromptUpdate` instance specifies an update operation -(e.g.: insertion, replacement) performed by the HF processor. 
- -::::{tab-set} -:::{tab-item} Basic example: LLaVA -:sync: llava - -Looking at HF's `LlavaProcessor`: - -```python -# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/processing_llava.py#L167-L170 -prompt_strings = [] -for sample in text: - sample = sample.replace(self.image_token, self.image_token * num_image_tokens) - prompt_strings.append(sample) -``` - -It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`). -Based on this, we override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` as follows: - -```python -def _get_prompt_updates( - self, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, -) -> Sequence[PromptUpdate]: - hf_config = self.info.get_hf_config() - image_token_id = hf_config.image_token_index - - def get_replacement(item_idx: int): - images = mm_items.get_items("image", ImageProcessorItems) - - image_size = images.get_image_size(item_idx) - num_image_tokens = self.info.get_num_image_tokens( - image_width=image_size.width, - image_height=image_size.height, - ) - - return [image_token_id] * num_image_tokens - - return [ - PromptReplacement( - modality="image", - target=[image_token_id], - replacement=get_replacement, - ), - ] -``` - -::: - -:::{tab-item} Handling additional tokens: Fuyu -:sync: fuyu - -Recall the layout of feature tokens from Step 2: - -``` -|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE| -|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE| -... 
-|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE| -``` - -We define a helper function to return `ncols` and `nrows` directly: - -```python -def get_image_feature_grid_size( - self, - *, - image_width: int, - image_height: int, -) -> tuple[int, int]: - image_processor = self.get_image_processor() - target_width = image_processor.size["width"] - target_height = image_processor.size["height"] - patch_width = image_processor.patch_size["width"] - patch_height = image_processor.patch_size["height"] - - if not (image_width <= target_width and image_height <= target_height): - height_scale_factor = target_height / image_height - width_scale_factor = target_width / image_width - optimal_scale_factor = min(height_scale_factor, width_scale_factor) - - image_height = int(image_height * optimal_scale_factor) - image_width = int(image_width * optimal_scale_factor) - - ncols = math.ceil(image_width / patch_width) - nrows = math.ceil(image_height / patch_height) - return ncols, nrows -``` - -Based on this, we can initially define our replacement tokens as: - -```python -def get_replacement(item_idx: int): - images = mm_items.get_items("image", ImageProcessorItems) - image_size = images.get_image_size(item_idx) - - ncols, nrows = self.info.get_image_feature_grid_size( - image_width=image_size.width, - image_height=image_size.height, - ) - - # `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|` - # `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|` - return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows -``` - -However, this is not entirely correct. 
After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called, -a BOS token (`<s>`) is also added to the prompt: - -```python -# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435 -model_image_input = self.image_processor.preprocess_with_tokenizer_info( - image_input=tensor_batch_images, - image_present=image_present, - image_unpadded_h=image_unpadded_heights, - image_unpadded_w=image_unpadded_widths, - image_placeholder_id=image_placeholder_id, - image_newline_id=image_newline_id, - variable_sized=True, -) -prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch( - tokenizer=self.tokenizer, - prompts=prompts, - scale_factors=scale_factors, - max_tokens_to_generate=self.max_tokens_to_generate, - max_position_embeddings=self.max_position_embeddings, - add_BOS=True, - add_beginning_of_answer_token=True, -) -``` - -To assign the vision embeddings to only the image tokens, instead of a string -you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`: - -```python -hf_config = self.info.get_hf_config() -bos_token_id = hf_config.bos_token_id # `<s>` -assert isinstance(bos_token_id, int) - -def get_replacement_fuyu(item_idx: int): - images = mm_items.get_items("image", ImageProcessorItems) - image_size = images.get_image_size(item_idx) - - ncols, nrows = self.info.get_image_feature_grid_size( - image_width=image_size.width, - image_height=image_size.height, - ) - image_tokens = ([_IMAGE_TOKEN_ID] * ncols + - [_NEWLINE_TOKEN_ID]) * nrows - - return PromptUpdateDetails.select_token_id( - image_tokens + [bos_token_id], - embed_token_id=_IMAGE_TOKEN_ID, - ) -``` - -Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt, -we can search for it to conduct the replacement at the start of the string: - -```python -def _get_prompt_updates( - self, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, 
object], - out_mm_kwargs: MultiModalKwargs, -) -> Sequence[PromptUpdate]: - hf_config = self.info.get_hf_config() - bos_token_id = hf_config.bos_token_id - assert isinstance(bos_token_id, int) - - tokenizer = self.info.get_tokenizer() - eot_token_id = tokenizer.bos_token_id - assert isinstance(eot_token_id, int) - - def get_replacement_fuyu(item_idx: int): - images = mm_items.get_items("image", ImageProcessorItems) - image_size = images.get_image_size(item_idx) - - ncols, nrows = self.info.get_image_feature_grid_size( - image_width=image_size.width, - image_height=image_size.height, - ) - image_tokens = ([_IMAGE_TOKEN_ID] * ncols + - [_NEWLINE_TOKEN_ID]) * nrows - - return PromptUpdateDetails.select_token_id( - image_tokens + [bos_token_id], - embed_token_id=_IMAGE_TOKEN_ID, - ) - - return [ - PromptReplacement( - modality="image", - target=[eot_token_id], - replacement=get_replacement_fuyu, - ) - ] -``` - -::: - -:::: - -## 5. Register processor-related classes - -After you have defined {class}`~vllm.multimodal.processing.BaseProcessingInfo` (Step 2), -{class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` (Step 3), -and {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` (Step 4), -decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_processor <vllm.multimodal.registry.MultiModalRegistry.register_processor>` -to register them to the multi-modal registry: - -```diff - from vllm.model_executor.models.interfaces import SupportsMultiModal -+ from vllm.multimodal import MULTIMODAL_REGISTRY - -+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor, -+ info=YourProcessingInfo, -+ dummy_inputs=YourDummyInputsBuilder) - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): -``` - -## Notes - -### Inserting feature tokens without replacement - -Some HF processors directly insert feature tokens without replacing anything in the original prompt. 
In that case, you can use {class}`~vllm.multimodal.processing.PromptInsertion` instead of {class}`~vllm.multimodal.processing.PromptReplacement` inside {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates`. - -Examples: - -- BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py> -- Florence2 (insert at start of prompt): <gh-file:vllm/model_executor/models/florence2.py> -- Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py> - -### Handling prompt updates unrelated to multi-modal data - -{meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` assumes that each application of prompt update corresponds to one multi-modal item. If the HF processor performs additional processing regardless of how many multi-modal items there are, you should override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_tokens_only` so that the processed token inputs are consistent with the result of applying the HF processor on text inputs. This is because token inputs bypass the HF processor according to [our design](#mm-processing). - -Examples: - -- Chameleon (appends `sep_token`): <gh-file:vllm/model_executor/models/chameleon.py> -- Fuyu (appends `boa_token`): <gh-file:vllm/model_executor/models/fuyu.py> -- Molmo (applies chat template which is not defined elsewhere): <gh-file:vllm/model_executor/models/molmo.py> - -### Custom HF processor - -Some models don't define a HF processor class on HF Hub. In that case, you can define a custom HF processor that has the same call signature as HF processors and pass it to {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor`. 
- -Examples: - -- DeepSeek-VL2: <gh-file:vllm/model_executor/models/deepseek_vl2.py> -- InternVL: <gh-file:vllm/model_executor/models/internvl.py> -- Qwen-VL: <gh-file:vllm/model_executor/models/qwen_vl.py> diff --git a/docs/source/contributing/model/registration.md b/docs/source/contributing/model/registration.md deleted file mode 100644 index 64cd25b53807e..0000000000000 --- a/docs/source/contributing/model/registration.md +++ /dev/null @@ -1,55 +0,0 @@ -(new-model-registration)= - -# Registering a Model to vLLM - -vLLM relies on a model registry to determine how to run each model. -A list of pre-registered architectures can be found [here](#supported-models). - -If your model is not on this list, you must register it to vLLM. -This page provides detailed instructions on how to do so. - -## Built-in models - -To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source](#build-from-source). -This gives you the ability to modify the codebase and test your model. - -After you have implemented your model (see [tutorial](#new-model-basic)), put it into the <gh-dir:vllm/model_executor/models> directory. -Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM. -Finally, update our [list of supported models](#supported-models) to promote your model! - -:::{important} -The list of models in each section should be maintained in alphabetical order. -::: - -## Out-of-tree models - -You can load an external model using a plugin without modifying the vLLM codebase. 
- -:::{seealso} -[vLLM's Plugin System](#plugin-system) -::: - -To register the model, use the following code: - -```python -from vllm import ModelRegistry -from your_code import YourModelForCausalLM -ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) -``` - -If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`: - -```python -from vllm import ModelRegistry - -ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") -``` - -:::{important} -If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. -Read more about that [here](#supports-multimodal). -::: - -:::{note} -Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. -::: diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md deleted file mode 100644 index ca56710bc2ef2..0000000000000 --- a/docs/source/deployment/docker.md +++ /dev/null @@ -1,133 +0,0 @@ -(deployment-docker)= - -# Using Docker - -(deployment-docker-pre-built-image)= - -## Use vLLM's Official Docker Image - -vLLM offers an official Docker image for deployment. -The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags). - -```console -$ docker run --runtime nvidia --gpus all \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - --env "HUGGING_FACE_HUB_TOKEN=<secret>" \ - -p 8000:8000 \ - --ipc=host \ - vllm/vllm-openai:latest \ - --model mistralai/Mistral-7B-v0.1 -``` - -This image can also be used with other container engines such as [Podman](https://podman.io/). 
- -```console -$ podman run --gpus all \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ - -p 8000:8000 \ - --ipc=host \ - vllm/vllm-openai:latest \ - --model mistralai/Mistral-7B-v0.1 -``` - -You can add any other <project:#engine-args> you need after the image tag (`vllm/vllm-openai:latest`). - -:::{note} -You can either use the `ipc=host` flag or `--shm-size` flag to allow the -container to access the host's shared memory. vLLM uses PyTorch, which uses shared -memory to share data between processes under the hood, particularly for tensor parallel inference. -::: - -:::{note} -Optional dependencies are not included in order to avoid licensing issues (e.g. <gh-issue:8030>). - -If you need to use those dependencies (having accepted the license terms), -create a custom Dockerfile on top of the base image with an extra layer that installs them: - -```Dockerfile -FROM vllm/vllm-openai:v0.8.3 - -# e.g. install the `audio` optional dependencies -# NOTE: Make sure the version of vLLM matches the base image! -RUN uv pip install --system vllm[audio]==0.8.3 -``` - -::: - -:::{tip} -Some new models may only be available on the main branch of [HF Transformers](https://github.com/huggingface/transformers). - -To use the development version of `transformers`, create a custom Dockerfile on top of the base image -with an extra layer that installs their code from source: - -```Dockerfile -FROM vllm/vllm-openai:latest - -RUN uv pip install --system git+https://github.com/huggingface/transformers.git -``` - -::: - -(deployment-docker-build-image-from-source)= - -## Building vLLM's Docker Image from Source - -You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM: - -```console -# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 -DOCKER_BUILDKIT=1 docker build . 
--target vllm-openai --tag vllm/vllm-openai --file docker/Dockerfile -``` - -:::{note} -By default vLLM will build for all GPU types for widest distribution. If you are just building for the -current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""` -for vLLM to find the current GPU type and build for that. - -If you are using Podman instead of Docker, you might need to disable SELinux labeling by -adding `--security-opt label=disable` when running `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184). -::: - -## Building for Arm64/aarch64 - -A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use -of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64. - -:::{note} -Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=` -flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits. -Keep an eye on memory usage with parallel jobs as it can be substantial (see example below). -::: - -```console -# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) -$ python3 use_existing_torch.py -$ DOCKER_BUILDKIT=1 docker build . 
\ - --file docker/Dockerfile \ - --target vllm-openai \ - --platform "linux/arm64" \ - -t vllm/vllm-gh200-openai:latest \ - --build-arg max_jobs=66 \ - --build-arg nvcc_threads=2 \ - --build-arg torch_cuda_arch_list="9.0+PTX" \ - --build-arg vllm_fa_cmake_gpu_arches="90-real" -``` - -## Use the custom-built vLLM Docker image - -To run vLLM with the custom-built Docker image: - -```console -$ docker run --runtime nvidia --gpus all \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - -p 8000:8000 \ - --env "HUGGING_FACE_HUB_TOKEN=<secret>" \ - vllm/vllm-openai <args...> -``` - -The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command). - -:::{note} -**For version 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` . -::: diff --git a/docs/source/deployment/frameworks/helm.md b/docs/source/deployment/frameworks/helm.md deleted file mode 100644 index 7320d727fbaa4..0000000000000 --- a/docs/source/deployment/frameworks/helm.md +++ /dev/null @@ -1,250 +0,0 @@ -(deployment-helm)= - -# Helm - -A Helm chart to deploy vLLM for Kubernetes - -Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLM Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values. 
- -This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm installation and documentation on architecture and values file. - -## Prerequisites - -Before you begin, ensure that you have the following: - -- A running Kubernetes cluster -- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin) -- Available GPU resources in your cluster -- S3 with the model which will be deployed - -## Installing the chart - -To install the chart with the release name `test-vllm`: - -```console -helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3bucketname=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY -``` - -## Uninstalling the Chart - -To uninstall the `test-vllm` deployment: - -```console -helm uninstall test-vllm --namespace=ns-vllm -``` - -The command removes all the Kubernetes components associated with the -chart **including persistent volumes** and deletes the release. 
- -## Architecture - -:::{image} /assets/deployment/architecture_helm_deployment.png -::: - -## Values - -:::{list-table} -:widths: 25 25 25 25 -:header-rows: 1 - -- * Key - * Type - * Default - * Description -- * autoscaling - * object - * {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} - * Autoscaling configuration -- * autoscaling.enabled - * bool - * false - * Enable autoscaling -- * autoscaling.maxReplicas - * int - * 100 - * Maximum replicas -- * autoscaling.minReplicas - * int - * 1 - * Minimum replicas -- * autoscaling.targetCPUUtilizationPercentage - * int - * 80 - * Target CPU utilization for autoscaling -- * configs - * object - * {} - * Configmap -- * containerPort - * int - * 8000 - * Container port -- * customObjects - * list - * [] - * Custom Objects configuration -- * deploymentStrategy - * object - * {} - * Deployment strategy configuration -- * externalConfigs - * list - * [] - * External configuration -- * extraContainers - * list - * [] - * Additional containers configuration -- * extraInit - * object - * {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} - * Additional configuration for the init container -- * extraInit.pvcStorage - * string - * "50Gi" - * Storage size of the s3 -- * extraInit.s3modelpath - * string - * "relative_s3_model_path/opt-125m" - * Path of the model on the s3 which hosts model weights and config files -- * extraInit.awsEc2MetadataDisabled - * boolean - * true - * Disables the use of the Amazon EC2 instance metadata service -- * extraPorts - * list - * [] - * Additional ports configuration -- * gpuModels - * list - * ["TYPE_GPU_USED"] - * Type of gpu used -- * image - * object - * {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} - * Image configuration -- * image.command - * list - * 
["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] - * Container launch command -- * image.repository - * string - * "vllm/vllm-openai" - * Image repository -- * image.tag - * string - * "latest" - * Image tag -- * livenessProbe - * object - * {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} - * Liveness probe configuration -- * livenessProbe.failureThreshold - * int - * 3 - * Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive -- * livenessProbe.httpGet - * object - * {"path":"/health","port":8000} - * Configuration of the Kubelet http request on the server -- * livenessProbe.httpGet.path - * string - * "/health" - * Path to access on the HTTP server -- * livenessProbe.httpGet.port - * int - * 8000 - * Name or number of the port to access on the container, on which the server is listening -- * livenessProbe.initialDelaySeconds - * int - * 15 - * Number of seconds after the container has started before liveness probe is initiated -- * livenessProbe.periodSeconds - * int - * 10 - * How often (in seconds) to perform the liveness probe -- * maxUnavailablePodDisruptionBudget - * string - * "" - * Disruption Budget Configuration -- * readinessProbe - * object - * {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} - * Readiness probe configuration -- * readinessProbe.failureThreshold - * int - * 3 - * Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready -- * readinessProbe.httpGet - * object - * {"path":"/health","port":8000} - * Configuration of the Kubelet http request on the server -- * readinessProbe.httpGet.path - * string - * "/health" - * Path to access on the HTTP server -- * readinessProbe.httpGet.port - * int - * 8000 - * Name or number of 
the port to access on the container, on which the server is listening -- * readinessProbe.initialDelaySeconds - * int - * 5 - * Number of seconds after the container has started before readiness probe is initiated -- * readinessProbe.periodSeconds - * int - * 5 - * How often (in seconds) to perform the readiness probe -- * replicaCount - * int - * 1 - * Number of replicas -- * resources - * object - * {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} - * Resource configuration -- * resources.limits."nvidia.com/gpu" - * int - * 1 - * Number of gpus used -- * resources.limits.cpu - * int - * 4 - * Number of CPUs -- * resources.limits.memory - * string - * "16Gi" - * CPU memory configuration -- * resources.requests."nvidia.com/gpu" - * int - * 1 - * Number of gpus used -- * resources.requests.cpu - * int - * 4 - * Number of CPUs -- * resources.requests.memory - * string - * "16Gi" - * CPU memory configuration -- * secrets - * object - * {} - * Secrets configuration -- * serviceName - * string - * - * Service name -- * servicePort - * int - * 80 - * Service port -- * labels.environment - * string - * test - * Environment name -- * labels.release - * string - * test - * Release name -::: diff --git a/docs/source/deployment/frameworks/index.md b/docs/source/deployment/frameworks/index.md deleted file mode 100644 index 3408c6c10edef..0000000000000 --- a/docs/source/deployment/frameworks/index.md +++ /dev/null @@ -1,22 +0,0 @@ -# Using other frameworks - -:::{toctree} -:maxdepth: 1 - -anything-llm -bentoml -cerebrium -chatbox -dify -dstack -helm -litellm -lobe-chat -lws -modal -open-webui -retrieval_augmented_generation -skypilot -streamlit -triton -::: diff --git a/docs/source/deployment/integrations/index.md b/docs/source/deployment/integrations/index.md deleted file mode 100644 index 410742b88c735..0000000000000 --- a/docs/source/deployment/integrations/index.md +++ /dev/null @@ -1,11 +0,0 @@ -# External 
Integrations - -:::{toctree} -:maxdepth: 1 - -kserve -kubeai -llamastack -llmaz -production-stack -::: diff --git a/docs/source/design/kernel/paged_attention.md b/docs/source/design/kernel/paged_attention.md deleted file mode 100644 index e1770c8226435..0000000000000 --- a/docs/source/design/kernel/paged_attention.md +++ /dev/null @@ -1,529 +0,0 @@ -(design-paged-attention)= - -# vLLM Paged Attention - -- Currently, vLLM utilizes its own implementation of a multi-head query - attention kernel (`csrc/attention/attention_kernels.cu`). - This kernel is designed to be compatible with - vLLM's paged KV caches, where the key and value cache are stored in - separate blocks (note that this block concept differs from the GPU - thread block. So in a later document, I will refer to vLLM paged - attention block as "block", while refer to GPU thread block as - "thread block"). -- To achieve high performance, this kernel relies on a specially - designed memory layout and access method, specifically when threads - read data from global memory to shared memory. The purpose of this - document is to provide a high-level explanation of the kernel - implementation step by step, aiding those who wish to learn about the - vLLM multi-head query attention kernel. After going through this - document, users will likely have a better understanding and feel easier - to follow the actual implementation. -- Please note that this document may not cover all details, such as how - to calculate the correct index for the corresponding data or the dot - multiplication implementation. However, after reading this document - and becoming familiar with the high-level logic flow, it should be - easier for you to read the actual code and understand the details. - -## Inputs - -- The kernel function takes a list of arguments for the current thread - to perform its assigned work. 
The three most important arguments are - the input pointers `q`, `k_cache`, and `v_cache`, which point - to query, key, and value data on global memory that need to be read - and processed. The output pointer `out` points to global memory - where the result should be written. These four pointers actually - refer to multi-dimensional arrays, but each thread only accesses the - portion of data assigned to it. I have omitted all other runtime - parameters here for simplicity. - - ```cpp - template< - typename scalar_t, - int HEAD_SIZE, - int BLOCK_SIZE, - int NUM_THREADS, - int PARTITION_SIZE = 0> - __device__ void paged_attention_kernel( - ... // Other side args. - const scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size] - const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] - const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] - const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] - ... // Other side args. - ) - ``` - -- There are also a list of template arguments above the function - signature that are determined during compilation time. `scalar_t` - represents the data type of the query, key, and value data elements, - such as FP16. `HEAD_SIZE` indicates the number of elements in each - head. `BLOCK_SIZE` refers to the number of tokens in each block. - `NUM_THREADS` denotes the number of threads in each thread block. - `PARTITION_SIZE` represents the number of tensor parallel GPUs (For - simplicity, we assume this is 0 and tensor parallel is disabled). - -- With these arguments, we need to perform a sequence of preparations. - This includes calculating the current head index, block index, and - other necessary variables. However, for now, we can ignore these - preparations and proceed directly to the actual calculations. It will - be easier to understand them once we grasp the entire flow. 
- -## Concepts - -- Just before we dive into the calculation flow, I want to describe a - few concepts that are needed for later sections. However, you may - skip this section and return later if you encounter any confusing - terminologies. -- **Sequence**: A sequence represents a client request. For example, - the data pointed to by `q` has a shape of - `[num_seqs, num_heads, head_size]`. That represents there are total - `num_seqs` of query sequence data are pointed by `q`. Since this - kernel is a single query attention kernel, each sequence only has one - query token. Hence, the `num_seqs` equals the total number of tokens - that are processed in the batch. -- **Context**: The context consists of the generated tokens from the - sequence. For instance, `["What", "is", "your"]` are the context - tokens, and the input query token is `"name"`. The model might - generate the token `"?"`. -- **Vec**: The vec is a list of elements that are fetched and - calculated together. For query and key data, the vec size - (`VEC_SIZE`) is determined so that each thread group can fetch and - calculate 16 bytes of data at a time. For value data, the vec size - (`V_VEC_SIZE`) is determined so that each thread can fetch and - calculate 16 bytes of data at a time. For example, if the - `scalar_t` is FP16 (2 bytes) and `THREAD_GROUP_SIZE` is 2, the - `VEC_SIZE` will be 4, while the `V_VEC_SIZE` will be 8. -- **Thread group**: The thread group is a small group of - threads(`THREAD_GROUP_SIZE`) that fetches and calculates one - query token and one key token at a time. Each thread handles only a - portion of the token data. The total number of elements processed by - one thread group is referred as `x`. For example, if the thread - group contains 2 threads and the head size is 8, then thread 0 - handles the query and key elements at index 0, 2, 4, 6, while thread - 1 handles the elements at index 1, 3, 5, 7. -- **Block**: The key and value cache data in vLLM are split into - blocks. 
Each block stores data for a fixed number(`BLOCK_SIZE`) - of tokens at one head. Each block may contain only a portion of the - whole context tokens. For example, if the block size is 16 and the - head size is 128, then for one head, one block can store 16 * 128 = - 2048 elements. -- **Warp**: A warp is a group of 32 threads(`WARP_SIZE`) that - execute simultaneously on a stream multiprocessor (SM). In this - kernel, each warp processes the calculation between one query token - and key tokens of one entire block at a time (it may process multiple - blocks in multiple iterations). For example, if there are 4 warps and - 6 blocks for one context, the assignment would be like warp 0 handles - the 0th, 4th blocks, warp 1 handles the 1st, 5th blocks, warp 2 - handles the 2nd block and warp 3 handles the 3rd block. -- **Thread block**: A thread block is a group of - threads(`NUM_THREADS`) that can access the same shared memory. - Each thread block contains multiple warps(`NUM_WARPS`), and in - this kernel, each thread block processes the calculation between one - query token and key tokens of a whole context. -- **Grid**: A grid is a collection of thread blocks and defines the - shape of the collection. In this kernel, the shape is - `(num_heads, num_seqs, max_num_partitions)`. Therefore, each thread - block only handles the calculation for one head, one sequence, and - one partition. - -## Query - -- This section will introduce how query data is stored in memory and - fetched by each thread. As mentioned above, each thread group fetches - one query token data, while each thread itself only handles a part of - one query token data. Within each warp, every thread group will fetch - the same query token data, but will multiply it with different key - token data. 
- - ```cpp - const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; - ``` - - :::{figure} ../../assets/kernel/query.png - :align: center - :alt: query - :width: 70% - - Query data of one token at one head - ::: - -- Each thread defines its own `q_ptr` which points to the assigned - query token data on global memory. For example, if `VEC_SIZE` is 4 - and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains - total of 128 elements divided into 128 / 4 = 32 vecs. - - :::{figure} ../../assets/kernel/q_vecs.png - :align: center - :alt: q_vecs - :width: 70% - - `q_vecs` for one thread group - ::: - - ```cpp - __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; - ``` - -- Next, we need to read the global memory data pointed to by `q_ptr` - into shared memory as `q_vecs`. It is important to note that each - vecs is assigned to a different row. For example, if the - `THREAD_GROUP_SIZE` is 2, thread 0 will handle the 0th row vecs, - while thread 1 handles the 1st row vecs. By reading the query data in - this way, neighboring threads like thread 0 and thread 1 can read - neighbor memory, achieving the memory coalescing to improve - performance. - -## Key - -- Similar to the "Query" section, this section introduces memory layout - and assignment for keys. While each thread group only handle one - query token one kernel run, it may handle multiple key tokens across - multiple iterations. Meanwhile, each warp will process multiple blocks - of key tokens in multiple iterations, ensuring that all context - tokens are processed by the entire thread group after the kernel run. - In this context, "handle" refers to performing the dot multiplication - between query data and key data. - - ```cpp - const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride - + kv_head_idx * kv_head_stride - + physical_block_offset * x; - ``` - -- Unlike to `q_ptr`, `k_ptr` in each thread will point to different - key token at different iterations. 
As shown above, `k_ptr` - points to the key token data in `k_cache` at the assigned block, - assigned head and assigned token.
It will illustrate - the query and key calculation flow in a clearer and higher-level - manner. - -## QK - -- As shown the pseudo code below, before the entire for loop block, we - fetch the query data for one token and store it in `q_vecs`. Then, - in the outer for loop, we iterate through different `k_ptrs` that - point to different tokens and prepare the `k_vecs` in the inner for - loop. Finally, we perform the dot multiplication between the - `q_vecs` and each `k_vecs`. - - ```cpp - q_vecs = ... - for ... { - k_ptr = ... - for ... { - k_vecs[i] = ... - } - ... - float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs); - } - ``` - -- As mentioned before, for each thread, it only fetches part of the - query and key token data at a time. However, there will be a cross - thread group reduction happen in the `Qk_dot<>::dot` . So `qk` - returned here is not just between part of the query and key token dot - multiplication, but actually a full result between entire query and - key token data. - -- For example, if the value of `HEAD_SIZE` is 128 and - `THREAD_GROUP_SIZE` is 2, each thread's `k_vecs` will contain - total 64 elements. However, the returned `qk` is actually the - result of dot multiplication between 128 query elements and 128 key - elements. If you want to learn more about the details of the dot - multiplication and reduction, you may refer to the implementation of - `Qk_dot<>::dot`. However, for the sake of simplicity, I will not - cover it in this document. - -## Softmax - -- Next, we need to calculate the normalized softmax for all `qk`s, - as shown above, where each $x$ represents a `qk`. To do this, - we must obtain the reduced value of `qk_max`($m(x)$) and - the `exp_sum`($\ell(x)$) of all `qk`s. The reduction - should be performed across the entire thread block, encompassing - results between the query token and all context key tokens. 
- - :::{math} - :nowrap: true - - \begin{gather*} - m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\ - \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)} - \end{gather*} - ::: - -### `qk_max` and `logits` - -- Just right after we get the `qk` result, we can set the temporary - `logits` result with `qk` (In the end, the `logits` should - store the normalized softmax result). Also we can compare and collect - the `qk_max` for all `qk`s that are calculated by current - thread group. - - ```cpp - if (thread_group_offset == 0) { - const bool mask = token_idx >= context_len; - logits[token_idx - start_token_idx] = mask ? 0.f : qk; - qk_max = mask ? qk_max : fmaxf(qk_max, qk); - } - ``` - -- Please note that the `logits` here is on shared memory, so each - thread group will set the fields for its own assigned context tokens. - Overall, the size of logits should be number of context tokens. - - ```cpp - for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) { - qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); - } - - if (lane == 0) { - red_smem[warp_idx] = qk_max; - } - ``` - -- Then we need to get the reduced `qk_max` across each warp. The main - idea is to make threads in warp to communicate with each other and - get the final max `qk` . - - ```cpp - for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { - qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); - } - qk_max = VLLM_SHFL_SYNC(qk_max, 0); - ``` - -- Finally, we can get the reduced `qk_max` from whole thread block by - compare the `qk_max` from all warps in this thread block. Then we - need to broadcast the final result to each thread. - -### `exp_sum` - -- Similar to `qk_max`, we need to get the reduced sum value from the - entire thread block too. 
- - ```cpp - for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { - float val = __expf(logits[i] - qk_max); - logits[i] = val; - exp_sum += val; - } - ... - exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], exp_sum); - ``` - -- Firstly, sum all exp values from each thread group, and meanwhile, - convert each entry of `logits` from `qk` to `exp(qk - qk_max)`. - Please note, the `qk_max` here is already the max `qk` across the - whole thread block. And then we can do reduction for `exp_sum` - across whole thread block just like the `qk_max`. - - ```cpp - const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f); - for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { - logits[i] *= inv_sum; - } - ``` - -- Finally, with the reduced `qk_max` and `exp_sum`, we can obtain - the final normalized softmax result as `logits`. This `logits` - variable will be used for dot multiplication with the value data in - later steps. Now, it should store the normalized softmax result of - `qk` for all assigned context tokens. - -## Value - -:::{figure} ../../assets/kernel/value.png -:align: center -:alt: value -:width: 70% - -Value data of all context tokens at one head -::: - -:::{figure} ../../assets/kernel/logits_vec.png -:align: center -:alt: logits_vec -:width: 50% - -`logits_vec` for one thread -::: - -:::{figure} ../../assets/kernel/v_vec.png -:align: center -:alt: v_vec -:width: 70% - -List of `v_vec` for one thread -::: - -- Now we need to retrieve the value data and perform dot multiplication - with `logits`. Unlike query and key, there is no thread group - concept for value data. As shown in diagram, different from key token - memory layout, elements from the same column correspond to the same - value token. For one block of value data, there are `HEAD_SIZE` of - rows and `BLOCK_SIZE` of columns that are split into multiple - `v_vecs`. - -- Each thread always fetches `V_VEC_SIZE` elements from the same - `V_VEC_SIZE` of tokens at a time. 
As a result, a single thread - retrieves multiple `v_vec`s from different rows and the same - columns through multiple inner iterations. For each `v_vec`, it - needs to be dot multiplied with the corresponding `logits_vec`, - which is also `V_VEC_SIZE` elements from `logits`. Overall, with - multiple inner iterations, each warp will process one block of value - tokens. And with multiple outer iterations, the whole context value - tokens are processed - - ```cpp - float accs[NUM_ROWS_PER_THREAD]; - for ... { // Iteration over different blocks. - logits_vec = ... - for ... { // Iteration over different rows. - v_vec = ... - ... - accs[i] += dot(logits_vec, v_vec); - } - } - ``` - -- As shown in the above pseudo code, in the outer loop, similar to - `k_ptr`, `logits_vec` iterates over different blocks and reads - `V_VEC_SIZE` elements from `logits`. In the inner loop, each - thread reads `V_VEC_SIZE` elements from the same tokens as a - `v_vec` and performs dot multiplication. It is important to note - that in each inner iteration, the thread fetches different head - position elements for the same tokens. The dot result is then - accumulated in `accs`. Therefore, each entry of `accs` is mapped - to a head position assigned to the current thread. - -- For example, if `BLOCK_SIZE` is 16 and `V_VEC_SIZE` is 8, each - thread fetches 8 value elements for 8 tokens at a time. Each element - is from different tokens at the same head position. If `HEAD_SIZE` - is 128 and `WARP_SIZE` is 32, for each inner loop, a warp needs to - fetch `WARP_SIZE * V_VEC_SIZE = 256` elements. This means there are - a total of 128 * 16 / 256 = 8 inner iterations for a warp to handle - a whole block of value tokens. And each `accs` in each thread - contains 8 elements that accumulated at 8 different head positions. - For the thread 0, the `accs` variable will have 8 elements, which - are 0th, 32th โ€ฆ 224th elements of a value head that are accumulated - from all assigned 8 tokens. 
- -## LV - -- Now, we need to perform reduction for `accs` within each warp. This - process allows each thread to accumulate the `accs` for the - assigned head positions of all tokens in one block. - - ```cpp - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - float acc = accs[i]; - for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) { - acc += VLLM_SHFL_XOR_SYNC(acc, mask); - } - accs[i] = acc; - } - ``` - -- Next, we perform reduction for `accs` across all warps, allowing - each thread to have the accumulation of `accs` for the assigned - head positions of all context tokens. Please note that each `accs` - in every thread only stores the accumulation for a portion of - elements of the entire head for all context tokens. However, overall, - all results for output have been calculated but are just stored in - different thread register memory. - - ```cpp - float* out_smem = reinterpret_cast<float*>(shared_mem); - for (int i = NUM_WARPS; i > 1; i /= 2) { - // Upper warps write to shared memory. - ... - float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE]; - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - ... - dst[row_idx] = accs[i]; - } - - // Lower warps update the output. - const float* src = &out_smem[warp_idx * HEAD_SIZE]; - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - ... - accs[i] += src[row_idx]; - } - - // Write out the accs. - } - ``` - -## Output - -- Now we can write all of calculated result from local register memory - to final output global memory. - - ```cpp - scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE - + head_idx * max_num_partitions * HEAD_SIZE - + partition_idx * HEAD_SIZE; - ``` - -- First, we need to define the `out_ptr` variable, which points to - the start address of the assigned sequence and assigned head. 
- - ```cpp - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; - if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) { - from_float(*(out_ptr + row_idx), accs[i]); - } - } - ``` - -- Finally, we need to iterate over different assigned head positions - and write out the corresponding accumulated result based on the - `out_ptr`. diff --git a/docs/source/features/compatibility_matrix.md b/docs/source/features/compatibility_matrix.md deleted file mode 100644 index 8865d26deaeda..0000000000000 --- a/docs/source/features/compatibility_matrix.md +++ /dev/null @@ -1,476 +0,0 @@ -(compatibility-matrix)= - -# Compatibility Matrix - -The tables below show mutually exclusive features and the support on some hardware. - -The symbols used have the following meanings: - -- โœ… = Full compatibility -- ๐ŸŸ  = Partial compatibility -- โŒ = No compatibility - -:::{note} -Check the โŒ or ๐ŸŸ  with links to see tracking issue for unsupported feature/hardware combination. 
-::: - -## Feature x Feature - -:::{raw} html -<style> - /* Make smaller to try to improve readability */ - td { - font-size: 0.8rem; - text-align: center; - } - - th { - text-align: center; - font-size: 0.8rem; - } -</style> -::: - -:::{list-table} -:header-rows: 1 -:stub-columns: 1 -:widths: auto -:class: vertical-table-header - -- * Feature - * [CP](#chunked-prefill) - * [APC](#automatic-prefix-caching) - * [LoRA](#lora-adapter) - * <abbr title="Prompt Adapter">prmpt adptr</abbr> - * [SD](#spec-decode) - * CUDA graph - * <abbr title="Pooling Models">pooling</abbr> - * <abbr title="Encoder-Decoder Models">enc-dec</abbr> - * <abbr title="Logprobs">logP</abbr> - * <abbr title="Prompt Logprobs">prmpt logP</abbr> - * <abbr title="Async Output Processing">async output</abbr> - * multi-step - * <abbr title="Multimodal Inputs">mm</abbr> - * best-of - * beam-search - * <abbr title="Guided Decoding">guided dec</abbr> -- * [CP](#chunked-prefill) - * โœ… - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * -- * [APC](#automatic-prefix-caching) - * โœ… - * โœ… - * - * - * - * - * - * - * - * - * - * - * - * - * - * -- * [LoRA](#lora-adapter) - * โœ… - * โœ… - * โœ… - * - * - * - * - * - * - * - * - * - * - * - * - * -- * <abbr title="Prompt Adapter">prmpt adptr</abbr> - * โœ… - * โœ… - * โœ… - * โœ… - * - * - * - * - * - * - * - * - * - * - * - * -- * [SD](#spec-decode) - * โœ… - * โœ… - * โŒ - * โœ… - * โœ… - * - * - * - * - * - * - * - * - * - * - * -- * CUDA graph - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * - * - * - * - * - * - * - * - * - * -- * <abbr title="Pooling Models">pooling</abbr> - * โŒ - * โŒ - * โŒ - * โŒ - * โŒ - * โŒ - * โœ… - * - * - * - * - * - * - * - * - * -- * <abbr title="Encoder-Decoder Models">enc-dec</abbr> - * โŒ - * [โŒ](gh-issue:7366) - * โŒ - * โŒ - * [โŒ](gh-issue:7366) - * โœ… - * โœ… - * โœ… - * - * - * - * - * - * - * - * -- * <abbr title="Logprobs">logP</abbr> - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * 
โŒ - * โœ… - * โœ… - * - * - * - * - * - * - * -- * <abbr title="Prompt Logprobs">prmpt logP</abbr> - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โŒ - * โœ… - * โœ… - * โœ… - * - * - * - * - * - * -- * <abbr title="Async Output Processing">async output</abbr> - * โœ… - * โœ… - * โœ… - * โœ… - * โŒ - * โœ… - * โŒ - * โŒ - * โœ… - * โœ… - * โœ… - * - * - * - * - * -- * multi-step - * โŒ - * โœ… - * โŒ - * โœ… - * โŒ - * โœ… - * โŒ - * โŒ - * โœ… - * โœ… - * โœ… - * โœ… - * - * - * - * -- * <abbr title="Multimodal Inputs">mm</abbr> - * โœ… - * [๐ŸŸ ](gh-pr:8348) - * [๐ŸŸ ](gh-pr:4194) - * โ” - * โ” - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โ” - * โœ… - * - * - * -- * best-of - * โœ… - * โœ… - * โœ… - * โœ… - * [โŒ](gh-issue:6137) - * โœ… - * โŒ - * โœ… - * โœ… - * โœ… - * โ” - * [โŒ](gh-issue:7968) - * โœ… - * โœ… - * - * -- * beam-search - * โœ… - * โœ… - * โœ… - * โœ… - * [โŒ](gh-issue:6137) - * โœ… - * โŒ - * โœ… - * โœ… - * โœ… - * โ” - * [โŒ](gh-issue:7968) - * โ” - * โœ… - * โœ… - * -- * <abbr title="Guided Decoding">guided dec</abbr> - * โœ… - * โœ… - * โ” - * โ” - * [โŒ](gh-issue:11484) - * โœ… - * โŒ - * โ” - * โœ… - * โœ… - * โœ… - * [โŒ](gh-issue:9893) - * โ” - * โœ… - * โœ… - * โœ… -::: - -(feature-x-hardware)= - -## Feature x Hardware - -:::{list-table} -:header-rows: 1 -:stub-columns: 1 -:widths: auto - -- * Feature - * Volta - * Turing - * Ampere - * Ada - * Hopper - * CPU - * AMD -- * [CP](#chunked-prefill) - * [โŒ](gh-issue:2729) - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… -- * [APC](#automatic-prefix-caching) - * [โŒ](gh-issue:3687) - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… -- * [LoRA](#lora-adapter) - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… -- * <abbr title="Prompt Adapter">prmpt adptr</abbr> - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * [โŒ](gh-issue:8475) - * โœ… -- * [SD](#spec-decode) - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… -- * CUDA graph - * โœ… - * โœ… - * 
โœ… - * โœ… - * โœ… - * โŒ - * โœ… -- * <abbr title="Pooling Models">pooling</abbr> - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โ” -- * <abbr title="Encoder-Decoder Models">enc-dec</abbr> - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โŒ -- * <abbr title="Multimodal Inputs">mm</abbr> - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… -- * <abbr title="Logprobs">logP</abbr> - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… -- * <abbr title="Prompt Logprobs">prmpt logP</abbr> - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… -- * <abbr title="Async Output Processing">async output</abbr> - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โŒ - * โŒ -- * multi-step - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * [โŒ](gh-issue:8477) - * โœ… -- * best-of - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… -- * beam-search - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… -- * <abbr title="Guided Decoding">guided dec</abbr> - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… - * โœ… -::: diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md deleted file mode 100644 index 7ad46b7094ee9..0000000000000 --- a/docs/source/features/quantization/index.md +++ /dev/null @@ -1,24 +0,0 @@ -(quantization-index)= - -# Quantization - -Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices. 
- -:::{toctree} -:caption: Contents -:maxdepth: 1 - -supported_hardware -auto_awq -bnb -bitblas -gguf -gptqmodel -int4 -int8 -fp8 -modelopt -quark -quantized_kvcache -torchao -::: diff --git a/docs/source/features/quantization/supported_hardware.md b/docs/source/features/quantization/supported_hardware.md deleted file mode 100644 index f8af1ba60b125..0000000000000 --- a/docs/source/features/quantization/supported_hardware.md +++ /dev/null @@ -1,153 +0,0 @@ -(quantization-supported-hardware)= - -# Supported Hardware - -The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: - -:::{list-table} -:header-rows: 1 -:widths: 20 8 8 8 8 8 8 8 8 8 8 - -- * Implementation - * Volta - * Turing - * Ampere - * Ada - * Hopper - * AMD GPU - * Intel GPU - * x86 CPU - * AWS Inferentia - * Google TPU -- * AWQ - * โŒ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โŒ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โŒ - * โŒ -- * GPTQ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โŒ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โŒ - * โŒ -- * Marlin (GPTQ/AWQ/FP8) - * โŒ - * โŒ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โŒ - * โŒ - * โŒ - * โŒ - * โŒ -- * INT8 (W8A8) - * โŒ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โŒ - * โŒ - * โœ…๏ธŽ - * โŒ - * โœ…๏ธŽ -- * FP8 (W8A8) - * โŒ - * โŒ - * โŒ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โŒ - * โŒ - * โŒ - * โŒ -- * BitBLAS (GPTQ) - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โŒ - * โŒ - * โŒ - * โŒ - * โŒ -- * AQLM - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โŒ - * โŒ - * โŒ - * โŒ - * โŒ -- * bitsandbytes - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โŒ - * โŒ - * โŒ - * โŒ - * โŒ -- * DeepSpeedFP - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โŒ - * โŒ - * โŒ - * โŒ - * โŒ -- * GGUF - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โŒ - * โŒ - * โŒ - * โŒ -- * modelopt - * โœ…๏ธŽ - * 
โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ๏ธŽ - * โŒ - * โŒ - * โŒ - * โŒ - * โŒ -::: - -- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. -- โœ…๏ธŽ indicates that the quantization method is supported on the specified hardware. -- โŒ indicates that the quantization method is not supported on the specified hardware. - -:::{note} -This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. - -For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team. -::: diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py deleted file mode 100644 index f77dbefb0a018..0000000000000 --- a/docs/source/generate_examples.py +++ /dev/null @@ -1,244 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import itertools -import re -from dataclasses import dataclass, field -from pathlib import Path - -ROOT_DIR = Path(__file__).parent.parent.parent.resolve() -ROOT_DIR_RELATIVE = '../../../..' -EXAMPLE_DIR = ROOT_DIR / "examples" -EXAMPLE_DOC_DIR = ROOT_DIR / "docs/source/getting_started/examples" - - -def fix_case(text: str) -> str: - subs = { - "api": "API", - "cli": "CLI", - "cpu": "CPU", - "llm": "LLM", - "mae": "MAE", - "tpu": "TPU", - "aqlm": "AQLM", - "gguf": "GGUF", - "lora": "LoRA", - "rlhf": "RLHF", - "vllm": "vLLM", - "openai": "OpenAI", - "lmcache": "LMCache", - "multilora": "MultiLoRA", - "mlpspeculator": "MLPSpeculator", - r"fp\d+": lambda x: x.group(0).upper(), # e.g. fp16, fp32 - r"int\d+": lambda x: x.group(0).upper(), # e.g. int8, int16 - } - for pattern, repl in subs.items(): - text = re.sub(rf'\b{pattern}\b', repl, text, flags=re.IGNORECASE) - return text - - -@dataclass -class Index: - """ - Index class to generate a structured document index. 
- - Attributes: - path (Path): The path save the index file to. - title (str): The title of the index. - description (str): A brief description of the index. - caption (str): An optional caption for the table of contents. - maxdepth (int): The maximum depth of the table of contents. Defaults to 1. - documents (list[str]): A list of document paths to include in the index. Defaults to an empty list. - - Methods: - generate() -> str: - Generates the index content as a string in the specified format. - """ # noqa: E501 - path: Path - title: str - description: str - caption: str - maxdepth: int = 1 - documents: list[str] = field(default_factory=list) - - def generate(self) -> str: - content = f"# {self.title}\n\n{self.description}\n\n" - content += ":::{toctree}\n" - content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n" - content += "\n".join(self.documents) + "\n:::\n" - return content - - -@dataclass -class Example: - """ - Example class for generating documentation content from a given path. - - Attributes: - path (Path): The path to the main directory or file. - category (str): The category of the document. - main_file (Path): The main file in the directory. - other_files (list[Path]): list of other files in the directory. - title (str): The title of the document. - - Methods: - __post_init__(): Initializes the main_file, other_files, and title attributes. - determine_main_file() -> Path: Determines the main file in the given path. - determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file. - determine_title() -> str: Determines the title of the document. - generate() -> str: Generates the documentation content. 
- """ # noqa: E501 - path: Path - category: str = None - main_file: Path = field(init=False) - other_files: list[Path] = field(init=False) - title: str = field(init=False) - - def __post_init__(self): - self.main_file = self.determine_main_file() - self.other_files = self.determine_other_files() - self.title = self.determine_title() - - def determine_main_file(self) -> Path: - """ - Determines the main file in the given path. - If the path is a file, it returns the path itself. Otherwise, it searches - for Markdown files (*.md) in the directory and returns the first one found. - Returns: - Path: The main file path, either the original path if it's a file or the first - Markdown file found in the directory. - Raises: - IndexError: If no Markdown files are found in the directory. - """ # noqa: E501 - return self.path if self.path.is_file() else list( - self.path.glob("*.md")).pop() - - def determine_other_files(self) -> list[Path]: - """ - Determine other files in the directory excluding the main file. - - This method checks if the given path is a file. If it is, it returns an empty list. - Otherwise, it recursively searches through the directory and returns a list of all - files that are not the main file. - - Returns: - list[Path]: A list of Path objects representing the other files in the directory. 
- """ # noqa: E501 - if self.path.is_file(): - return [] - is_other_file = lambda file: file.is_file() and file != self.main_file - return [file for file in self.path.rglob("*") if is_other_file(file)] - - def determine_title(self) -> str: - return fix_case(self.path.stem.replace("_", " ").title()) - - def generate(self) -> str: - # Convert the path to a relative path from __file__ - make_relative = lambda path: ROOT_DIR_RELATIVE / path.relative_to( - ROOT_DIR) - - content = f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n" - include = "include" if self.main_file.suffix == ".md" else \ - "literalinclude" - if include == "literalinclude": - content += f"# {self.title}\n\n" - content += f":::{{{include}}} {make_relative(self.main_file)}\n" - if include == "literalinclude": - content += f":language: {self.main_file.suffix[1:]}\n" - content += ":::\n\n" - - if not self.other_files: - return content - - content += "## Example materials\n\n" - for file in sorted(self.other_files): - include = "include" if file.suffix == ".md" else "literalinclude" - content += f":::{{admonition}} {file.relative_to(self.path)}\n" - content += ":class: dropdown\n\n" - content += f":::{{{include}}} {make_relative(file)}\n:::\n" - content += ":::\n\n" - - return content - - -def generate_examples(): - # Create the EXAMPLE_DOC_DIR if it doesn't exist - if not EXAMPLE_DOC_DIR.exists(): - EXAMPLE_DOC_DIR.mkdir(parents=True) - - # Create empty indices - examples_index = Index( - path=EXAMPLE_DOC_DIR / "examples_index.md", - title="Examples", - description= - "A collection of examples demonstrating usage of vLLM.\nAll documented examples are autogenerated using <gh-file:docs/source/generate_examples.py> from examples found in <gh-file:examples>.", # noqa: E501 - caption="Examples", - maxdepth=2) - # Category indices stored in reverse order because they are inserted into - # examples_index.documents at index 0 in order - category_indices = { - "other": - Index( - path=EXAMPLE_DOC_DIR / 
"examples_other_index.md", - title="Other", - description= - "Other examples that don't strongly fit into the online or offline serving categories.", # noqa: E501 - caption="Examples", - ), - "online_serving": - Index( - path=EXAMPLE_DOC_DIR / "examples_online_serving_index.md", - title="Online Serving", - description= - "Online serving examples demonstrate how to use vLLM in an online setting, where the model is queried for predictions in real-time.", # noqa: E501 - caption="Examples", - ), - "offline_inference": - Index( - path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md", - title="Offline Inference", - description= - "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches. We recommend starting with <project:basic.md>.", # noqa: E501 - caption="Examples", - ), - } - - examples = [] - glob_patterns = ["*.py", "*.md", "*.sh"] - # Find categorised examples - for category in category_indices: - category_dir = EXAMPLE_DIR / category - globs = [category_dir.glob(pattern) for pattern in glob_patterns] - for path in itertools.chain(*globs): - examples.append(Example(path, category)) - # Find examples in subdirectories - for path in category_dir.glob("*/*.md"): - examples.append(Example(path.parent, category)) - # Find uncategorised examples - globs = [EXAMPLE_DIR.glob(pattern) for pattern in glob_patterns] - for path in itertools.chain(*globs): - examples.append(Example(path)) - # Find examples in subdirectories - for path in EXAMPLE_DIR.glob("*/*.md"): - # Skip categorised examples - if path.parent.name in category_indices: - continue - examples.append(Example(path.parent)) - - # Generate the example documentation - for example in sorted(examples, key=lambda e: e.path.stem): - doc_path = EXAMPLE_DOC_DIR / f"{example.path.stem}.md" - with open(doc_path, "w+") as f: - f.write(example.generate()) - # Add the example to the appropriate index - index = 
category_indices.get(example.category, examples_index) - index.documents.append(example.path.stem) - - # Generate the index files - for category_index in category_indices.values(): - if category_index.documents: - examples_index.documents.insert(0, category_index.path.name) - with open(category_index.path, "w+") as f: - f.write(category_index.generate()) - - with open(examples_index.path, "w+") as f: - f.write(examples_index.generate()) diff --git a/docs/source/getting_started/installation.md b/docs/source/getting_started/installation.md deleted file mode 100644 index 44134bf01b76c..0000000000000 --- a/docs/source/getting_started/installation.md +++ /dev/null @@ -1,28 +0,0 @@ -(installation-index)= - -# Installation - -vLLM supports the following hardware platforms: - -:::{toctree} -:maxdepth: 1 -:hidden: - -installation/gpu -installation/cpu -installation/ai_accelerator -::: - -- <project:installation/gpu.md> - - NVIDIA CUDA - - AMD ROCm - - Intel XPU -- <project:installation/cpu.md> - - Intel/AMD x86 - - ARM AArch64 - - Apple silicon - - IBM Z (S390X) -- <project:installation/ai_accelerator.md> - - Google TPU - - Intel Gaudi - - AWS Neuron diff --git a/docs/source/getting_started/installation/ai_accelerator.md b/docs/source/getting_started/installation/ai_accelerator.md deleted file mode 100644 index 0a207af1a4c75..0000000000000 --- a/docs/source/getting_started/installation/ai_accelerator.md +++ /dev/null @@ -1,299 +0,0 @@ -# Other AI accelerators - -vLLM is a Python library that supports the following AI accelerators. 
Select your AI accelerator type to see vendor specific instructions: - -:::::{tab-set} -:sync-group: device - -::::{tab-item} Google TPU -:selected: -:sync: tpu - -:::{include} ai_accelerator/tpu.inc.md -:start-after: "# Installation" -:end-before: "## Requirements" -::: - -:::: - -::::{tab-item} Intel Gaudi -:sync: hpu-gaudi - -:::{include} ai_accelerator/hpu-gaudi.inc.md -:start-after: "# Installation" -:end-before: "## Requirements" -::: - -:::: - -::::{tab-item} AWS Neuron -:sync: neuron - -:::{include} ai_accelerator/neuron.inc.md -:start-after: "# Installation" -:end-before: "## Requirements" -::: - -:::: - -::::: - -## Requirements - -:::::{tab-set} -:sync-group: device - -::::{tab-item} Google TPU -:sync: tpu - -:::{include} ai_accelerator/tpu.inc.md -:start-after: "## Requirements" -:end-before: "## Configure a new environment" -::: - -:::: - -::::{tab-item} Intel Gaudi -:sync: hpu-gaudi - -:::{include} ai_accelerator/hpu-gaudi.inc.md -:start-after: "## Requirements" -:end-before: "## Configure a new environment" -::: - -:::: - -::::{tab-item} AWS Neuron -:sync: neuron - -:::{include} ai_accelerator/neuron.inc.md -:start-after: "## Requirements" -:end-before: "## Configure a new environment" -::: - -:::: - -::::: - -## Configure a new environment - -:::::{tab-set} -:sync-group: device - -::::{tab-item} Google TPU -:sync: tpu - -:::{include} ai_accelerator/tpu.inc.md -:start-after: "## Configure a new environment" -:end-before: "## Set up using Python" -::: - -:::: - -::::{tab-item} Intel Gaudi -:sync: hpu-gaudi - -:::{include} ai_accelerator/hpu-gaudi.inc.md -:start-after: "## Configure a new environment" -:end-before: "## Set up using Python" -::: - -:::: - -::::{tab-item} AWS Neuron -:sync: neuron - -:::{include} ai_accelerator/neuron.inc.md -:start-after: "## Configure a new environment" -:end-before: "## Set up using Python" -::: - -:::: - -::::: - -## Set up using Python - -### Pre-built wheels - -:::::{tab-set} -:sync-group: device - -::::{tab-item} 
Google TPU -:sync: tpu - -:::{include} ai_accelerator/tpu.inc.md -:start-after: "### Pre-built wheels" -:end-before: "### Build wheel from source" -::: - -:::: - -::::{tab-item} Intel Gaudi -:sync: hpu-gaudi - -:::{include} ai_accelerator/hpu-gaudi.inc.md -:start-after: "### Pre-built wheels" -:end-before: "### Build wheel from source" -::: - -:::: - -::::{tab-item} AWS Neuron -:sync: neuron - -:::{include} ai_accelerator/neuron.inc.md -:start-after: "### Pre-built wheels" -:end-before: "### Build wheel from source" -::: - -:::: - -::::: - -### Build wheel from source - -:::::{tab-set} -:sync-group: device - -::::{tab-item} Google TPU -:sync: tpu - -:::{include} ai_accelerator/tpu.inc.md -:start-after: "### Build wheel from source" -:end-before: "## Set up using Docker" -::: - -:::: - -::::{tab-item} Intel Gaudi -:sync: hpu-gaudi - -:::{include} ai_accelerator/hpu-gaudi.inc.md -:start-after: "### Build wheel from source" -:end-before: "## Set up using Docker" -::: - -:::: - -::::{tab-item} AWS Neuron -:sync: neuron - -:::{include} ai_accelerator/neuron.inc.md -:start-after: "### Build wheel from source" -:end-before: "## Set up using Docker" -::: - -:::: - -::::: - -## Set up using Docker - -### Pre-built images - -:::::{tab-set} -:sync-group: device - -::::{tab-item} Google TPU -:sync: tpu - -:::{include} ai_accelerator/tpu.inc.md -:start-after: "### Pre-built images" -:end-before: "### Build image from source" -::: - -:::: - -::::{tab-item} Intel Gaudi -:sync: hpu-gaudi - -:::{include} ai_accelerator/hpu-gaudi.inc.md -:start-after: "### Pre-built images" -:end-before: "### Build image from source" -::: - -:::: - -::::{tab-item} AWS Neuron -:sync: neuron - -:::{include} ai_accelerator/neuron.inc.md -:start-after: "### Pre-built images" -:end-before: "### Build image from source" -::: - -:::: - -::::: - -### Build image from source - -:::::{tab-set} -:sync-group: device - -::::{tab-item} Google TPU -:sync: tpu - -:::{include} ai_accelerator/tpu.inc.md 
-:start-after: "### Build image from source" -:end-before: "## Extra information" -::: - -:::: - -::::{tab-item} Intel Gaudi -:sync: hpu-gaudi - -:::{include} ai_accelerator/hpu-gaudi.inc.md -:start-after: "### Build image from source" -:end-before: "## Extra information" -::: - -:::: - -::::{tab-item} AWS Neuron -:sync: neuron - -:::{include} ai_accelerator/neuron.inc.md -:start-after: "### Build image from source" -:end-before: "## Extra information" -::: - -:::: - -::::: - -## Extra information - -:::::{tab-set} -:sync-group: device - -::::{tab-item} Google TPU -:sync: tpu - -:::{include} ai_accelerator/tpu.inc.md -:start-after: "## Extra information" -::: - -:::: - -::::{tab-item} Intel Gaudi -:sync: hpu-gaudi - -:::{include} ai_accelerator/hpu-gaudi.inc.md -:start-after: "## Extra information" -::: - -:::: - -::::{tab-item} AWS Neuron -:sync: neuron - -:::{include} ai_accelerator/neuron.inc.md -:start-after: "## Extra information" -::: - -:::: - -::::: diff --git a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md deleted file mode 100644 index b4bfb696faa28..0000000000000 --- a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md +++ /dev/null @@ -1,139 +0,0 @@ -# Installation - -vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching. -Paged Attention and Chunked Prefill are currently in development and will be available soon. -Data types currently supported in Neuron SDK are FP16 and BF16. - -:::{attention} -There are no pre-built wheels or images for this device, so you must build vLLM from source. 
-::: - -## Requirements - -- OS: Linux -- Python: 3.9 -- 3.11 -- Accelerator: NeuronCore_v2 (in trn1/inf2 instances) -- PyTorch 2.0.1/2.1.1 -- AWS Neuron SDK 2.16/2.17 (Verified on python 3.8) - -## Configure a new environment - -### Launch Trn1/Inf2 instances - -Here are the steps to launch trn1/inf2 instances, in order to install [PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/pytorch/neuronx/ubuntu/torch-neuronx-ubuntu22.html). - -- Please follow the instructions at [launch an Amazon EC2 Instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-launch-instance) to launch an instance. When choosing the instance type at the EC2 console, please make sure to select the correct instance type. -- To get more information about instance sizes and pricing see: [Trn1 web page](https://aws.amazon.com/ec2/instance-types/trn1/), [Inf2 web page](https://aws.amazon.com/ec2/instance-types/inf2/) -- Select Ubuntu Server 22.04 LTS AMI -- When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB. -- After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance - -### Install drivers and tools - -Installing the drivers and tools is not necessary if [Deep Learning AMI Neuron](https://docs.aws.amazon.com/dlami/latest/devguide/appendix-ami-release-notes.html) is installed. In case the drivers and tools are not installed on the operating system, follow the steps below: - -```console -# Configure Linux for Neuron repository updates -. 
/etc/os-release -sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF -deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main -EOF -wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add - - -# Update OS packages -sudo apt-get update -y - -# Install OS headers -sudo apt-get install linux-headers-$(uname -r) -y - -# Install git -sudo apt-get install git -y - -# install Neuron Driver -sudo apt-get install aws-neuronx-dkms=2.* -y - -# Install Neuron Runtime -sudo apt-get install aws-neuronx-collectives=2.* -y -sudo apt-get install aws-neuronx-runtime-lib=2.* -y - -# Install Neuron Tools -sudo apt-get install aws-neuronx-tools=2.* -y - -# Add PATH -export PATH=/opt/aws/neuron/bin:$PATH -``` - -## Set up using Python - -### Pre-built wheels - -Currently, there are no pre-built Neuron wheels. - -### Build wheel from source - -:::{note} -The currently supported version of PyTorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. -::: - -The following instructions are applicable to Neuron SDK 2.16 and beyond. - -#### Install transformers-neuronx and its dependencies - -[transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx) will be the backend to support inference on trn1/inf2 instances. -Follow the steps below to install the transformers-neuronx package and its dependencies.
- -```console -# Install Python venv -sudo apt-get install -y python3.10-venv g++ - -# Create Python venv -python3.10 -m venv aws_neuron_venv_pytorch - -# Activate Python venv -source aws_neuron_venv_pytorch/bin/activate - -# Install Jupyter notebook kernel -pip install ipykernel -python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)" -pip install jupyter notebook -pip install environment_kernels - -# Set pip repository pointing to the Neuron repository -python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com - -# Install wget, awscli -python -m pip install wget -python -m pip install awscli - -# Update Neuron Compiler and Framework -python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx -``` - -#### Install vLLM from source - -Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows: - -```console -git clone https://github.com/vllm-project/vllm.git -cd vllm -pip install -U -r requirements/neuron.txt -VLLM_TARGET_DEVICE="neuron" pip install . -``` - -If neuron packages are detected correctly in the installation process, `vllm-0.3.0+neuron212` will be installed. - -## Set up using Docker - -### Pre-built images - -Currently, there are no pre-built Neuron images. - -### Build image from source - -See <project:#deployment-docker-build-image-from-source> for instructions on building the Docker image. - -Make sure to use <gh-file:docker/Dockerfile.neuron> in place of the default Dockerfile. - -## Extra information - -There is no extra information for this device. 
diff --git a/docs/source/getting_started/installation/cpu/arm.inc.md b/docs/source/getting_started/installation/cpu/arm.inc.md deleted file mode 100644 index e7d8d60630dc0..0000000000000 --- a/docs/source/getting_started/installation/cpu/arm.inc.md +++ /dev/null @@ -1,34 +0,0 @@ -# Installation - -vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. - -The ARM CPU backend currently supports Float32, FP16 and BFloat16 data types. - -:::{attention} -There are no pre-built wheels or images for this device, so you must build vLLM from source. -::: - -## Requirements - -- OS: Linux -- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended) -- Instruction Set Architecture (ISA): NEON support is required - -## Set up using Python - -### Pre-built wheels - -### Build wheel from source - -:::{include} cpu/build.inc.md -::: - -Testing has been conducted on AWS Graviton3 instances for compatibility. - -## Set up using Docker - -### Pre-built images - -### Build image from source - -## Extra information diff --git a/docs/source/getting_started/installation/cpu/x86.inc.md b/docs/source/getting_started/installation/cpu/x86.inc.md deleted file mode 100644 index 9ae2035db5433..0000000000000 --- a/docs/source/getting_started/installation/cpu/x86.inc.md +++ /dev/null @@ -1,41 +0,0 @@ -# Installation - -vLLM initially supports basic model inferencing and serving on the x86 CPU platform, with data types FP32, FP16 and BF16. - -:::{attention} -There are no pre-built wheels or images for this device, so you must build vLLM from source. -::: - -## Requirements - -- OS: Linux -- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended) -- Instruction Set Architecture (ISA): AVX512 (optional, recommended) - -:::{tip} -[Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date feature optimizations for an extra performance boost on Intel hardware.
-::: - -## Set up using Python - -### Pre-built wheels - -### Build wheel from source - -:::{include} cpu/build.inc.md -::: - -:::{note} -- AVX512_BF16 is an ISA extension that provides native BF16 data type conversion and vector product instructions, which bring some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. -- If you want to force-enable AVX512_BF16 for cross-compilation, please set the environment variable `VLLM_CPU_AVX512BF16=1` before building. -::: - -## Set up using Docker - -### Pre-built images - -See [https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo) - -### Build image from source - -## Extra information diff --git a/docs/source/getting_started/installation/gpu.md b/docs/source/getting_started/installation/gpu.md deleted file mode 100644 index 22db992354fb1..0000000000000 --- a/docs/source/getting_started/installation/gpu.md +++ /dev/null @@ -1,301 +0,0 @@ -# GPU - -vLLM is a Python library that supports the following GPU variants.
Select your GPU type to see vendor specific instructions: - -:::::{tab-set} -:sync-group: device - -::::{tab-item} NVIDIA CUDA -:selected: -:sync: cuda - -:::{include} gpu/cuda.inc.md -:start-after: "# Installation" -:end-before: "## Requirements" -::: - -:::: - -::::{tab-item} AMD ROCm -:sync: rocm - -:::{include} gpu/rocm.inc.md -:start-after: "# Installation" -:end-before: "## Requirements" -::: - -:::: - -::::{tab-item} Intel XPU -:sync: xpu - -:::{include} gpu/xpu.inc.md -:start-after: "# Installation" -:end-before: "## Requirements" -::: - -:::: - -::::: - -## Requirements - -- OS: Linux -- Python: 3.9 -- 3.12 - -:::::{tab-set} -:sync-group: device - -::::{tab-item} NVIDIA CUDA -:sync: cuda - -:::{include} gpu/cuda.inc.md -:start-after: "## Requirements" -:end-before: "## Set up using Python" -::: - -:::: - -::::{tab-item} AMD ROCm -:sync: rocm - -:::{include} gpu/rocm.inc.md -:start-after: "## Requirements" -:end-before: "## Set up using Python" -::: - -:::: - -::::{tab-item} Intel XPU -:sync: xpu - -:::{include} gpu/xpu.inc.md -:start-after: "## Requirements" -:end-before: "## Set up using Python" -::: - -:::: - -::::: - -## Set up using Python - -### Create a new Python environment - -:::{include} python_env_setup.inc.md -::: - -:::::{tab-set} -:sync-group: device - -::::{tab-item} NVIDIA CUDA -:sync: cuda - -:::{include} gpu/cuda.inc.md -:start-after: "## Create a new Python environment" -:end-before: "### Pre-built wheels" -::: - -:::: - -::::{tab-item} AMD ROCm -:sync: rocm - -There is no extra information on creating a new Python environment for this device. - -:::: - -::::{tab-item} Intel XPU -:sync: xpu - -There is no extra information on creating a new Python environment for this device. 
- -:::: - -::::: - -### Pre-built wheels - -:::::{tab-set} -:sync-group: device - -::::{tab-item} NVIDIA CUDA -:sync: cuda - -:::{include} gpu/cuda.inc.md -:start-after: "### Pre-built wheels" -:end-before: "### Build wheel from source" -::: - -:::: - -::::{tab-item} AMD ROCm -:sync: rocm - -:::{include} gpu/rocm.inc.md -:start-after: "### Pre-built wheels" -:end-before: "### Build wheel from source" -::: - -:::: - -::::{tab-item} Intel XPU -:sync: xpu - -:::{include} gpu/xpu.inc.md -:start-after: "### Pre-built wheels" -:end-before: "### Build wheel from source" -::: - -:::: - -::::: - -(build-from-source)= - -### Build wheel from source - -:::::{tab-set} -:sync-group: device - -::::{tab-item} NVIDIA CUDA -:sync: cuda - -:::{include} gpu/cuda.inc.md -:start-after: "### Build wheel from source" -:end-before: "## Set up using Docker" -::: - -:::: - -::::{tab-item} AMD ROCm -:sync: rocm - -:::{include} gpu/rocm.inc.md -:start-after: "### Build wheel from source" -:end-before: "## Set up using Docker" -::: - -:::: - -::::{tab-item} Intel XPU -:sync: xpu - -:::{include} gpu/xpu.inc.md -:start-after: "### Build wheel from source" -:end-before: "## Set up using Docker" -::: - -:::: - -::::: - -## Set up using Docker - -### Pre-built images - -:::::{tab-set} -:sync-group: device - -::::{tab-item} NVIDIA CUDA -:sync: cuda - -:::{include} gpu/cuda.inc.md -:start-after: "### Pre-built images" -:end-before: "### Build image from source" -::: - -:::: - -::::{tab-item} AMD ROCm -:sync: rocm - -:::{include} gpu/rocm.inc.md -:start-after: "### Pre-built images" -:end-before: "### Build image from source" -::: - -:::: - -::::{tab-item} Intel XPU -:sync: xpu - -:::{include} gpu/xpu.inc.md -:start-after: "### Pre-built images" -:end-before: "### Build image from source" -::: - -:::: - -::::: - -### Build image from source - -:::::{tab-set} -:sync-group: device - -::::{tab-item} NVIDIA CUDA -:sync: cuda - -:::{include} gpu/cuda.inc.md -:start-after: "### Build image from source" 
-:end-before: "## Supported features" -::: - -:::: - -::::{tab-item} AMD ROCm -:sync: rocm - -:::{include} gpu/rocm.inc.md -:start-after: "### Build image from source" -:end-before: "## Supported features" -::: - -:::: - -::::{tab-item} Intel XPU -:sync: xpu - -:::{include} gpu/xpu.inc.md -:start-after: "### Build image from source" -:end-before: "## Supported features" -::: - -:::: - -::::: - -## Supported features - -:::::{tab-set} -:sync-group: device - -::::{tab-item} NVIDIA CUDA -:sync: cuda - -:::{include} gpu/cuda.inc.md -:start-after: "## Supported features" -::: - -:::: - -::::{tab-item} AMD ROCm -:sync: rocm - -:::{include} gpu/rocm.inc.md -:start-after: "## Supported features" -::: - -:::: - -::::{tab-item} Intel XPU -:sync: xpu - -:::{include} gpu/xpu.inc.md -:start-after: "## Supported features" -::: - -:::: - -::::: diff --git a/docs/source/getting_started/installation/python_env_setup.inc.md b/docs/source/getting_started/installation/python_env_setup.inc.md deleted file mode 100644 index 00b61ea5c8264..0000000000000 --- a/docs/source/getting_started/installation/python_env_setup.inc.md +++ /dev/null @@ -1,19 +0,0 @@ -You can create a new Python environment using [conda](https://docs.conda.io/projects/conda/en/stable/user-guide/getting-started.html): - -```console -# (Recommended) Create a new conda environment. -conda create -n vllm python=3.12 -y -conda activate vllm -``` - -:::{note} -[PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create Python environment rather than installing packages. -::: - -Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. 
After installing `uv`, you can create a new Python environment using the following command: - -```console -# (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment. -uv venv --python 3.12 --seed -source .venv/bin/activate -``` diff --git a/docs/source/index.md b/docs/source/index.md deleted file mode 100644 index db2192e87dcf2..0000000000000 --- a/docs/source/index.md +++ /dev/null @@ -1,217 +0,0 @@ -# Welcome to vLLM - -:::{figure} ./assets/logos/vllm-logo-text-light.png -:align: center -:alt: vLLM -:class: no-scaled-link -:width: 60% -::: - -:::{raw} html -<p style="text-align:center"> -<strong>Easy, fast, and cheap LLM serving for everyone -</strong> -</p> - -<p style="text-align:center"> -<script async defer src="https://buttons.github.io/buttons.js"></script> -<a class="github-button" href="https://github.com/vllm-project/vllm" data-show-count="true" data-size="large" aria-label="Star">Star</a> -<a class="github-button" href="https://github.com/vllm-project/vllm/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a> -<a class="github-button" href="https://github.com/vllm-project/vllm/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a> -</p> -::: - -vLLM is a fast and easy-to-use library for LLM inference and serving. - -Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry. 
- -vLLM is fast with: - -- State-of-the-art serving throughput -- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) -- Continuous batching of incoming requests -- Fast model execution with CUDA/HIP graph -- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8 -- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer. -- Speculative decoding -- Chunked prefill - -vLLM is flexible and easy to use with: - -- Seamless integration with popular HuggingFace models -- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more -- Tensor parallelism and pipeline parallelism support for distributed inference -- Streaming outputs -- OpenAI-compatible API server -- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudiยฎ accelerators and GPUs, IBM Power CPUs, TPU, and AWS Trainium and Inferentia Accelerators. -- Prefix caching support -- Multi-lora support - -For more information, check out the following: - -- [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention) -- [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023) -- [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al. -- [vLLM Meetups](#meetups) - -## Documentation - -% How to start using vLLM? - -:::{toctree} -:caption: Getting Started -:maxdepth: 1 - -getting_started/installation -getting_started/quickstart -getting_started/examples/examples_index -getting_started/troubleshooting -getting_started/faq -getting_started/v1_user_guide - -::: - -% What does vLLM support? 
- -:::{toctree} -:caption: Models -:maxdepth: 1 - -models/supported_models -models/generative_models -models/pooling_models -models/extensions/index -::: - -% Additional capabilities - -:::{toctree} -:caption: Features -:maxdepth: 1 - -features/quantization/index -features/multimodal_inputs -features/prompt_embeds -features/lora -features/tool_calling -features/reasoning_outputs -features/structured_outputs -features/automatic_prefix_caching -features/disagg_prefill -features/spec_decode -features/compatibility_matrix -::: - -% Details about running vLLM - -:::{toctree} -:caption: Training -:maxdepth: 1 - -training/trl.md -training/rlhf.md - -::: - -:::{toctree} -:caption: Inference and Serving -:maxdepth: 1 - -serving/offline_inference -serving/openai_compatible_server -serving/serve_args -serving/distributed_serving -serving/metrics -serving/engine_args -serving/env_vars -serving/usage_stats -serving/integrations/index -::: - -% Scaling up vLLM for production - -:::{toctree} -:caption: Deployment -:maxdepth: 1 - -deployment/security -deployment/docker -deployment/k8s -deployment/nginx -deployment/frameworks/index -deployment/integrations/index -::: - -% Making the most out of vLLM - -:::{toctree} -:caption: Performance -:maxdepth: 1 - -performance/optimization -performance/benchmarks -::: - -% Explanation of vLLM internals - -:::{toctree} -:caption: Design Documents -:maxdepth: 2 - -design/arch_overview -design/huggingface_integration -design/plugin_system -design/kernel/paged_attention -design/mm_processing -design/automatic_prefix_caching -design/multiprocessing -::: - -:::{toctree} -:caption: V1 Design Documents -:maxdepth: 2 - -design/v1/torch_compile -design/v1/prefix_caching -design/v1/metrics -::: - -% How to contribute to the vLLM project - -:::{toctree} -:caption: Developer Guide -:maxdepth: 2 - -contributing/overview -contributing/deprecation_policy -contributing/profiling/profiling_index -contributing/dockerfile/dockerfile -contributing/model/index 
-contributing/vulnerability_management -::: - -% Technical API specifications - -:::{toctree} -:caption: API Reference -:maxdepth: 2 - -api/summary -api/vllm/vllm -::: - -% Latest news and acknowledgements - -:::{toctree} -:caption: Community -:maxdepth: 1 - -community/blog -community/meetups -community/sponsors -::: - -## Indices and tables - -- {ref}`genindex` -- {ref}`modindex` diff --git a/docs/source/models/extensions/index.md b/docs/source/models/extensions/index.md deleted file mode 100644 index cdcdaa5b35018..0000000000000 --- a/docs/source/models/extensions/index.md +++ /dev/null @@ -1,9 +0,0 @@ -# Built-in Extensions - -:::{toctree} -:maxdepth: 1 - -runai_model_streamer -tensorizer -fastsafetensor -::: diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md deleted file mode 100644 index 4d574216242b8..0000000000000 --- a/docs/source/models/supported_models.md +++ /dev/null @@ -1,1401 +0,0 @@ -(supported-models)= - -# Supported Models - -vLLM supports [generative](generative-models) and [pooling](pooling-models) models across various tasks. -If a model supports more than one task, you can set the task via the `--task` argument. - -For each task, we list the model architectures that have been implemented in vLLM. -Alongside each architecture, we include some popular models that use it. - -## Model Implementation - -### vLLM - -If vLLM natively supports a model, its implementation can be found in <gh-file:vllm/model_executor/models>. - -These models are what we list in <project:#supported-text-models> and <project:#supported-mm-models>. - -(transformers-backend)= - -### Transformers - -vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models are supported, and vision language model support is planned! 
- -To check if the modeling backend is Transformers, you can simply do this: - -```python -from vllm import LLM -llm = LLM(model=..., task="generate") # Name or path of your model -llm.apply_model(lambda model: print(type(model))) -``` - -If it is `TransformersForCausalLM` then it means it's based on Transformers! - -:::{tip} -You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for <project:#offline-inference> or `--model-impl transformers` for the <project:#openai-compatible-server>. -::: - -:::{note} -vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM. -::: - -#### Custom models - -If a model is neither supported natively by vLLM or Transformers, it can still be used in vLLM! - -For a model to be compatible with the Transformers backend for vLLM it must: - -- be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)): - * The model directory must have the correct structure (e.g. `config.json` is present). - * `config.json` must contain `auto_map.AutoModel`. -- be a Transformers backend for vLLM compatible model (see <project:#writing-custom-models>): - * Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`). - -If the compatible model is: - -- on the Hugging Face Model Hub, simply set `trust_remote_code=True` for <project:#offline-inference> or `--trust-remote-code` for the <project:#openai-compatible-server>. -- in a local directory, simply pass directory path to `model=<MODEL_DIR>` for <project:#offline-inference> or `vllm serve <MODEL_DIR>` for the <project:#openai-compatible-server>. - -This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM! 
- -(writing-custom-models)= - -#### Writing custom models - -This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)). - -To make your model compatible with the Transformers backend, it needs: - -1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`. -2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention. -3. `MyModel` must contain `_supports_attention_backend = True`. - -```{code-block} python -:caption: modeling_my_model.py - -from transformers import PreTrainedModel -from torch import nn - -class MyAttention(nn.Module): - - def forward(self, hidden_states, **kwargs): - ... - attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] - attn_output, attn_weights = attention_interface( - self, - query_states, - key_states, - value_states, - **kwargs, - ) - ... - -class MyModel(PreTrainedModel): - _supports_attention_backend = True -``` - -Here is what happens in the background when this model is loaded: - -1. The config is loaded. -2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`. -3. `MyModel` is loaded into `TransformersForCausalLM` (see <gh-file:vllm/model_executor/models/transformers.py>) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used. - -That's it! 
- -For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class: - -```{code-block} python -:caption: configuration_my_model.py - -from transformers import PretrainedConfig - -class MyConfig(PretrainedConfig): - base_model_tp_plan = { - "layers.*.self_attn.k_proj": "colwise", - "layers.*.self_attn.v_proj": "colwise", - "layers.*.self_attn.o_proj": "rowwise", - "layers.*.mlp.gate_proj": "colwise", - "layers.*.mlp.up_proj": "colwise", - "layers.*.mlp.down_proj": "rowwise", - } - base_model_pp_plan = { - "embed_tokens": (["input_ids"], ["inputs_embeds"]), - "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), - "norm": (["hidden_states"], ["hidden_states"]), - } -``` - -- `base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported). -- `base_model_pp_plan` is a `dict` that maps direct child layer names to `tuple`s of `list`s of `str`s: - * You only need to do this for layers which are not present on all pipeline stages - * vLLM assumes that there will be only one `nn.ModuleList`, which is distributed across the pipeline stages - * The `list` in the first element of the `tuple` contains the names of the input arguments - * The `list` in the last element of the `tuple` contains the names of the variables the layer outputs to in your modeling code - -## Loading a Model - -### Hugging Face Hub - -By default, vLLM loads models from [Hugging Face (HF) Hub](https://huggingface.co/models). To change the download path for models, you can set the `HF_HOME` environment variable; for more details, refer to [their official documentation](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhome). 
- -To determine whether a given model is natively supported, you can check the `config.json` file inside the HF repository. -If the `"architectures"` field contains a model architecture listed below, then it should be natively supported. - -Models do not _need_ to be natively supported to be used in vLLM. -The [Transformers backend](#transformers-backend) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!). - -:::{tip} -The easiest way to check if your model is really supported at runtime is to run the program below: - -```python -from vllm import LLM - -# For generative models (task=generate) only -llm = LLM(model=..., task="generate") # Name or path of your model -output = llm.generate("Hello, my name is") -print(output) - -# For pooling models (task={embed,classify,reward,score}) only -llm = LLM(model=..., task="embed") # Name or path of your model -output = llm.encode("Hello, my name is") -print(output) -``` - -If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. -::: - -Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM. -Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support. 
- -#### Download a model - -If you prefer, you can use the Hugging Face CLI to [download a model](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-download) or specific files from a model repository: - -```console -# Download a model -huggingface-cli download HuggingFaceH4/zephyr-7b-beta - -# Specify a custom cache directory -huggingface-cli download HuggingFaceH4/zephyr-7b-beta --cache-dir ./path/to/cache - -# Download a specific file from a model repo -huggingface-cli download HuggingFaceH4/zephyr-7b-beta eval_results.json -``` - -#### List the downloaded models - -Use the Hugging Face CLI to [manage models](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#scan-your-cache) stored in local cache: - -```console -# List cached models -huggingface-cli scan-cache - -# Show detailed (verbose) output -huggingface-cli scan-cache -v - -# Specify a custom cache directory -huggingface-cli scan-cache --dir ~/.cache/huggingface/hub -``` - -#### Delete a cached model - -Use the Hugging Face CLI to interactively [delete downloaded model](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#clean-your-cache) from the cache: - -```console -# The `delete-cache` command requires extra dependencies to work with the TUI. -# Please run `pip install huggingface_hub[cli]` to install them. - -# Launch the interactive TUI to select models to delete -$ huggingface-cli delete-cache -? Select revisions to delete: 1 revisions selected counting for 438.9M. - โ—‹ None of the following (if selected, nothing will be deleted). -Model BAAI/bge-base-en-v1.5 (438.9M, used 1 week ago) -โฏ โ—‰ a5beb1e3: main # modified 1 week ago - -Model BAAI/bge-large-en-v1.5 (1.3G, used 1 week ago) - โ—‹ d4aa6901: main # modified 1 week ago - -Model BAAI/bge-reranker-base (1.1G, used 4 weeks ago) - โ—‹ 2cfc18c9: main # modified 4 weeks ago - -Press <space> to select, <enter> to validate and <ctrl+c> to quit without modification. - -# Need to confirm after selected -? 
Select revisions to delete: 1 revision(s) selected. -? 1 revisions selected counting for 438.9M. Confirm deletion ? Yes -Start deletion. -Done. Deleted 1 repo(s) and 0 revision(s) for a total of 438.9M. -``` - -#### Using a proxy - -Here are some tips for loading/downloading models from Hugging Face using a proxy: - -- Set the proxy globally for your session (or set it in the profile file): - -```shell -export http_proxy=http://your.proxy.server:port -export https_proxy=http://your.proxy.server:port -``` - -- Set the proxy for just the current command: - -```shell -https_proxy=http://your.proxy.server:port huggingface-cli download <model_name> - -# or use vllm cmd directly -https_proxy=http://your.proxy.server:port vllm serve <model_name> --disable-log-requests -``` - -- Set the proxy in Python interpreter: - -```python -import os - -os.environ['http_proxy'] = 'http://your.proxy.server:port' -os.environ['https_proxy'] = 'http://your.proxy.server:port' -``` - -### ModelScope - -To use models from [ModelScope](https://www.modelscope.cn) instead of Hugging Face Hub, set an environment variable: - -```shell -export VLLM_USE_MODELSCOPE=True -``` - -And use with `trust_remote_code=True`. - -```python -from vllm import LLM - -llm = LLM(model=..., revision=..., task=..., trust_remote_code=True) - -# For generative models (task=generate) only -output = llm.generate("Hello, my name is") -print(output) - -# For pooling models (task={embed,classify,reward,score}) only -output = llm.encode("Hello, my name is") -print(output) -``` - -(feature-status-legend)= - -## Feature Status Legend - -- โœ…๏ธŽ indicates that the feature is supported for the model. - -- ๐Ÿšง indicates that the feature is planned but not yet supported for the model. - -- โš ๏ธ indicates that the feature is available but may have known issues or limitations. 
- -(supported-text-models)= - -## List of Text-only Language Models - -### Generative Models - -See [this page](#generative-models) for more information on how to use generative models. - -#### Text Generation - -Specified using `--task generate`. - -:::{list-table} -:widths: 25 25 50 5 5 -:header-rows: 1 - -- * Architecture - * Models - * Example HF Models - * [LoRA](#lora-adapter) - * [PP](#distributed-serving) -- * `AquilaForCausalLM` - * Aquila, Aquila2 - * `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `ArcticForCausalLM` - * Arctic - * `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. - * - * โœ…๏ธŽ -- * `BaiChuanForCausalLM` - * Baichuan2, Baichuan - * `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `BambaForCausalLM` - * Bamba - * `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` - * - * -- * `BloomForCausalLM` - * BLOOM, BLOOMZ, BLOOMChat - * `bigscience/bloom`, `bigscience/bloomz`, etc. - * - * โœ…๏ธŽ -- * `BartForConditionalGeneration` - * BART - * `facebook/bart-base`, `facebook/bart-large-cnn`, etc. - * - * -- * `ChatGLMModel`, `ChatGLMForConditionalGeneration` - * ChatGLM - * `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `CohereForCausalLM`, `Cohere2ForCausalLM` - * Command-R - * `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `DbrxForCausalLM` - * DBRX - * `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. - * - * โœ…๏ธŽ -- * `DeciLMForCausalLM` - * DeciLM - * `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. - * - * โœ…๏ธŽ -- * `DeepseekForCausalLM` - * DeepSeek - * `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc. - * - * โœ…๏ธŽ -- * `DeepseekV2ForCausalLM` - * DeepSeek-V2 - * `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc. 
- * - * โœ…๏ธŽ -- * `DeepseekV3ForCausalLM` - * DeepSeek-V3 - * `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc. - * - * โœ…๏ธŽ -- * `ExaoneForCausalLM` - * EXAONE-3 - * `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `FalconForCausalLM` - * Falcon - * `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. - * - * โœ…๏ธŽ -- * `FalconMambaForCausalLM` - * FalconMamba - * `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `GemmaForCausalLM` - * Gemma - * `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `Gemma2ForCausalLM` - * Gemma 2 - * `google/gemma-2-9b`, `google/gemma-2-27b`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `Gemma3ForCausalLM` - * Gemma 3 - * `google/gemma-3-1b-it`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `GlmForCausalLM` - * GLM-4 - * `THUDM/glm-4-9b-chat-hf`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `Glm4ForCausalLM` - * GLM-4-0414 - * `THUDM/GLM-4-32B-0414`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `GPT2LMHeadModel` - * GPT-2 - * `gpt2`, `gpt2-xl`, etc. - * - * โœ…๏ธŽ -- * `GPTBigCodeForCausalLM` - * StarCoder, SantaCoder, WizardCoder - * `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `GPTJForCausalLM` - * GPT-J - * `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. - * - * โœ…๏ธŽ -- * `GPTNeoXForCausalLM` - * GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM - * `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. - * - * โœ…๏ธŽ -- * `GraniteForCausalLM` - * Granite 3.0, Granite 3.1, PowerLM - * `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. 
- * โœ…๏ธŽ - * โœ…๏ธŽ -- * `GraniteMoeForCausalLM` - * Granite 3.0 MoE, PowerMoE - * `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `GraniteMoeHybridForCausalLM` - * Granite 4.0 MoE Hybrid - * `ibm-granite/granite-4.0-tiny-preview`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `GraniteMoeSharedForCausalLM` - * Granite MoE Shared - * `ibm-research/moe-7b-1b-active-shared-experts` (test model) - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `GritLM` - * GritLM - * `parasail-ai/GritLM-7B-vllm`. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `Grok1ModelForCausalLM` - * Grok1 - * `hpcai-tech/grok-1`. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `InternLMForCausalLM` - * InternLM - * `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `InternLM2ForCausalLM` - * InternLM2 - * `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `InternLM3ForCausalLM` - * InternLM3 - * `internlm/internlm3-8b-instruct`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `JAISLMHeadModel` - * Jais - * `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. - * - * โœ…๏ธŽ -- * `JambaForCausalLM` - * Jamba - * `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `LlamaForCausalLM` - * Llama 3.1, Llama 3, Llama 2, LLaMA, Yi - * `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `MambaForCausalLM` - * Mamba - * `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. - * - * โœ…๏ธŽ -- * `MiniCPMForCausalLM` - * MiniCPM - * `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `MiniCPM3ForCausalLM` - * MiniCPM3 - * `openbmb/MiniCPM3-4B`, etc. 
- * โœ…๏ธŽ - * โœ…๏ธŽ -- * `MistralForCausalLM` - * Mistral, Mistral-Instruct - * `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `MixtralForCausalLM` - * Mixtral-8x7B, Mixtral-8x7B-Instruct - * `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `MPTForCausalLM` - * MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter - * `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. - * - * โœ…๏ธŽ -- * `NemotronForCausalLM` - * Nemotron-3, Nemotron-4, Minitron - * `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `OLMoForCausalLM` - * OLMo - * `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. - * - * โœ…๏ธŽ -- * `OLMo2ForCausalLM` - * OLMo2 - * `allenai/OLMo-2-0425-1B`, etc. - * - * โœ…๏ธŽ -- * `OLMoEForCausalLM` - * OLMoE - * `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `OPTForCausalLM` - * OPT, OPT-IML - * `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. - * - * โœ…๏ธŽ -- * `OrionForCausalLM` - * Orion - * `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. - * - * โœ…๏ธŽ -- * `PhiForCausalLM` - * Phi - * `microsoft/phi-1_5`, `microsoft/phi-2`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `Phi3ForCausalLM` - * Phi-4, Phi-3 - * `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `Phi3SmallForCausalLM` - * Phi-3-Small - * `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. - * - * โœ…๏ธŽ -- * `PhiMoEForCausalLM` - * Phi-3.5-MoE - * `microsoft/Phi-3.5-MoE-instruct`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `PersimmonForCausalLM` - * Persimmon - * `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. 
- * - * โœ…๏ธŽ -- * `Plamo2ForCausalLM` - * PLaMo2 - * `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. - * - * -- * `QWenLMHeadModel` - * Qwen - * `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `Qwen2ForCausalLM` - * QwQ, Qwen2 - * `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `Qwen2MoeForCausalLM` - * Qwen2MoE - * `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. - * - * โœ…๏ธŽ -- * `Qwen3ForCausalLM` - * Qwen3 - * `Qwen/Qwen3-8B`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `Qwen3MoeForCausalLM` - * Qwen3MoE - * `Qwen/Qwen3-30B-A3B`, etc. - * - * โœ…๏ธŽ -- * `StableLmForCausalLM` - * StableLM - * `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. - * - * โœ…๏ธŽ -- * `Starcoder2ForCausalLM` - * Starcoder2 - * `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. - * - * โœ…๏ธŽ -- * `SolarForCausalLM` - * Solar Pro - * `upstage/solar-pro-preview-instruct`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `TeleChat2ForCausalLM` - * TeleChat2 - * `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `TeleFLMForCausalLM` - * TeleFLM - * `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `XverseForCausalLM` - * XVERSE - * `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `MiniMaxText01ForCausalLM` - * MiniMax-Text - * `MiniMaxAI/MiniMax-Text-01`, etc. - * - * โœ…๏ธŽ -- * `Zamba2ForCausalLM` - * Zamba2 - * `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. - * - * -- * `MiMoForCausalLM` - * MiMo - * `XiaomiMiMo/MiMo-7B-RL`, etc. - * - * -::: - -:::{note} -Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. -::: - -### Pooling Models - -See [this page](pooling-models) for more information on how to use pooling models. 
- -:::{important} -Since some model architectures support both generative and pooling tasks, -you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. -::: - -#### Text Embedding - -Specified using `--task embed`. - -:::{list-table} -:widths: 25 25 50 5 5 -:header-rows: 1 - -- * Architecture - * Models - * Example HF Models - * [LoRA](#lora-adapter) - * [PP](#distributed-serving) -- * `BertModel` - * BERT-based - * `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. - * - * -- * `Gemma2Model` - * Gemma 2-based - * `BAAI/bge-multilingual-gemma2`, etc. - * - * โœ…๏ธŽ -- * `GritLM` - * GritLM - * `parasail-ai/GritLM-7B-vllm`. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `GteModel` - * Arctic-Embed-2.0-M - * `Snowflake/snowflake-arctic-embed-m-v2.0`. - * - * ๏ธŽ -- * `GteNewModel` - * mGTE-TRM (see note) - * `Alibaba-NLP/gte-multilingual-base`, etc. - * ๏ธŽ - * ๏ธŽ -- * `ModernBertModel` - * ModernBERT-based - * `Alibaba-NLP/gte-modernbert-base`, etc. - * ๏ธŽ - * ๏ธŽ -- * `NomicBertModel` - * Nomic BERT - * `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. - * ๏ธŽ - * ๏ธŽ -- * `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. - * Llama-based - * `intfloat/e5-mistral-7b-instruct`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `Qwen2Model`, `Qwen2ForCausalLM` - * Qwen2-based - * `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `RobertaModel`, `RobertaForMaskedLM` - * RoBERTa-based - * `sentence-transformers/all-roberta-large-v1`, etc. - * - * -- * `XLMRobertaModel` - * XLM-RoBERTa-based - * `intfloat/multilingual-e5-large`, `jinaai/jina-reranker-v2-base-multilingual`, `Snowflake/snowflake-arctic-embed-l-v2.0`, `jinaai/jina-embeddings-v3`(see note), etc. - * - * -::: - -:::{note} -`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. 
-You should manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`. -::: - -:::{note} -The HF implementation of `Alibaba-NLP/gte-Qwen2-1.5B-instruct` is hardcoded to use causal attention despite what is shown in `config.json`. To compare vLLM vs HF results, -you should set `--hf-overrides '{"is_causal": true}'` in vLLM so that the two implementations are consistent with each other. - -For both the 1.5B and 7B variants, you also need to enable `--trust-remote-code` for the correct tokenizer to be loaded. -See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882). -::: - -:::{note} -`jinaai/jina-embeddings-v3` supports multiple tasks through lora, while vllm temporarily only supports text-matching tasks by merging lora weights. -::: - -:::{note} -The second-generation GTE model (mGTE-TRM) is named `NewModel`. The name `NewModel` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewModel"]}'` to specify the use of the `GteNewModel` architecture. -::: - -If your model is not in the above list, we will try to automatically convert the model using -{func}`~vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings -of the whole prompt are extracted from the normalized hidden state corresponding to the last token. - -#### Reward Modeling - -Specified using `--task reward`. - -:::{list-table} -:widths: 25 25 50 5 5 -:header-rows: 1 - -- * Architecture - * Models - * Example HF Models - * [LoRA](#lora-adapter) - * [PP](#distributed-serving) -- * `InternLM2ForRewardModel` - * InternLM2-based - * `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `LlamaForCausalLM` - * Llama-based - * `peiyi9979/math-shepherd-mistral-7b-prm`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `Qwen2ForRewardModel` - * Qwen2-based - * `Qwen/Qwen2.5-Math-RM-72B`, etc. 
- * ✅︎ - * ✅︎ -- * `Qwen2ForProcessRewardModel` - * Qwen2-based - * `Qwen/Qwen2.5-Math-PRM-7B`, `Qwen/Qwen2.5-Math-PRM-72B`, etc. - * ✅︎ - * ✅︎ -::: - -If your model is not in the above list, we will try to automatically convert the model using -{func}`~vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly. - -:::{important} -For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, -e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. -::: - -#### Classification - -Specified using `--task classify`. - -:::{list-table} -:widths: 25 25 50 5 5 -:header-rows: 1 - -- * Architecture - * Models - * Example HF Models - * [LoRA](#lora-adapter) - * [PP](#distributed-serving) -- * `JambaForSequenceClassification` - * Jamba - * `ai21labs/Jamba-tiny-reward-dev`, etc. - * ✅︎ - * ✅︎ -- * `Qwen2ForSequenceClassification` - * Qwen2-based - * `jason9693/Qwen2.5-1.5B-apeach`, etc. - * ✅︎ - * ✅︎ -::: - -If your model is not in the above list, we will try to automatically convert the model using -{func}`~vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. - -#### Sentence Pair Scoring - -Specified using `--task score`. - -:::{list-table} -:widths: 25 25 50 5 5 -:header-rows: 1 - -- * Architecture - * Models - * Example HF Models - * [LoRA](#lora-adapter) - * [PP](#distributed-serving) -- * `BertForSequenceClassification` - * BERT-based - * `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. - * - * -- * `RobertaForSequenceClassification` - * RoBERTa-based - * `cross-encoder/quora-roberta-base`, etc. - * - * -- * `XLMRobertaForSequenceClassification` - * XLM-RoBERTa-based - * `BAAI/bge-reranker-v2-m3`, etc. 
- * - * -- * `ModernBertForSequenceClassification` - * ModernBert-based - * `Alibaba-NLP/gte-reranker-modernbert-base`, etc. - * - * -::: - -(supported-mm-models)= - -## List of Multimodal Language Models - -The following modalities are supported depending on the model: - -- **T**ext -- **I**mage -- **V**ideo -- **A**udio - -Any combination of modalities joined by `+` are supported. - -- e.g.: `T + I` means that the model supports text-only, image-only, and text-with-image inputs. - -On the other hand, modalities separated by `/` are mutually exclusive. - -- e.g.: `T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. - -See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model. - -:::{important} -**To enable multiple multi-modal items per text prompt in vLLM V0**, you have to set `limit_mm_per_prompt` (offline inference) -or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt: - -Offline inference: - -```python -from vllm import LLM - -llm = LLM( - model="Qwen/Qwen2-VL-7B-Instruct", - limit_mm_per_prompt={"image": 4}, -) -``` - -Online serving: - -```bash -vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt '{"image":4}' -``` - -**This is no longer required if you are using vLLM V1.** - -::: - -:::{note} -vLLM currently only supports adding LoRA to the language backbone of multimodal models. -::: - -### Generative Models - -See [this page](#generative-models) for more information on how to use generative models. - -#### Text Generation - -Specified using `--task generate`. 
- -:::{list-table} -:widths: 25 25 15 20 5 5 5 -:header-rows: 1 - -- * Architecture - * Models - * Inputs - * Example HF Models - * [LoRA](#lora-adapter) - * [PP](#distributed-serving) - * [V1](gh-issue:8779) -- * `AriaForConditionalGeneration` - * Aria - * T + I<sup>+</sup> - * `rhymes-ai/Aria` - * - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `AyaVisionForConditionalGeneration` - * Aya Vision - * T + I<sup>+</sup> - * `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. - * - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `Blip2ForConditionalGeneration` - * BLIP-2 - * T + I<sup>E</sup> - * `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. - * - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `ChameleonForConditionalGeneration` - * Chameleon - * T + I - * `facebook/chameleon-7b` etc. - * - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `DeepseekVLV2ForCausalLM`<sup>^</sup> - * DeepSeek-VL2 - * T + I<sup>+</sup> - * `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. - * - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `Florence2ForConditionalGeneration` - * Florence-2 - * T + I - * `microsoft/Florence-2-base`, `microsoft/Florence-2-large` etc. - * - * - * -- * `FuyuForCausalLM` - * Fuyu - * T + I - * `adept/fuyu-8b` etc. - * - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `Gemma3ForConditionalGeneration` - * Gemma 3 - * T + I<sup>+</sup> - * `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ - * โš ๏ธ -- * `GLM4VForCausalLM`<sup>^</sup> - * GLM-4V - * T + I - * `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220` etc. - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `GraniteSpeechForConditionalGeneration` - * Granite Speech - * T + A - * `ibm-granite/granite-speech-3.3-8b` - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `H2OVLChatModel` - * H2OVL - * T + I<sup>E+</sup> - * `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. - * - * โœ…๏ธŽ - * โœ…๏ธŽ\* -- * `Idefics3ForConditionalGeneration` - * Idefics3 - * T + I - * `HuggingFaceM4/Idefics3-8B-Llama3` etc. 
- * โœ…๏ธŽ - * - * โœ…๏ธŽ -- * `InternVLChatModel` - * InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 - * T + I<sup>E+</sup> - * `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. - * - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `KimiVLForConditionalGeneration` - * Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking - * T + I<sup>+</sup> - * `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` - * - * - * โœ…๏ธŽ -- * `Llama4ForConditionalGeneration` - * Llama 4 - * T + I<sup>+</sup> - * `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. - * - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `LlavaForConditionalGeneration` - * LLaVA-1.5 - * T + I<sup>E+</sup> - * `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. - * - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `LlavaNextForConditionalGeneration` - * LLaVA-NeXT - * T + I<sup>E+</sup> - * `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - * - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `LlavaNextVideoForConditionalGeneration` - * LLaVA-NeXT-Video - * T + V - * `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. - * - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `LlavaOnevisionForConditionalGeneration` - * LLaVA-Onevision - * T + I<sup>+</sup> + V<sup>+</sup> - * `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. - * - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `MiniCPMO` - * MiniCPM-O - * T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup> - * `openbmb/MiniCPM-o-2_6`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `MiniCPMV` - * MiniCPM-V - * T + I<sup>E+</sup> + V<sup>E+</sup> - * `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. 
- * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `MiniMaxVL01ForConditionalGeneration` - * MiniMax-VL - * T + I<sup>E+</sup> - * `MiniMaxAI/MiniMax-VL-01`, etc. - * - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `Mistral3ForConditionalGeneration` - * Mistral3 - * T + I<sup>+</sup> - * `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `MllamaForConditionalGeneration` - * Llama 3.2 - * T + I<sup>+</sup> - * `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. - * - * - * -- * `MolmoForCausalLM` - * Molmo - * T + I<sup>+</sup> - * `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `NVLM_D_Model` - * NVLM-D 1.0 - * T + I<sup>+</sup> - * `nvidia/NVLM-D-72B`, etc. - * - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `Ovis` - * Ovis2, Ovis1.6 - * T + I<sup>+</sup> - * `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. - * - * - * โœ…๏ธŽ -- * `PaliGemmaForConditionalGeneration` - * PaliGemma, PaliGemma 2 - * T + I<sup>E</sup> - * `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. - * - * โœ…๏ธŽ - * โš ๏ธ -- * `Phi3VForCausalLM` - * Phi-3-Vision, Phi-3.5-Vision - * T + I<sup>E+</sup> - * `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. - * - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `Phi4MMForCausalLM` - * Phi-4-multimodal - * T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> - * `microsoft/Phi-4-multimodal-instruct`, etc. - * โœ…๏ธŽ - * - * โœ…๏ธŽ -- * `PixtralForConditionalGeneration` - * Pixtral - * T + I<sup>+</sup> - * `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc. - * - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `QwenVLForConditionalGeneration`<sup>^</sup> - * Qwen-VL - * T + I<sup>E+</sup> - * `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. 
- * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `Qwen2AudioForConditionalGeneration` - * Qwen2-Audio - * T + A<sup>+</sup> - * `Qwen/Qwen2-Audio-7B-Instruct` - * - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `Qwen2VLForConditionalGeneration` - * QVQ, Qwen2-VL - * T + I<sup>E+</sup> + V<sup>E+</sup> - * `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `Qwen2_5_VLForConditionalGeneration` - * Qwen2.5-VL - * T + I<sup>E+</sup> + V<sup>E+</sup> - * `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `Qwen2_5OmniThinkerForConditionalGeneration` - * Qwen2.5-Omni - * T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> - * `Qwen/Qwen2.5-Omni-7B` - * - * โœ…๏ธŽ - * โœ…๏ธŽ\* -- * `SkyworkR1VChatModel` - * Skywork-R1V-38B - * T + I - * `Skywork/Skywork-R1V-38B` - * - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `SmolVLMForConditionalGeneration` - * SmolVLM2 - * T + I - * `SmolVLM2-2.2B-Instruct` - * - * โœ…๏ธŽ - * โœ…๏ธŽ -- * `UltravoxModel` - * Ultravox - * T + A<sup>E+</sup> - * `fixie-ai/ultravox-v0_5-llama-3_2-1b` - * โœ…๏ธŽ - * โœ…๏ธŽ - * โœ…๏ธŽ -::: - -<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM. -    โ€ข For example, to use DeepSeek-VL2 series models: -      `--hf-overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` -<sup>E</sup> Pre-computed embeddings can be inputted for this modality. -<sup>+</sup> Multiple items can be inputted per text prompt for this modality. - -:::{warning} -Both V0 and V1 support `Gemma3ForConditionalGeneration` for text-only inputs. 
-However, there are differences in how they handle text + image inputs: - -V0 correctly implements the model's attention pattern: -- Uses bidirectional attention between the image tokens corresponding to the same image -- Uses causal attention for other tokens -- Implemented via (naive) PyTorch SDPA with masking tensors -- Note: May use significant memory for long prompts with image - -V1 currently uses a simplified attention pattern: -- Uses causal attention for all tokens, including image tokens -- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": true}` -- Will be updated in the future to support the correct behavior - -This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends. -::: - -:::{note} -`h2oai/h2ovl-mississippi-2b` will be available in V1 once we support head size 80. -::: - -:::{note} -To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. -::: - -:::{warning} -The output quality of `AllenAI/Molmo-7B-D-0924` (especially in object localization tasks) has deteriorated in recent updates. 
- -For the best results, we recommend using the following dependency versions (tested on A10 and L40): - -```text -# Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40) -torch==2.5.1 -torchvision==0.20.1 -transformers==4.48.1 -tokenizers==0.21.0 -tiktoken==0.7.0 -vllm==0.7.0 - -# Optional but recommended for improved performance and stability -triton==3.1.0 -xformers==0.0.28.post3 -uvloop==0.21.0 -protobuf==5.29.3 -openai==1.60.2 -opencv-python-headless==4.11.0.86 -pillow==10.4.0 - -# Installed FlashAttention (for float16 only) -flash-attn>=2.5.6 # Not used in float32, but should be documented -``` - -**Note:** Make sure you understand the security implications of using outdated packages. -::: - -:::{note} -The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now. -For more details, please see: <gh-pr:4087#issuecomment-2250397630> -::: - -:::{warning} -Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1. -::: - -:::{note} -To use Qwen2.5-Omni, you have to install Hugging Face Transformers library from source via -`pip install git+https://github.com/huggingface/transformers.git`. - -Read audio from video pre-processing is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1. -`--mm-processor-kwargs '{"use_audio_in_video": true}'`. -::: - -### Pooling Models - -See [this page](pooling-models) for more information on how to use pooling models. - -:::{important} -Since some model architectures support both generative and pooling tasks, -you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. -::: - -#### Text Embedding - -Specified using `--task embed`. - -Any text generation model can be converted into an embedding model by passing `--task embed`. 
- -:::{note} -To get the best results, you should use pooling models that are specifically trained as such. -::: - -The following table lists those that are tested in vLLM. - -:::{list-table} -:widths: 25 25 15 25 5 5 -:header-rows: 1 - -- * Architecture - * Models - * Inputs - * Example HF Models - * [LoRA](#lora-adapter) - * [PP](#distributed-serving) -- * `LlavaNextForConditionalGeneration` - * LLaVA-NeXT-based - * T / I - * `royokong/e5-v` - * - * ✅︎ -- * `Phi3VForCausalLM` - * Phi-3-Vision-based - * T + I - * `TIGER-Lab/VLM2Vec-Full` - * 🚧 - * ✅︎ -- * `Qwen2VLForConditionalGeneration` - * Qwen2-VL-based - * T + I - * `MrLight/dse-qwen2-2b-mrl-v1` - * - * ✅︎ -::: - -#### Transcription - -Specified using `--task transcription`. - -Speech2Text models trained specifically for Automatic Speech Recognition. - -:::{list-table} -:widths: 25 25 25 5 5 -:header-rows: 1 - -- * Architecture - * Models - * Example HF Models - * [LoRA](#lora-adapter) - * [PP](#distributed-serving) -- * `Whisper` - * Whisper-based - * `openai/whisper-large-v3-turbo` - * 🚧 - * 🚧 -::: - -_________________ - -## Model Support Policy - -At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support: - -1. **Community-Driven Support**: We encourage community contributions for adding new models. When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. **Call for contribution:** PRs coming directly from model vendors are greatly appreciated! - -2. 
**Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. - - :::{tip} - When comparing the output of `model.generate` from Hugging Face Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. - ::: - -3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. - -4. **Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use. - -5. **Selective Focus**: Our resources are primarily directed towards models with significant user interest and impact. 
Models that are less frequently used may receive less attention, and we rely on the community to play a more active role in their upkeep and improvement. - -Through this approach, vLLM fosters a collaborative environment where both the core development team and the broader community contribute to the robustness and diversity of the third-party models supported in our ecosystem. - -Note that, as an inference engine, vLLM does not introduce new models. Therefore, all models supported by vLLM are third-party models in this regard. - -We have the following levels of testing for models: - -1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to [models tests](https://github.com/vllm-project/vllm/blob/main/tests/models) for the models that have passed this test. -2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test. -3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](gh-dir:tests) and [examples](gh-dir:examples) for the models that have passed this test. -4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category. diff --git a/docs/source/serving/engine_args.md b/docs/source/serving/engine_args.md deleted file mode 100644 index 9325a2406e8ca..0000000000000 --- a/docs/source/serving/engine_args.md +++ /dev/null @@ -1,36 +0,0 @@ -(engine-args)= - -# Engine Arguments - -Engine arguments control the behavior of the vLLM engine. 
- -- For [offline inference](#offline-inference), they are part of the arguments to `LLM` class. -- For [online serving](#openai-compatible-server), they are part of the arguments to `vllm serve`. - -For references to all arguments available from `vllm serve` see the [serve args](#serve-args) documentation. - -Below, you can find an explanation of every engine argument: - -<!--- pyml disable-num-lines 7 no-space-in-emphasis --> -```{eval-rst} -.. argparse:: - :module: vllm.engine.arg_utils - :func: _engine_args_parser - :prog: vllm serve - :nodefaultconst: - :markdownhelp: -``` - -## Async Engine Arguments - -Additional arguments are available to the asynchronous engine which is used for online serving: - -<!--- pyml disable-num-lines 7 no-space-in-emphasis --> -```{eval-rst} -.. argparse:: - :module: vllm.engine.arg_utils - :func: _async_engine_args_parser - :prog: vllm serve - :nodefaultconst: - :markdownhelp: -``` diff --git a/docs/source/serving/env_vars.md b/docs/source/serving/env_vars.md deleted file mode 100644 index 9845241930a40..0000000000000 --- a/docs/source/serving/env_vars.md +++ /dev/null @@ -1,15 +0,0 @@ -# Environment Variables - -vLLM uses the following environment variables to configure the system: - -:::{warning} -Please note that `VLLM_PORT` and `VLLM_HOST_IP` set the port and ip for vLLM's **internal usage**. It is not the port and ip for the API server. If you use `--host $VLLM_HOST_IP` and `--port $VLLM_PORT` to start the API server, it will not work. - -All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables). 
-::: - -:::{literalinclude} ../../../vllm/envs.py -:end-before: end-env-vars-definition -:language: python -:start-after: begin-env-vars-definition -::: diff --git a/docs/source/serving/integrations/index.md b/docs/source/serving/integrations/index.md deleted file mode 100644 index e2b4c0814605b..0000000000000 --- a/docs/source/serving/integrations/index.md +++ /dev/null @@ -1,8 +0,0 @@ -# External Integrations - -:::{toctree} -:maxdepth: 1 - -langchain -llamaindex -::: diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md deleted file mode 100644 index 433d2e894dd8d..0000000000000 --- a/docs/source/serving/offline_inference.md +++ /dev/null @@ -1,217 +0,0 @@ -(offline-inference)= - -# Offline Inference - -You can run vLLM in your own code on a list of prompts. - -The offline API is based on the {class}`~vllm.LLM` class. -To initialize the vLLM engine, create a new instance of `LLM` and specify the model to run. - -For example, the following code downloads the [`facebook/opt-125m`](https://huggingface.co/facebook/opt-125m) model from HuggingFace -and runs it in vLLM using the default configuration. - -```python -from vllm import LLM - -llm = LLM(model="facebook/opt-125m") -``` - -After initializing the `LLM` instance, you can perform model inference using various APIs. -The available APIs depend on the type of model that is being run: - -- [Generative models](#generative-models) output logprobs which are sampled from to obtain the final output text. -- [Pooling models](#pooling-models) output their hidden states directly. - -Please refer to the above pages for more details about each API. - -:::{seealso} -[API Reference](#offline-inference-api) -::: - -(configuration-options)= - -## Configuration Options - -This section lists the most common options for running the vLLM engine. -For a full list, refer to the <project:#configuration> page. 
- -(model-resolution)= - -### Model resolution - -vLLM loads HuggingFace-compatible models by inspecting the `architectures` field in `config.json` of the model repository -and finding the corresponding implementation that is registered to vLLM. -Nevertheless, our model resolution may fail for the following reasons: - -- The `config.json` of the model repository lacks the `architectures` field. -- Unofficial repositories refer to a model using alternative names which are not recorded in vLLM. -- The same architecture name is used for multiple models, creating ambiguity as to which model should be loaded. - -To fix this, explicitly specify the model architecture by passing `config.json` overrides to the `hf_overrides` option. -For example: - -```python -from vllm import LLM - -model = LLM( - model="cerebras/Cerebras-GPT-1.3B", - hf_overrides={"architectures": ["GPT2LMHeadModel"]}, # GPT-2 -) -``` - -Our [list of supported models](#supported-models) shows the model architectures that are recognized by vLLM. - -(reducing-memory-usage)= - -### Reducing memory usage - -Large models might cause your machine to run out of memory (OOM). Here are some options that help alleviate this problem. - -#### Tensor Parallelism (TP) - -Tensor parallelism (`tensor_parallel_size` option) can be used to split the model across multiple GPUs. - -The following code splits the model across 2 GPUs. - -```python -from vllm import LLM - -llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", - tensor_parallel_size=2) -``` - -:::{important} -To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. {func}`torch.cuda.set_device`) -before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`. - -To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable. 
-::: - -:::{note} -With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). - -You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. -::: - -#### Quantization - -Quantized models take less memory at the cost of lower precision. - -Statically quantized models can be downloaded from HF Hub (some popular ones are available at [Red Hat AI](https://huggingface.co/RedHatAI)) -and used directly without extra configuration. - -Dynamic quantization is also supported via the `quantization` option -- see [here](#quantization-index) for more details. - -#### Context length and batch size - -You can further reduce memory usage by limiting the context length of the model (`max_model_len` option) -and the maximum batch size (`max_num_seqs` option). - -```python -from vllm import LLM - -llm = LLM(model="adept/fuyu-8b", - max_model_len=2048, - max_num_seqs=2) -``` - -#### Reduce CUDA Graphs - -By default, we optimize model inference using CUDA graphs which take up extra memory in the GPU. - -:::{important} -CUDA graph capture takes up more memory in V1 than in V0. 
-::: - -You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage: - -```python -from vllm import LLM -from vllm.config import CompilationConfig, CompilationLevel - -llm = LLM( - model="meta-llama/Llama-3.1-8B-Instruct", - compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, - # By default, it goes up to max_num_seqs - cudagraph_capture_sizes=[1, 2, 4, 8, 16], - ), -) -``` - -You can disable graph capturing completely via the `enforce_eager` flag: - -```python -from vllm import LLM - -llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", - enforce_eager=True) -``` - -#### Adjust cache size - -If you run out of CPU RAM, try the following options: - -- (Multi-modal models only) you can set the size of multi-modal input cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB). -- (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB). - -#### Multi-modal input limits - -You can allow a smaller number of multi-modal items per prompt to reduce the memory footprint of the model: - -```python -from vllm import LLM - -# Accept up to 3 images and 1 video per prompt -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - limit_mm_per_prompt={"image": 3, "video": 1}) -``` - -You can go a step further and disable unused modalities completely by setting its limit to zero. -For example, if your application only accepts image input, there is no need to allocate any memory for videos. - -```python -from vllm import LLM - -# Accept any number of images but no videos -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - limit_mm_per_prompt={"video": 0}) -``` - -You can even run a multi-modal model for text-only inference: - -```python -from vllm import LLM - -# Don't accept images. Just text. 
-llm = LLM(model="google/gemma-3-27b-it", - limit_mm_per_prompt={"image": 0}) -``` - -#### Multi-modal processor arguments - -For certain models, you can adjust the multi-modal processor arguments to -reduce the size of the processed multi-modal inputs, which in turn saves memory. - -Here are some examples: - -```python -from vllm import LLM - -# Available for Qwen2-VL series models -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - mm_processor_kwargs={ - "max_pixels": 768 * 768, # Default is 1280 * 28 * 28 - }) - -# Available for InternVL series models -llm = LLM(model="OpenGVLab/InternVL2-2B", - mm_processor_kwargs={ - "max_dynamic_patch": 4, # Default is 12 - }) -``` - -### Performance optimization and tuning - -You can potentially improve the performance of vLLM by finetuning various options. -Please refer to [this guide](#optimization-and-tuning) for more details. diff --git a/docs/source/serving/serve_args.md b/docs/source/serving/serve_args.md deleted file mode 100644 index edb49f4ba6de4..0000000000000 --- a/docs/source/serving/serve_args.md +++ /dev/null @@ -1,47 +0,0 @@ -(serve-args)= - -# Server Arguments - -The `vllm serve` command is used to launch the OpenAI-compatible server. - -## CLI Arguments - -The following are all arguments available from the `vllm serve` command: - -<!--- pyml disable-num-lines 7 no-space-in-emphasis --> -```{eval-rst} -.. argparse:: - :module: vllm.entrypoints.openai.cli_args - :func: create_parser_for_docs - :prog: vllm serve - :nodefaultconst: - :markdownhelp: -``` - -## Configuration file - -You can load CLI arguments via a [YAML](https://yaml.org/) config file. -The argument names must be the long form of those outlined [above](#serve-args). 
- -For example: - -```yaml -# config.yaml - -model: meta-llama/Llama-3.1-8B-Instruct -host: "127.0.0.1" -port: 6379 -uvicorn-log-level: "info" -``` - -To use the above config file: - -```bash -vllm serve --config config.yaml -``` - -:::{note} -In case an argument is supplied simultaneously using command line and the config file, the value from the command line will take precedence. -The order of priorities is `command line > config file values > defaults`. -e.g. `vllm serve SOME_MODEL --config config.yaml`, SOME_MODEL takes precedence over `model` in config file. -::: diff --git a/docs/source/training/rlhf.md b/docs/training/rlhf.md similarity index 69% rename from docs/source/training/rlhf.md rename to docs/training/rlhf.md index 72e89c0c7478c..4f75e4e01495c 100644 --- a/docs/source/training/rlhf.md +++ b/docs/training/rlhf.md @@ -6,6 +6,6 @@ vLLM can be used to generate the completions for RLHF. The best way to do this i See the following basic examples to get started if you don't want to use an existing library: -- [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](https://docs.vllm.ai/en/latest/getting_started/examples/rlhf.html) -- [Training and inference processes are colocated on the same GPUs using Ray](https://docs.vllm.ai/en/latest/getting_started/examples/rlhf_colocate.html) -- [Utilities for performing RLHF with vLLM](https://docs.vllm.ai/en/latest/getting_started/examples/rlhf_utils.html) +- [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](../examples/offline_inference/rlhf.md) +- [Training and inference processes are colocated on the same GPUs using Ray](../examples/offline_inference/rlhf_colocate.md) +- [Utilities for performing RLHF with vLLM](../examples/offline_inference/rlhf_utils.md) diff --git a/docs/source/training/trl.md b/docs/training/trl.md similarity index 66% rename from docs/source/training/trl.md rename to docs/training/trl.md index ebdf593dbde52..c7c1a5a3bbd1e 
100644 --- a/docs/source/training/trl.md +++ b/docs/training/trl.md @@ -6,8 +6,7 @@ Online methods such as GRPO or Online DPO require the model to generate completi See the guide [vLLM for fast generation in online methods](https://huggingface.co/docs/trl/main/en/speeding_up_training#vllm-for-fast-generation-in-online-methods) in the TRL documentation for more information. -:::{seealso} -For more information on the `use_vllm` flag you can provide to the configs of these online methods, see: -- [`trl.GRPOConfig.use_vllm`](https://huggingface.co/docs/trl/main/en/grpo_trainer#trl.GRPOConfig.use_vllm) -- [`trl.OnlineDPOConfig.use_vllm`](https://huggingface.co/docs/trl/main/en/online_dpo_trainer#trl.OnlineDPOConfig.use_vllm) -::: +!!! info + For more information on the `use_vllm` flag you can provide to the configs of these online methods, see: + - [`trl.GRPOConfig.use_vllm`](https://huggingface.co/docs/trl/main/en/grpo_trainer#trl.GRPOConfig.use_vllm) + - [`trl.OnlineDPOConfig.use_vllm`](https://huggingface.co/docs/trl/main/en/online_dpo_trainer#trl.OnlineDPOConfig.use_vllm) diff --git a/docs/usage/README.md b/docs/usage/README.md new file mode 100644 index 0000000000000..681db57d8e0f5 --- /dev/null +++ b/docs/usage/README.md @@ -0,0 +1,7 @@ +# Using vLLM + +vLLM supports the following usage patterns: + +- [Inference and Serving](../serving/offline_inference.md): Run a single instance of a model. +- [Deployment](../deployment/docker.md): Scale up model instances for production. +- [Training](../training/rlhf.md): Train or fine-tune a model. 
diff --git a/docs/source/getting_started/faq.md b/docs/usage/faq.md similarity index 91% rename from docs/source/getting_started/faq.md rename to docs/usage/faq.md index c1bb28937c144..51977d4434f5a 100644 --- a/docs/source/getting_started/faq.md +++ b/docs/usage/faq.md @@ -1,23 +1,24 @@ -(faq)= - -# Frequently Asked Questions +--- +title: Frequently Asked Questions +--- +[](){ #faq } > Q: How can I serve multiple models on a single port using the OpenAI API? A: Assuming that you're referring to using OpenAI compatible server to serve multiple models at once, that is not currently supported, you can run multiple instances of the server (each serving a different model) at the same time, and have another layer to route the incoming request to the correct server accordingly. -______________________________________________________________________ +--- > Q: Which model to use for offline inference embedding? A: You can try [e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) and [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5); -more are listed [here](#supported-models). +more are listed [here][supported-models]. By extracting hidden states, vLLM can automatically convert text generation models like [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B), [Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) into embedding models, but they are expected to be inferior to models that are specifically trained on embedding tasks. -______________________________________________________________________ +--- > Q: Can the output of a prompt vary across runs in vLLM? 
diff --git a/docs/source/serving/metrics.md b/docs/usage/metrics.md similarity index 66% rename from docs/source/serving/metrics.md rename to docs/usage/metrics.md index 647ece3f85f06..6603aa83b4af7 100644 --- a/docs/source/serving/metrics.md +++ b/docs/usage/metrics.md @@ -4,7 +4,7 @@ vLLM exposes a number of metrics that can be used to monitor the health of the system. These metrics are exposed via the `/metrics` endpoint on the vLLM OpenAI compatible API server. -You can start the server using Python, or using [Docker](#deployment-docker): +You can start the server using Python, or using [Docker][deployment-docker]: ```console vllm serve unsloth/Llama-3.2-1B-Instruct @@ -31,24 +31,9 @@ vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-I The following metrics are exposed: -:::{literalinclude} ../../../vllm/engine/metrics.py -:end-before: end-metrics-definitions -:language: python -:start-after: begin-metrics-definitions -::: - -The following metrics are deprecated and due to be removed in a future version: - -- `vllm:num_requests_swapped`, `vllm:cpu_cache_usage_perc`, and - `vllm:cpu_prefix_cache_hit_rate` because KV cache offloading is not - used in V1. -- `vllm:gpu_prefix_cache_hit_rate` is replaced by queries+hits - counters in V1. -- `vllm:time_in_queue_requests` because it duplicates - `vllm:request_queue_time_seconds`. -- `vllm:model_forward_time_milliseconds` and - `vllm:model_execute_time_milliseconds` because - prefill/decode/inference time metrics should be used instead. 
+```python +--8<-- "vllm/engine/metrics.py:metrics-definitions" +``` Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1` but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch, diff --git a/docs/usage/reproducibility.md b/docs/usage/reproducibility.md new file mode 100644 index 0000000000000..a494dcf19191f --- /dev/null +++ b/docs/usage/reproducibility.md @@ -0,0 +1,52 @@ +# Reproducibility + +vLLM does not guarantee the reproducibility of the results by default, for the sake of performance. You need to do the following to achieve +reproducible results: + +- For V1: Turn off multiprocessing to make the scheduling deterministic by setting `VLLM_ENABLE_V1_MULTIPROCESSING=0`. +- For V0: Set the global seed (see below). + +Example: <gh-file:examples/offline_inference/reproducibility.py> + +!!! warning + + Applying the above settings [changes the random state in user code](#locality-of-random-state). + +!!! note + + Even with the above settings, vLLM only provides reproducibility + when it runs on the same hardware and the same vLLM version. + Also, the online serving API (`vllm serve`) does not support reproducibility + because it is almost impossible to make the scheduling deterministic in the + online setting. + +## Setting the global seed + +The `seed` parameter in vLLM is used to control the random states for various random number generators. + +If a specific seed value is provided, the random states for `random`, `np.random`, and `torch.manual_seed` will be set accordingly. + +However, in some cases, setting the seed will also [change the random state in user code](#locality-of-random-state). + +### Default Behavior + +In V0, the `seed` parameter defaults to `None`. When the `seed` parameter is `None`, the random states for `random`, `np.random`, and `torch.manual_seed` are not set. This means that each run of vLLM will produce different results if `temperature > 0`, as expected. 
+ +In V1, the `seed` parameter defaults to `0` which sets the random state for each worker, so the results will remain consistent for each vLLM run even if `temperature > 0`. + +!!! note + + It is impossible to un-specify a seed for V1 because different workers need to sample the same outputs + for workflows such as speculative decoding. + + For more information, see: <gh-pr:17929> + +### Locality of random state + +The random state in user code (i.e. the code that constructs [LLM][vllm.LLM] class) is updated by vLLM under the following conditions: + +- For V0: The seed is specified. +- For V1: The workers are run in the same process as user code, i.e.: `VLLM_ENABLE_V1_MULTIPROCESSING=0`. + +By default, these conditions are not active so you can use vLLM without having to worry about +accidentally making deterministic subsequent operations that rely on random state. diff --git a/docs/source/deployment/security.md b/docs/usage/security.md similarity index 94% rename from docs/source/deployment/security.md rename to docs/usage/security.md index 9c4d639c0b3da..1209cc8dd4572 100644 --- a/docs/source/deployment/security.md +++ b/docs/usage/security.md @@ -1,4 +1,4 @@ -# Security Guide +# Security ## Inter-Node Communication @@ -12,14 +12,14 @@ All communications between nodes in a multi-node vLLM deployment are **insecure The following options control inter-node communications in vLLM: -1. **Environment Variables:** +#### 1. **Environment Variables:** - `VLLM_HOST_IP`: Sets the IP address for vLLM processes to communicate on -2. **KV Cache Transfer Configuration:** +#### 2. **KV Cache Transfer Configuration:** - `--kv-ip`: The IP address for KV cache transfer communications (default: 127.0.0.1) - `--kv-port`: The port for KV cache transfer communications (default: 14579) -3. **Data Parallel Configuration:** +#### 3. 
**Data Parallel Configuration:** - `data_parallel_master_ip`: IP of the data parallel master (default: 127.0.0.1) - `data_parallel_master_port`: Port of the data parallel master (default: 29500) @@ -39,16 +39,16 @@ Key points from the PyTorch security guide: ### Security Recommendations -1. **Network Isolation:** +#### 1. **Network Isolation:** - Deploy vLLM nodes on a dedicated, isolated network - Use network segmentation to prevent unauthorized access - Implement appropriate firewall rules -2. **Configuration Best Practices:** +#### 2. **Configuration Best Practices:** - Always set `VLLM_HOST_IP` to a specific IP address rather than using defaults - Configure firewalls to only allow necessary ports between nodes -3. **Access Control:** +#### 3. **Access Control:** - Restrict physical and network access to the deployment environment - Implement proper authentication and authorization for management interfaces - Follow the principle of least privilege for all system components diff --git a/docs/source/getting_started/troubleshooting.md b/docs/usage/troubleshooting.md similarity index 85% rename from docs/source/getting_started/troubleshooting.md rename to docs/usage/troubleshooting.md index a4744827f2268..889cfccdacac6 100644 --- a/docs/source/getting_started/troubleshooting.md +++ b/docs/usage/troubleshooting.md @@ -1,12 +1,12 @@ -(troubleshooting)= - -# Troubleshooting +--- +title: Troubleshooting +--- +[](){ #troubleshooting } This document outlines some troubleshooting strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. 
-:::{note} -Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated. -::: +!!! note + Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated. ## Hangs downloading a model @@ -18,13 +18,12 @@ It's recommended to download the model first using the [huggingface-cli](https:/ If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory. -:::{note} -To isolate the model downloading and loading issue, you can use the `--load-format dummy` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. -::: +!!! note + To isolate the model downloading and loading issue, you can use the `--load-format dummy` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. ## Out of memory -If the model is too large to fit in a single GPU, you will get an out-of-memory (OOM) error. Consider adopting [these options](#reducing-memory-usage) to reduce the memory consumption. +If the model is too large to fit in a single GPU, you will get an out-of-memory (OOM) error. 
Consider adopting [these options](../configuration/conserving_memory.md) to reduce the memory consumption. ## Generation quality changed @@ -53,9 +52,9 @@ You might also need to set `export NCCL_SOCKET_IFNAME=<your_network_interface>` ## Error near `self.graph.replay()` If vLLM crashes and the error trace captures it somewhere around `self.graph.replay()` in `vllm/worker/model_runner.py`, it is a CUDA error inside CUDAGraph. -To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error. +To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the [LLM][vllm.LLM] class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error. -(troubleshooting-incorrect-hardware-driver)= +[](){ #troubleshooting-incorrect-hardware-driver } ## Incorrect hardware/driver @@ -140,16 +139,15 @@ If the script runs successfully, you should see the message `sanity check is suc If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as `export NCCL_P2P_DISABLE=1` to see if it helps. Please check [their documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html) for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully. -:::{note} -A multi-node environment is more complicated than a single-node one. 
If you see errors such as `torch.distributed.DistNetworkError`, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments: +!!! note + A multi-node environment is more complicated than a single-node one. If you see errors such as `torch.distributed.DistNetworkError`, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments: -- In the first node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py`. -- In the second node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py`. + - In the first node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py`. + - In the second node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py`. -Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes. -::: + Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes. -(troubleshooting-python-multiprocessing)= +[](){ #troubleshooting-python-multiprocessing } ## Python multiprocessing @@ -161,7 +159,7 @@ If you have seen a warning in your logs like this: WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously initialized. We must use the `spawn` multiprocessing start method. Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See - https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html#python-multiprocessing + https://docs.vllm.ai/en/latest/usage/troubleshooting.html#python-multiprocessing for more information. 
``` @@ -260,7 +258,7 @@ or: ValueError: Model architectures ['<arch>'] are not supported for now. Supported architectures: [...] ``` -But you are sure that the model is in the [list of supported models](#supported-models), there may be some issue with vLLM's model resolution. In that case, please follow [these steps](#model-resolution) to explicitly specify the vLLM implementation for the model. +But you are sure that the model is in the [list of supported models][supported-models], there may be some issue with vLLM's model resolution. In that case, please follow [these steps](../configuration/model_resolution.md) to explicitly specify the vLLM implementation for the model. ## Failed to infer device type diff --git a/docs/source/serving/usage_stats.md b/docs/usage/usage_stats.md similarity index 100% rename from docs/source/serving/usage_stats.md rename to docs/usage/usage_stats.md diff --git a/docs/source/getting_started/v1_user_guide.md b/docs/usage/v1_guide.md similarity index 99% rename from docs/source/getting_started/v1_user_guide.md rename to docs/usage/v1_guide.md index de90b8a7851e6..3d5d7ce45cce4 100644 --- a/docs/source/getting_started/v1_user_guide.md +++ b/docs/usage/v1_guide.md @@ -1,4 +1,4 @@ -# vLLM V1 User Guide +# vLLM V1 V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack). 
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index bab41c915c32d..56cdd6861baa4 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -1,11 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example shows how to use vLLM for running offline inference +This example shows how to use vLLM for running offline inference with the correct prompt format on audio language models. For most models, the prompt format should follow corresponding examples on HuggingFace model repository. """ + import os from dataclasses import asdict from typing import NamedTuple, Optional @@ -22,7 +23,7 @@ audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] question_per_audio_count = { 0: "What is 1+1?", 1: "What is recited in the audio?", - 2: "What sport and what nursery rhyme are referenced?" + 2: "What sport and what nursery rhyme are referenced?", } @@ -72,8 +73,7 @@ def run_granite_speech(question: str, audio_count: int) -> ModelRequestData: # MiniCPM-O def run_minicpmo(question: str, audio_count: int) -> ModelRequestData: model_name = "openbmb/MiniCPM-o-2_6" - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) engine_args = EngineArgs( model=model_name, trust_remote_code=True, @@ -82,19 +82,18 @@ def run_minicpmo(question: str, audio_count: int) -> ModelRequestData: limit_mm_per_prompt={"audio": audio_count}, ) - stop_tokens = ['<|im_end|>', '<|endoftext|>'] + stop_tokens = ["<|im_end|>", "<|endoftext|>"] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] audio_placeholder = "(<audio>./</audio>)" * audio_count audio_chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ 
'<|im_start|>assistant\n<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>' }}{% endif %}" # noqa: E501 - messages = [{ - 'role': 'user', - 'content': f'{audio_placeholder}\n{question}' - }] - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True, - chat_template=audio_chat_template) + messages = [{"role": "user", "content": f"{audio_placeholder}\n{question}"}] + prompt = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + chat_template=audio_chat_template, + ) return ModelRequestData( engine_args=engine_args, @@ -113,7 +112,7 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData: # Since the vision-lora and speech-lora co-exist with the base model, # we have to manually specify the path of the lora weights. speech_lora_path = os.path.join(model_path, "speech-lora") - placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)]) + placeholders = "".join([f"<|audio_{i + 1}|>" for i in range(audio_count)]) prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>" @@ -145,15 +144,19 @@ def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData: limit_mm_per_prompt={"audio": audio_count}, ) - audio_in_prompt = "".join([ - f"Audio {idx+1}: " - f"<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count) - ]) + audio_in_prompt = "".join( + [ + f"Audio {idx + 1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n" + for idx in range(audio_count) + ] + ) - prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" - "<|im_start|>user\n" - f"{audio_in_prompt}{question}<|im_end|>\n" - "<|im_start|>assistant\n") + prompt = ( + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n" + f"{audio_in_prompt}{question}<|im_end|>\n" + "<|im_start|>assistant\n" + ) return ModelRequestData( engine_args=engine_args, @@ -172,19 +175,22 @@ def run_qwen2_5_omni(question: str, audio_count: int): limit_mm_per_prompt={"audio": 
audio_count}, ) - audio_in_prompt = "".join([ - "<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count) - ]) + audio_in_prompt = "".join( + ["<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)] + ) default_system = ( "You are Qwen, a virtual human developed by the Qwen Team, Alibaba " "Group, capable of perceiving auditory and visual inputs, as well as " - "generating text and speech.") + "generating text and speech." + ) - prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n" - "<|im_start|>user\n" - f"{audio_in_prompt}{question}<|im_end|>\n" - "<|im_start|>assistant\n") + prompt = ( + f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n" + f"{audio_in_prompt}{question}<|im_end|>\n" + "<|im_start|>assistant\n" + ) return ModelRequestData( engine_args=engine_args, prompt=prompt, @@ -196,13 +202,10 @@ def run_ultravox(question: str, audio_count: int) -> ModelRequestData: model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b" tokenizer = AutoTokenizer.from_pretrained(model_name) - messages = [{ - 'role': 'user', - 'content': "<|audio|>\n" * audio_count + question - }] - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + messages = [{"role": "user", "content": "<|audio|>\n" * audio_count + question}] + prompt = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) engine_args = EngineArgs( model=model_name, @@ -220,8 +223,7 @@ def run_ultravox(question: str, audio_count: int) -> ModelRequestData: # Whisper def run_whisper(question: str, audio_count: int) -> ModelRequestData: - assert audio_count == 1, ( - "Whisper only support single audio input per prompt") + assert audio_count == 1, "Whisper only support single audio input per prompt" model_name = "openai/whisper-large-v3-turbo" prompt = "<|startoftranscript|>" @@ -252,27 +254,33 @@ model_example_map = { def parse_args(): parser = FlexibleArgumentParser( - description='Demo on 
using vLLM for offline inference with ' - 'audio language models') - parser.add_argument('--model-type', - '-m', - type=str, - default="ultravox", - choices=model_example_map.keys(), - help='Huggingface "model_type".') - parser.add_argument('--num-prompts', - type=int, - default=1, - help='Number of prompts to run.') - parser.add_argument("--num-audios", - type=int, - default=1, - choices=[0, 1, 2], - help="Number of audio items per prompt.") - parser.add_argument("--seed", - type=int, - default=None, - help="Set the seed when initializing `vllm.LLM`.") + description="Demo on using vLLM for offline inference with " + "audio language models" + ) + parser.add_argument( + "--model-type", + "-m", + type=str, + default="ultravox", + choices=model_example_map.keys(), + help='Huggingface "model_type".', + ) + parser.add_argument( + "--num-prompts", type=int, default=1, help="Number of prompts to run." + ) + parser.add_argument( + "--num-audios", + type=int, + default=1, + choices=[0, 1, 2], + help="Number of audio items per prompt.", + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.", + ) return parser.parse_args() @@ -283,29 +291,30 @@ def main(args): raise ValueError(f"Model type {model} is not supported.") audio_count = args.num_audios - req_data = model_example_map[model](question_per_audio_count[audio_count], - audio_count) + req_data = model_example_map[model]( + question_per_audio_count[audio_count], audio_count + ) # Disable other modalities to save memory default_limits = {"image": 0, "video": 0, "audio": 0} req_data.engine_args.limit_mm_per_prompt = default_limits | dict( - req_data.engine_args.limit_mm_per_prompt or {}) + req_data.engine_args.limit_mm_per_prompt or {} + ) engine_args = asdict(req_data.engine_args) | {"seed": args.seed} llm = LLM(**engine_args) # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. 
- sampling_params = SamplingParams(temperature=0.2, - max_tokens=64, - stop_token_ids=req_data.stop_token_ids) + sampling_params = SamplingParams( + temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids + ) mm_data = {} if audio_count > 0: mm_data = { "audio": [ - asset.audio_and_sample_rate - for asset in audio_assets[:audio_count] + asset.audio_and_sample_rate for asset in audio_assets[:audio_count] ] } @@ -315,8 +324,9 @@ def main(args): # Batch inference inputs = [inputs] * args.num_prompts # Add LoRA request if applicable - lora_request = (req_data.lora_requests * - args.num_prompts if req_data.lora_requests else None) + lora_request = ( + req_data.lora_requests * args.num_prompts if req_data.lora_requests else None + ) outputs = llm.generate( inputs, diff --git a/docs/source/features/automatic_prefix_caching.md b/examples/offline_inference/automatic_prefix_caching.py similarity index 63% rename from docs/source/features/automatic_prefix_caching.md rename to examples/offline_inference/automatic_prefix_caching.py index 59016d7fcf6b3..0d8c733042376 100644 --- a/docs/source/features/automatic_prefix_caching.md +++ b/examples/offline_inference/automatic_prefix_caching.py @@ -1,26 +1,31 @@ -(automatic-prefix-caching)= +# SPDX-License-Identifier: Apache-2.0 +""" +Demonstration script for Automatic Prefix Caching (APC) in vLLM. -# Automatic Prefix Caching +Automatic Prefix Caching (APC) allows the vLLM engine to reuse cached +KV (key-value) pairs from previous prompts if a new query shares the same +prefix. This reduces redundant computation and improves inference speed. -## Introduction +To enable APC, set `enable_prefix_caching=True` when initializing the +vLLM engine. -Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part. 
+This script uses a long Markdown table as the shared prompt prefix and +compares the generation time for two queries that share the same prefix +but ask different questions. -:::{note} -Technical details on how vLLM implements APC can be found [here](#design-automatic-prefix-caching). -::: +Run: +python examples/offline_inference/automatic_prefix_caching.py +""" -## Enabling APC in vLLM - -Set `enable_prefix_caching=True` in vLLM engine to enable APC. Here is an example: - -```python import time + from vllm import LLM, SamplingParams - +# ruff: noqa: E501 # A prompt containing a large markdown table. The table is randomly generated by GPT-4. -LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """ +LONG_PROMPT = ( + "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + + """ | ID | Name | Age | Occupation | Country | Email | Phone Number | Address | |-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------| | 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL | @@ -54,6 +59,7 @@ LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables i | 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ | | 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE | """ +) def get_generation_time(llm, sampling_params, prompts): @@ -62,41 +68,35 @@ def get_generation_time(llm, sampling_params, prompts): output = llm.generate(prompts, sampling_params=sampling_params) end_time = time.time() # print the output and generation time + print("-" * 30) print(f"Output: {output[0].outputs[0].text}") print(f"Generation time: {end_time - start_time} seconds.") + print("-" * 30) -# set 
enable_prefix_caching=True to enable APC -llm = LLM( - model='lmsys/longchat-13b-16k', - enable_prefix_caching=True -) +def main(): + # set enable_prefix_caching=True to enable APC + llm = LLM(model="lmsys/longchat-13b-16k", enable_prefix_caching=True) -sampling_params = SamplingParams(temperature=0, max_tokens=100) + sampling_params = SamplingParams(temperature=0, max_tokens=100) -# Querying the age of John Doe -get_generation_time( - llm, - sampling_params, - LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ", -) + # Querying the age of John Doe + get_generation_time( + llm, + sampling_params, + LONG_PROMPT + + "Question: what is the age of John Doe? Your answer: The age of John Doe is ", + ) -# Querying the age of Zack Blue -# This query will be faster since vllm avoids computing the KV cache of LONG_PROMPT again. -get_generation_time( - llm, - sampling_params, - LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ", -) -``` + # Querying the age of Zack Blue + # This query will be faster since vllm avoids computing the KV cache of LONG_PROMPT again. + get_generation_time( + llm, + sampling_params, + LONG_PROMPT + + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ", + ) -## Example workloads -We describe two example workloads, where APC can provide huge performance benefit: - -- Long document query, where the user repeatedly queries the same long document (e.g. software manual or annual report) with different queries. In this case, instead of processing the long document again and again, APC allows vLLM to process this long document *only once*, and all future requests can avoid recomputing this long document by reusing its KV cache. This allows vLLM to serve future requests with much higher throughput and much lower latency. -- Multi-round conversation, where the user may chat with the application multiple times in the same chatting session. 
In this case, instead of processing the whole chatting history again and again, APC allows vLLM to reuse the processing results of the chat history across all future rounds of conversation, allowing vLLM to serve future requests with much higher throughput and much lower latency. - -## Limits - -APC in general does not reduce the performance of vLLM. With that being said, APC only reduces the time of processing the queries (the prefilling phase) and does not reduce the time of generating new tokens (the decoding phase). So APC does not bring performance gain when vLLM spends most of the time generating answers to the queries (e.g. when the length of the answer is long), or new queries do not share the same prefix with any of existing queries (so that the computation cannot be reused). +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/basic/chat.py b/examples/offline_inference/basic/chat.py index 8e6f78ed7de21..b0bb5aa71b8a7 100644 --- a/examples/offline_inference/basic/chat.py +++ b/examples/offline_inference/basic/chat.py @@ -56,22 +56,12 @@ def main(args: dict): # In this script, we demonstrate how to pass input to the chat method: conversation = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hello! How can I assist you today?"}, { "role": "user", - "content": "Hello" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" 
- }, - { - "role": "user", - "content": - "Write an essay about the importance of higher education.", + "content": "Write an essay about the importance of higher education.", }, ] outputs = llm.chat(conversation, sampling_params, use_tqdm=False) diff --git a/examples/offline_inference/basic/classify.py b/examples/offline_inference/basic/classify.py index 5b6dcb41eee1c..40ccb1294e424 100644 --- a/examples/offline_inference/basic/classify.py +++ b/examples/offline_inference/basic/classify.py @@ -10,9 +10,9 @@ def parse_args(): parser = FlexibleArgumentParser() parser = EngineArgs.add_cli_args(parser) # Set example specific arguments - parser.set_defaults(model="jason9693/Qwen2.5-1.5B-apeach", - task="classify", - enforce_eager=True) + parser.set_defaults( + model="jason9693/Qwen2.5-1.5B-apeach", task="classify", enforce_eager=True + ) return parser.parse_args() @@ -36,10 +36,11 @@ def main(args: Namespace): print("\nGenerated Outputs:\n" + "-" * 60) for prompt, output in zip(prompts, outputs): probs = output.outputs.probs - probs_trimmed = ((str(probs[:16])[:-1] + - ", ...]") if len(probs) > 16 else probs) - print(f"Prompt: {prompt!r} \n" - f"Class Probabilities: {probs_trimmed} (size={len(probs)})") + probs_trimmed = (str(probs[:16])[:-1] + ", ...]") if len(probs) > 16 else probs + print( + f"Prompt: {prompt!r} \n" + f"Class Probabilities: {probs_trimmed} (size={len(probs)})" + ) print("-" * 60) diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py index cb5f923ffb697..38a73ccca251e 100644 --- a/examples/offline_inference/basic/embed.py +++ b/examples/offline_inference/basic/embed.py @@ -10,9 +10,9 @@ def parse_args(): parser = FlexibleArgumentParser() parser = EngineArgs.add_cli_args(parser) # Set example specific arguments - parser.set_defaults(model="intfloat/e5-mistral-7b-instruct", - task="embed", - enforce_eager=True) + parser.set_defaults( + model="intfloat/e5-mistral-7b-instruct", task="embed", enforce_eager=True 
+ ) return parser.parse_args() @@ -36,10 +36,10 @@ def main(args: Namespace): print("\nGenerated Outputs:\n" + "-" * 60) for prompt, output in zip(prompts, outputs): embeds = output.outputs.embedding - embeds_trimmed = ((str(embeds[:16])[:-1] + - ", ...]") if len(embeds) > 16 else embeds) - print(f"Prompt: {prompt!r} \n" - f"Embeddings: {embeds_trimmed} (size={len(embeds)})") + embeds_trimmed = ( + (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds + ) + print(f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})") print("-" * 60) diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py index d2bda8b3180c3..3da73c6c407d4 100644 --- a/examples/offline_inference/basic/score.py +++ b/examples/offline_inference/basic/score.py @@ -10,9 +10,9 @@ def parse_args(): parser = FlexibleArgumentParser() parser = EngineArgs.add_cli_args(parser) # Set example specific arguments - parser.set_defaults(model="BAAI/bge-reranker-v2-m3", - task="score", - enforce_eager=True) + parser.set_defaults( + model="BAAI/bge-reranker-v2-m3", task="score", enforce_eager=True + ) return parser.parse_args() diff --git a/examples/offline_inference/batch_llm_inference.py b/examples/offline_inference/batch_llm_inference.py index 6548857b6d111..c1edfb52ff70c 100644 --- a/examples/offline_inference/batch_llm_inference.py +++ b/examples/offline_inference/batch_llm_inference.py @@ -17,12 +17,14 @@ Ray Data provides functionality for: Learn more about Ray Data's LLM integration: https://docs.ray.io/en/latest/data/working-with-llms.html """ + import ray from packaging.version import Version from ray.data.llm import build_llm_processor, vLLMEngineProcessorConfig -assert Version(ray.__version__) >= Version( - "2.44.1"), "Ray version must be at least 2.44.1" +assert Version(ray.__version__) >= Version("2.44.1"), ( + "Ray version must be at least 2.44.1" +) # Uncomment to reduce clutter in stdout # ray.init(log_to_driver=False) @@ 
-53,20 +55,18 @@ config = vLLMEngineProcessorConfig( vllm_processor = build_llm_processor( config, preprocess=lambda row: dict( - messages=[{ - "role": "system", - "content": "You are a bot that responds with haikus." - }, { - "role": "user", - "content": row["text"] - }], + messages=[ + {"role": "system", "content": "You are a bot that responds with haikus."}, + {"role": "user", "content": row["text"]}, + ], sampling_params=dict( temperature=0.3, max_tokens=250, - )), + ), + ), postprocess=lambda row: dict( answer=row["generated_text"], - **row # This will return all the original columns in the dataset. + **row, # This will return all the original columns in the dataset. ), ) diff --git a/examples/offline_inference/chat_with_tools.py b/examples/offline_inference/chat_with_tools.py index b532bf42adfba..61230d8955842 100644 --- a/examples/offline_inference/chat_with_tools.py +++ b/examples/offline_inference/chat_with_tools.py @@ -50,87 +50,93 @@ model_name = "mistralai/Mistral-7B-Instruct-v0.3" # or any other mistral model with function calling ability sampling_params = SamplingParams(max_tokens=8192, temperature=0.0) -llm = LLM(model=model_name, - tokenizer_mode="mistral", - config_format="mistral", - load_format="mistral") +llm = LLM( + model=model_name, + tokenizer_mode="mistral", + config_format="mistral", + load_format="mistral", +) def generate_random_id(length=9): characters = string.ascii_letters + string.digits - random_id = ''.join(random.choice(characters) for _ in range(length)) + random_id = "".join(random.choice(characters) for _ in range(length)) return random_id # simulate an API that can be called -def get_current_weather(city: str, state: str, unit: 'str'): - return (f"The weather in {city}, {state} is 85 degrees {unit}. It is " - "partly cloudly, with highs in the 90's.") +def get_current_weather(city: str, state: str, unit: "str"): + return ( + f"The weather in {city}, {state} is 85 degrees {unit}. 
It is " + "partly cloudly, with highs in the 90's." + ) tool_functions = {"get_current_weather": get_current_weather} -tools = [{ - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "city": { - "type": - "string", - "description": - "The city to find the weather for, e.g. 'San Francisco'" +tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The city to find the weather for, e.g. 'San Francisco'", + }, + "state": { + "type": "string", + "description": "the two-letter abbreviation for the state that the city is" + " in, e.g. 'CA' which would mean 'California'", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, }, - "state": { - "type": - "string", - "description": - "the two-letter abbreviation for the state that the city is" - " in, e.g. 'CA' which would mean 'California'" - }, - "unit": { - "type": "string", - "description": "The unit to fetch the temperature in", - "enum": ["celsius", "fahrenheit"] - } + "required": ["city", "state", "unit"], }, - "required": ["city", "state", "unit"] - } + }, } -}] +] -messages = [{ - "role": - "user", - "content": - "Can you tell me what the temperate will be in Dallas, in fahrenheit?" 
-}] +messages = [ + { + "role": "user", + "content": "Can you tell me what the temperate will be in Dallas, in fahrenheit?", + } +] outputs = llm.chat(messages, sampling_params=sampling_params, tools=tools) output = outputs[0].outputs[0].text.strip() # append the assistant message -messages.append({ - "role": "assistant", - "content": output, -}) +messages.append( + { + "role": "assistant", + "content": output, + } +) # let's now actually parse and execute the model's output simulating an API call by using the # above defined function tool_calls = json.loads(output) tool_answers = [ - tool_functions[call['name']](**call['arguments']) for call in tool_calls + tool_functions[call["name"]](**call["arguments"]) for call in tool_calls ] # append the answer as a tool message and let the LLM give you an answer -messages.append({ - "role": "tool", - "content": "\n\n".join(tool_answers), - "tool_call_id": generate_random_id(), -}) +messages.append( + { + "role": "tool", + "content": "\n\n".join(tool_answers), + "tool_call_id": generate_random_id(), + } +) outputs = llm.chat(messages, sampling_params, tools=tools) diff --git a/examples/offline_inference/context_extension.py b/examples/offline_inference/context_extension.py new file mode 100644 index 0000000000000..1a70446c30a05 --- /dev/null +++ b/examples/offline_inference/context_extension.py @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm import LLM, SamplingParams + +rope_theta = 1000000 +original_max_position_embeddings = 32768 +factor = 4.0 + +# Use yarn to extend context +hf_overrides = { + "rope_theta": rope_theta, + "rope_scaling": { + "rope_type": "yarn", + "factor": factor, + "original_max_position_embeddings": original_max_position_embeddings, + }, + "max_model_len": int(original_max_position_embeddings * factor), +} + +llm = LLM(model="Qwen/Qwen3-0.6B", hf_overrides=hf_overrides) + +sampling_params = SamplingParams( + temperature=0.8, + top_p=0.95, + max_tokens=128, +) + +conversation = [ + 
{"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hello! How can I assist you today?"}, +] +outputs = llm.chat(conversation, sampling_params, use_tqdm=False) + + +def print_outputs(outputs): + print("\nGenerated Outputs:\n" + "-" * 80) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\n") + print(f"Generated text: {generated_text!r}") + print("-" * 80) + + +print_outputs(outputs) diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index 054fa33403d04..3f71b498304a0 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -27,6 +27,7 @@ Multi-node: --master-addr=10.99.48.128 \ --master-port=13345 """ + import os from time import sleep @@ -87,10 +88,14 @@ def main(args, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, # with DP, each rank should process different prompts. # usually all the DP ranks process a full dataset, # and each rank processes a different part of the dataset. - promts_per_rank = len(prompts) // dp_size - start = global_dp_rank * promts_per_rank - end = start + promts_per_rank - prompts = prompts[start:end] + floor = len(prompts) // dp_size + remainder = len(prompts) % dp_size + + # Distribute prompts into even groups. + def start(rank): + return rank * floor + min(rank, remainder) + + prompts = prompts[start(global_dp_rank) : start(global_dp_rank + 1)] if len(prompts) == 0: # if any rank has no prompts to process, # we need to set a placeholder prompt @@ -101,9 +106,9 @@ def main(args, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, # since we are doing data parallel, every rank can have different # sampling params. here we set different max_tokens for different # ranks for demonstration. 
- sampling_params = SamplingParams(temperature=0.8, - top_p=0.95, - max_tokens=[16, 20][global_dp_rank % 2]) + sampling_params = SamplingParams( + temperature=0.8, top_p=0.95, max_tokens=[16, 20][global_dp_rank % 2] + ) # Fixed params args.pop("tensor_parallel_size") @@ -123,8 +128,10 @@ def main(args, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, break prompt = output.prompt generated_text = output.outputs[0].text - print(f"DP rank {global_dp_rank}, Prompt: {prompt!r}, " - f"Generated text: {generated_text!r}") + print( + f"DP rank {global_dp_rank}, Prompt: {prompt!r}, " + f"Generated text: {generated_text!r}" + ) # Give engines time to pause their processing loops before exiting. sleep(1) @@ -172,8 +179,7 @@ if __name__ == "__main__": for proc in procs: proc.join(timeout=300) if proc.exitcode is None: - print(f"Killing process {proc.pid} that " - f"didn't stop within 5 minutes.") + print(f"Killing process {proc.pid} that didn't stop within 5 minutes.") proc.kill() exit_code = 1 elif proc.exitcode: diff --git a/examples/offline_inference/disaggregated-prefill-v1/README.md b/examples/offline_inference/disaggregated-prefill-v1/README.md index f708eb2538380..9cbdb19820f56 100644 --- a/examples/offline_inference/disaggregated-prefill-v1/README.md +++ b/examples/offline_inference/disaggregated-prefill-v1/README.md @@ -5,5 +5,6 @@ This example contains scripts that demonstrate disaggregated prefill in the offl ## Files - `run.sh` - A helper script that will run `prefill_example.py` and `decode_example.py` sequentially. + - Make sure you are in the `examples/offline_inference/disaggregated-prefill-v1` directory before running `run.sh`. - `prefill_example.py` - A script which performs prefill only, saving the KV state to the `local_storage` directory and the prompts to `output.txt`. - `decode_example.py` - A script which performs decode only, loading the KV state from the `local_storage` directory and the prompts from `output.txt`. 
diff --git a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py index 11918f72feec8..4ae5d3310e0bf 100644 --- a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py +++ b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py @@ -3,35 +3,48 @@ from vllm import LLM, SamplingParams from vllm.config import KVTransferConfig -# Read prompts from output.txt -prompts = [] -try: - with open("output.txt") as f: - for line in f: - prompts.append(line.strip()) - print(f"Loaded {len(prompts)} prompts from output.txt") -except FileNotFoundError: - print("Error: output.txt file not found") - exit(-1) -sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) +def read_prompts(): + """Read prompts from output.txt""" + prompts = [] + try: + with open("output.txt") as f: + for line in f: + prompts.append(line.strip()) + print(f"Loaded {len(prompts)} prompts from output.txt") + return prompts + except FileNotFoundError: + print("Error: output.txt file not found") + exit(-1) -llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", - enforce_eager=True, - gpu_memory_utilization=0.8, - max_num_batched_tokens=64, - max_num_seqs=16, - kv_transfer_config=KVTransferConfig( - kv_connector="SharedStorageConnector", - kv_role="kv_both", - kv_connector_extra_config={ - "shared_storage_path": "local_storage" - })) #, max_model_len=2048, max_num_batched_tokens=2048) -# 1ST generation (prefill instance) -outputs = llm.generate(prompts, sampling_params) +def main(): + prompts = read_prompts() + sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + llm = LLM( + model="meta-llama/Llama-3.2-1B-Instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + max_num_batched_tokens=64, + 
max_num_seqs=16, + kv_transfer_config=KVTransferConfig( + kv_connector="SharedStorageConnector", + kv_role="kv_both", + kv_connector_extra_config={"shared_storage_path": "local_storage"}, + ), + ) # , max_model_len=2048, max_num_batched_tokens=2048) + + # 1ST generation (prefill instance) + outputs = llm.generate(prompts, sampling_params) + + print("-" * 30) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 30) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py index 798128301e0f0..5757a8a84b86a 100644 --- a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py +++ b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py @@ -3,42 +3,55 @@ from vllm import LLM, SamplingParams from vllm.config import KVTransferConfig -context = "Hi " * 1000 -context2 = "Hey " * 500 -prompts = [ - context + "Hello, my name is", - context + "The capital of France is", - context2 + "Your name is", - context2 + "The capital of China is", -] -sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) +def read_prompts(): + context = "Hi " * 1000 + context2 = "Hey " * 500 + return [ + context + "Hello, my name is", + context + "The capital of France is", + context2 + "Your name is", + context2 + "The capital of China is", + ] -llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", - enforce_eager=True, - gpu_memory_utilization=0.8, - kv_transfer_config=KVTransferConfig( - kv_connector="SharedStorageConnector", - kv_role="kv_both", - kv_connector_extra_config={ - "shared_storage_path": "local_storage" - })) #, max_model_len=2048, max_num_batched_tokens=2048) -# 1ST generation (prefill instance) -outputs = llm.generate( - prompts, - sampling_params, -) +def main(): + prompts = 
read_prompts() -new_prompts = [] -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - new_prompts.append(prompt + generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) -# Write new_prompts to output.txt -with open("output.txt", "w") as f: - for prompt in new_prompts: - f.write(prompt + "\n") -print(f"Saved {len(new_prompts)} prompts to output.txt") + llm = LLM( + model="meta-llama/Llama-3.2-1B-Instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + kv_transfer_config=KVTransferConfig( + kv_connector="SharedStorageConnector", + kv_role="kv_both", + kv_connector_extra_config={"shared_storage_path": "local_storage"}, + ), + ) # , max_model_len=2048, max_num_batched_tokens=2048) + + # 1ST generation (prefill instance) + outputs = llm.generate( + prompts, + sampling_params, + ) + + new_prompts = [] + print("-" * 30) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + new_prompts.append(prompt + generated_text) + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 30) + + # Write new_prompts to output.txt + with open("output.txt", "w") as f: + for prompt in new_prompts: + f.write(prompt + "\n") + print(f"Saved {len(new_prompts)} prompts to output.txt") + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/disaggregated_prefill.py b/examples/offline_inference/disaggregated_prefill.py index bb6fdd48f79e1..3ccab0dcd6d32 100644 --- a/examples/offline_inference/disaggregated_prefill.py +++ b/examples/offline_inference/disaggregated_prefill.py @@ -4,6 +4,7 @@ This file demonstrates the example usage of disaggregated prefilling We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode), and then transfer the KV cache between them. 
""" + import os import time from multiprocessing import Event, Process @@ -32,17 +33,21 @@ def run_prefill(prefill_done): # This instance is the prefill node (kv_producer, rank 0). # The number of parallel instances for KV cache transfer is set to 2, # as required for PyNcclConnector. - ktc = KVTransferConfig(kv_connector="PyNcclConnector", - kv_role="kv_producer", - kv_rank=0, - kv_parallel_size=2) + ktc = KVTransferConfig( + kv_connector="PyNcclConnector", + kv_role="kv_producer", + kv_rank=0, + kv_parallel_size=2, + ) # Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB # memory. You may need to adjust the value to fit your GPU. - llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct", - kv_transfer_config=ktc, - max_model_len=2000, - gpu_memory_utilization=0.8) + llm = LLM( + model="meta-llama/Meta-Llama-3.1-8B-Instruct", + kv_transfer_config=ktc, + max_model_len=2000, + gpu_memory_utilization=0.8, + ) llm.generate(prompts, sampling_params) print("Prefill node is finished.") @@ -72,17 +77,21 @@ def run_decode(prefill_done): # This instance is the decode node (kv_consumer, rank 1). # The number of parallel instances for KV cache transfer is set to 2, # as required for PyNcclConnector. - ktc = KVTransferConfig(kv_connector="PyNcclConnector", - kv_role="kv_consumer", - kv_rank=1, - kv_parallel_size=2) + ktc = KVTransferConfig( + kv_connector="PyNcclConnector", + kv_role="kv_consumer", + kv_rank=1, + kv_parallel_size=2, + ) # Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB # memory. You may need to adjust the value to fit your GPU. 
- llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct", - kv_transfer_config=ktc, - max_model_len=2000, - gpu_memory_utilization=0.8) + llm = LLM( + model="meta-llama/Meta-Llama-3.1-8B-Instruct", + kv_transfer_config=ktc, + max_model_len=2000, + gpu_memory_utilization=0.8, + ) # Wait for the producer to start the pipe print("Waiting for prefill node to finish...") @@ -99,8 +108,8 @@ def run_decode(prefill_done): def main(): prefill_done = Event() - prefill_process = Process(target=run_prefill, args=(prefill_done, )) - decode_process = Process(target=run_decode, args=(prefill_done, )) + prefill_process = Process(target=run_prefill, args=(prefill_done,)) + decode_process = Process(target=run_decode, args=(prefill_done,)) # Start prefill node prefill_process.start() diff --git a/examples/offline_inference/eagle.py b/examples/offline_inference/eagle.py index 615f67e9f8d81..606ce7799a88f 100644 --- a/examples/offline_inference/eagle.py +++ b/examples/offline_inference/eagle.py @@ -6,6 +6,7 @@ import os from transformers import AutoTokenizer from vllm import LLM, SamplingParams +from vllm.v1.metrics.reader import Counter, Vector def load_prompts(dataset_path, num_prompts): @@ -20,9 +21,7 @@ def load_prompts(dataset_path, num_prompts): print(f"Error reading dataset: {e}") return [] else: - prompts = [ - "The future of AI is", "The president of the United States is" - ] + prompts = ["The future of AI is", "The president of the United States is"] return prompts[:num_prompts] @@ -33,34 +32,32 @@ def parse_args(): "--dataset", type=str, default="./examples/data/gsm8k.jsonl", - help="downloaded from the eagle repo " \ - "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/" + help="downloaded from the eagle repo " + "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/", + ) + parser.add_argument( + "--method", type=str, default="eagle", choices=["eagle", "eagle3"] ) - parser.add_argument("--method", - type=str, - default='eagle', - choices=['eagle', 'eagle3']) 
parser.add_argument("--max_num_seqs", type=int, default=8) parser.add_argument("--num_prompts", type=int, default=80) parser.add_argument("--num_spec_tokens", type=int, default=2) parser.add_argument("--tp", type=int, default=1) parser.add_argument("--draft_tp", type=int, default=1) - parser.add_argument("--enforce_eager", action='store_true') - parser.add_argument("--enable_chunked_prefill", action='store_true') + parser.add_argument("--enforce_eager", action="store_true") + parser.add_argument("--enable_chunked_prefill", action="store_true") parser.add_argument("--max_num_batched_tokens", type=int, default=2048) parser.add_argument("--temp", type=float, default=0) return parser.parse_args() def main(): - args = parse_args() model_dir = "meta-llama/Llama-3.1-8B-Instruct" - if args.method == 'eagle': + if args.method == "eagle": eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" - elif args.method == 'eagle3': + elif args.method == "eagle3": eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" else: raise ValueError(f"unknown method: {args.method}") @@ -72,11 +69,9 @@ def main(): prompts = load_prompts(args.dataset, args.num_prompts) prompt_ids = [ - tokenizer.apply_chat_template([{ - "role": "user", - "content": prompt - }], - add_generation_prompt=True) + tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], add_generation_prompt=True + ) for prompt in prompts ] @@ -102,8 +97,7 @@ def main(): sampling_params = SamplingParams(temperature=args.temp, max_tokens=256) - outputs = llm.generate(prompt_token_ids=prompt_ids, - sampling_params=sampling_params) + outputs = llm.generate(prompt_token_ids=prompt_ids, sampling_params=sampling_params) # print the generated text for output in outputs: @@ -112,27 +106,33 @@ def main(): print(f"generated text: {output.outputs[0].text}") print("-" * 50) - if not hasattr(outputs, "metrics") or outputs.metrics is None: + try: + metrics = llm.get_metrics() + except AssertionError: + print("Metrics are not supported in the 
V0 engine.") return - # calculate the average number of accepted tokens per forward pass, +1 is - # to account for the token from the target model that's always going to be - # accepted - acceptance_counts = [0] * (args.num_spec_tokens + 1) - for output in outputs: - for step, count in enumerate( - output.metrics.spec_token_acceptance_counts): - acceptance_counts[step] += count + num_drafts = num_accepted = 0 + acceptance_counts = [0] * args.num_spec_tokens + for metric in metrics: + if metric.name == "vllm:spec_decode_num_drafts": + assert isinstance(metric, Counter) + num_drafts += metric.value + elif metric.name == "vllm:spec_decode_num_accepted_tokens": + assert isinstance(metric, Counter) + num_accepted += metric.value + elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos": + assert isinstance(metric, Vector) + for pos in range(len(metric.values)): + acceptance_counts[pos] += metric.values[pos] print("-" * 50) - print(f"mean acceptance length (including bonus tokens): \ - {1 + (sum(acceptance_counts) / acceptance_counts[0]):.2f}") + print(f"mean acceptance length: {1 + (num_accepted / num_drafts):.2f}") print("-" * 50) # print acceptance at each token position for i in range(len(acceptance_counts)): - print(f"acceptance at token {i}:" - f"{acceptance_counts[i] / (acceptance_counts[0]):.2f}") + print(f"acceptance at token {i}:{acceptance_counts[i] / num_drafts:.2f}") if __name__ == "__main__": diff --git a/examples/offline_inference/embed_jina_embeddings_v3.py b/examples/offline_inference/embed_jina_embeddings_v3.py index b347ddbf3197a..23f60c431fc24 100644 --- a/examples/offline_inference/embed_jina_embeddings_v3.py +++ b/examples/offline_inference/embed_jina_embeddings_v3.py @@ -10,9 +10,9 @@ def parse_args(): parser = FlexibleArgumentParser() parser = EngineArgs.add_cli_args(parser) # Set example specific arguments - parser.set_defaults(model="jinaai/jina-embeddings-v3", - task="embed", - trust_remote_code=True) + parser.set_defaults( + 
model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True + ) return parser.parse_args() @@ -41,11 +41,14 @@ def main(args: Namespace): print("-" * 60) for prompt, output in zip(prompts, outputs): embeds = output.outputs.embedding - embeds_trimmed = ((str(embeds[:16])[:-1] + - ", ...]") if len(embeds) > 16 else embeds) - print(f"Prompt: {prompt!r} \n" - f"Embeddings for text matching: {embeds_trimmed} " - f"(size={len(embeds)})") + embeds_trimmed = ( + (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds + ) + print( + f"Prompt: {prompt!r} \n" + f"Embeddings for text matching: {embeds_trimmed} " + f"(size={len(embeds)})" + ) print("-" * 60) diff --git a/examples/offline_inference/embed_matryoshka_fy.py b/examples/offline_inference/embed_matryoshka_fy.py index 7a6cb02556d9a..59c0592ae9e23 100644 --- a/examples/offline_inference/embed_matryoshka_fy.py +++ b/examples/offline_inference/embed_matryoshka_fy.py @@ -10,9 +10,9 @@ def parse_args(): parser = FlexibleArgumentParser() parser = EngineArgs.add_cli_args(parser) # Set example specific arguments - parser.set_defaults(model="jinaai/jina-embeddings-v3", - task="embed", - trust_remote_code=True) + parser.set_defaults( + model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True + ) return parser.parse_args() @@ -39,11 +39,10 @@ def main(args: Namespace): print("-" * 60) for prompt, output in zip(prompts, outputs): embeds = output.outputs.embedding - embeds_trimmed = ((str(embeds[:16])[:-1] + - ", ...]") if len(embeds) > 16 else embeds) - print(f"Prompt: {prompt!r} \n" - f"Embeddings: {embeds_trimmed} " - f"(size={len(embeds)})") + embeds_trimmed = ( + (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds + ) + print(f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})") print("-" * 60) diff --git a/examples/offline_inference/encoder_decoder.py b/examples/offline_inference/encoder_decoder.py index c4916e00f473c..83dd1f667eb5f 100644 --- 
a/examples/offline_inference/encoder_decoder.py +++ b/examples/offline_inference/encoder_decoder.py @@ -1,12 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 -''' +""" Demonstrate prompting of text-to-text encoder/decoder models, specifically BART -''' +""" from vllm import LLM, SamplingParams -from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, - TokensPrompt, zip_enc_dec_prompts) +from vllm.inputs import ( + ExplicitEncoderDecoderPrompt, + TextPrompt, + TokensPrompt, + zip_enc_dec_prompts, +) def create_prompts(tokenizer): @@ -18,8 +22,9 @@ def create_prompts(tokenizer): # - Helpers for building prompts text_prompt_raw = "Hello, my name is" text_prompt = TextPrompt(prompt="The president of the United States is") - tokens_prompt = TokensPrompt(prompt_token_ids=tokenizer.encode( - prompt="The capital of France is")) + tokens_prompt = TokensPrompt( + prompt_token_ids=tokenizer.encode(prompt="The capital of France is") + ) # - Pass a single prompt to encoder/decoder model # (implicitly encoder input prompt); # decoder input prompt is assumed to be None @@ -57,14 +62,19 @@ def create_prompts(tokenizer): # decoder prompts together into a list of ExplicitEncoderDecoderPrompt # instances zipped_prompt_list = zip_enc_dec_prompts( - ['An encoder prompt', 'Another encoder prompt'], - ['A decoder prompt', 'Another decoder prompt']) + ["An encoder prompt", "Another encoder prompt"], + ["A decoder prompt", "Another decoder prompt"], + ) # - Let's put all of the above example prompts together into one list # which we will pass to the encoder/decoder LLM. 
return [ - single_text_prompt_raw, single_text_prompt, single_tokens_prompt, - enc_dec_prompt1, enc_dec_prompt2, enc_dec_prompt3 + single_text_prompt_raw, + single_text_prompt, + single_tokens_prompt, + enc_dec_prompt1, + enc_dec_prompt2, + enc_dec_prompt3, ] + zipped_prompt_list @@ -85,10 +95,12 @@ def print_outputs(outputs): prompt = output.prompt encoder_prompt = output.encoder_prompt generated_text = output.outputs[0].text - print(f"Output {i+1}:") - print(f"Encoder prompt: {encoder_prompt!r}\n" - f"Decoder prompt: {prompt!r}\n" - f"Generated text: {generated_text!r}") + print(f"Output {i + 1}:") + print( + f"Encoder prompt: {encoder_prompt!r}\n" + f"Decoder prompt: {prompt!r}\n" + f"Generated text: {generated_text!r}" + ) print("-" * 50) diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py index 2883c37ca2360..ae3737e375941 100644 --- a/examples/offline_inference/encoder_decoder_multimodal.py +++ b/examples/offline_inference/encoder_decoder_multimodal.py @@ -3,6 +3,7 @@ This example shows how to use vLLM for running offline inference with the explicit/implicit prompt format on enc-dec LMMs for text generation. 
""" + import time from collections.abc import Sequence from dataclasses import asdict @@ -30,18 +31,14 @@ def run_florence2(): ) prompts = [ - { # implicit prompt with task token + { # implicit prompt with task token "prompt": "<DETAILED_CAPTION>", - "multi_modal_data": { - "image": ImageAsset("stop_sign").pil_image - }, + "multi_modal_data": {"image": ImageAsset("stop_sign").pil_image}, }, - { # explicit encoder/decoder prompt + { # explicit encoder/decoder prompt "encoder_prompt": { "prompt": "Describe in detail what is shown in the image.", - "multi_modal_data": { - "image": ImageAsset("cherry_blossom").pil_image - }, + "multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image}, }, "decoder_prompt": "", }, @@ -63,20 +60,20 @@ def run_mllama(): ) prompts = [ - { # Implicit prompt - "prompt": "<|image|><|begin_of_text|>What is the content of this image?", # noqa: E501 + { # Implicit prompt + "prompt": "<|image|><|begin_of_text|>What is the content of this image?", # noqa: E501 "multi_modal_data": { "image": ImageAsset("stop_sign").pil_image, }, }, - { # Explicit prompt + { # Explicit prompt "encoder_prompt": { "prompt": "<|image|>", "multi_modal_data": { "image": ImageAsset("stop_sign").pil_image, }, }, - "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501 + "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501 }, ] @@ -96,13 +93,13 @@ def run_whisper(): ) prompts = [ - { # Test implicit prompt + { # Test implicit prompt "prompt": "<|startoftranscript|>", "multi_modal_data": { "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate, }, }, - { # Test explicit encoder/decoder prompt + { # Test explicit encoder/decoder prompt "encoder_prompt": { "prompt": "", "multi_modal_data": { @@ -110,7 +107,7 @@ def run_whisper(): }, }, "decoder_prompt": "<|startoftranscript|>", - } + }, ] return ModelRequestData( @@ -128,18 +125,23 @@ model_example_map = { def parse_args(): parser = 
FlexibleArgumentParser( - description='Demo on using vLLM for offline inference with ' - 'vision language models for text generation') - parser.add_argument('--model-type', - '-m', - type=str, - default="mllama", - choices=model_example_map.keys(), - help='Huggingface "model_type".') - parser.add_argument("--seed", - type=int, - default=None, - help="Set the seed when initializing `vllm.LLM`.") + description="Demo on using vLLM for offline inference with " + "vision language models for text generation" + ) + parser.add_argument( + "--model-type", + "-m", + type=str, + default="mllama", + choices=model_example_map.keys(), + help='Huggingface "model_type".', + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.", + ) return parser.parse_args() @@ -153,7 +155,8 @@ def main(args): # Disable other modalities to save memory default_limits = {"image": 0, "video": 0, "audio": 0} req_data.engine_args.limit_mm_per_prompt = default_limits | dict( - req_data.engine_args.limit_mm_per_prompt or {}) + req_data.engine_args.limit_mm_per_prompt or {} + ) engine_args = asdict(req_data.engine_args) | {"seed": args.seed} llm = LLM(**engine_args) @@ -179,8 +182,7 @@ def main(args): for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Decoder prompt: {prompt!r}, " - f"Generated text: {generated_text!r}") + print(f"Decoder prompt: {prompt!r}, Generated text: {generated_text!r}") duration = time.time() - start diff --git a/examples/offline_inference/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py index d84cd9ee9f52b..5d5e55a83d221 100644 --- a/examples/offline_inference/llm_engine_example.py +++ b/examples/offline_inference/llm_engine_example.py @@ -3,6 +3,7 @@ This file demonstrates using the `LLMEngine` for processing prompts with various sampling parameters. 
""" + import argparse from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams @@ -12,24 +13,26 @@ from vllm.utils import FlexibleArgumentParser def create_test_prompts() -> list[tuple[str, SamplingParams]]: """Create a list of test prompts with their sampling parameters.""" return [ - ("A robot may not injure a human being", - SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1)), - ("To be or not to be,", - SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)), - ("What is the meaning of life?", - SamplingParams(n=2, - temperature=0.8, - top_p=0.95, - frequency_penalty=0.1)), + ( + "A robot may not injure a human being", + SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1), + ), + ( + "To be or not to be,", + SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2), + ), + ( + "What is the meaning of life?", + SamplingParams(n=2, temperature=0.8, top_p=0.95, frequency_penalty=0.1), + ), ] -def process_requests(engine: LLMEngine, - test_prompts: list[tuple[str, SamplingParams]]): +def process_requests(engine: LLMEngine, test_prompts: list[tuple[str, SamplingParams]]): """Continuously process a list of prompts and handle the outputs.""" request_id = 0 - print('-' * 50) + print("-" * 50) while test_prompts or engine.has_unfinished_requests(): if test_prompts: prompt, sampling_params = test_prompts.pop(0) @@ -41,7 +44,7 @@ def process_requests(engine: LLMEngine, for request_output in request_outputs: if request_output.finished: print(request_output) - print('-' * 50) + print("-" * 50) def initialize_engine(args: argparse.Namespace) -> LLMEngine: @@ -52,7 +55,8 @@ def initialize_engine(args: argparse.Namespace) -> LLMEngine: def parse_args(): parser = FlexibleArgumentParser( - description='Demo on using the LLMEngine class directly') + description="Demo on using the LLMEngine class directly" + ) parser = EngineArgs.add_cli_args(parser) return parser.parse_args() @@ -64,6 +68,6 @@ def main(args: argparse.Namespace): 
process_requests(engine, test_prompts) -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() main(args) diff --git a/examples/offline_inference/load_sharded_state.py b/examples/offline_inference/load_sharded_state.py index 7e90d5d25e293..5bb2327a3f83e 100644 --- a/examples/offline_inference/load_sharded_state.py +++ b/examples/offline_inference/load_sharded_state.py @@ -36,22 +36,21 @@ def parse_args(): parser.set_defaults(load_format="sharded_state") # Add validation arguments - parser.add_argument("--prompt", - type=str, - default="Hello, world!", - help="Prompt for validation") - parser.add_argument("--max-tokens", - type=int, - default=100, - help="Maximum number of tokens to generate") - parser.add_argument("--temperature", - type=float, - default=0.7, - help="Sampling temperature") - parser.add_argument("--top-p", - type=float, - default=1.0, - help="Top-p sampling parameter") + parser.add_argument( + "--prompt", type=str, default="Hello, world!", help="Prompt for validation" + ) + parser.add_argument( + "--max-tokens", + type=int, + default=100, + help="Maximum number of tokens to generate", + ) + parser.add_argument( + "--temperature", type=float, default=0.7, help="Sampling temperature" + ) + parser.add_argument( + "--top-p", type=float, default=1.0, help="Top-p sampling parameter" + ) return parser.parse_args() @@ -60,8 +59,9 @@ def main(): args = parse_args() engine_args = EngineArgs.from_cli_args(args) - print(f"Loading model from {engine_args.model} " - f"using format {engine_args.load_format}") + print( + f"Loading model from {engine_args.model} using format {engine_args.load_format}" + ) print(f"Tensor parallel size: {engine_args.tensor_parallel_size}") # Load the model using engine args @@ -90,4 +90,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py 
index b6608ec6e9580..33c660015ba76 100644 --- a/examples/offline_inference/lora_with_quantization_inference.py +++ b/examples/offline_inference/lora_with_quantization_inference.py @@ -17,50 +17,55 @@ from vllm.lora.request import LoRARequest def create_test_prompts( - lora_path: str + lora_path: str, ) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]: return [ # this is an example of using quantization without LoRA - ("My name is", - SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, - max_tokens=128), None), + ( + "My name is", + SamplingParams( + temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128 + ), + None, + ), # the next three examples use quantization with LoRA - ("my name is", - SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, - max_tokens=128), - LoRARequest("lora-test-1", 1, lora_path)), - ("The capital of USA is", - SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, - max_tokens=128), - LoRARequest("lora-test-2", 1, lora_path)), - ("The capital of France is", - SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, - max_tokens=128), - LoRARequest("lora-test-3", 1, lora_path)), + ( + "my name is", + SamplingParams( + temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128 + ), + LoRARequest("lora-test-1", 1, lora_path), + ), + ( + "The capital of USA is", + SamplingParams( + temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128 + ), + LoRARequest("lora-test-2", 1, lora_path), + ), + ( + "The capital of France is", + SamplingParams( + temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128 + ), + LoRARequest("lora-test-3", 1, lora_path), + ), ] -def process_requests(engine: LLMEngine, - test_prompts: list[tuple[str, SamplingParams, - Optional[LoRARequest]]]): +def process_requests( + engine: LLMEngine, + test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]], +): """Continuously process a list of prompts and handle the outputs.""" 
request_id = 0 while test_prompts or engine.has_unfinished_requests(): if test_prompts: prompt, sampling_params, lora_request = test_prompts.pop(0) - engine.add_request(str(request_id), - prompt, - sampling_params, - lora_request=lora_request) + engine.add_request( + str(request_id), prompt, sampling_params, lora_request=lora_request + ) request_id += 1 request_outputs: list[RequestOutput] = engine.step() @@ -71,15 +76,18 @@ def process_requests(engine: LLMEngine, print(f"Output: {request_output.outputs[0].text}") -def initialize_engine(model: str, quantization: str, - lora_repo: Optional[str]) -> LLMEngine: +def initialize_engine( + model: str, quantization: str, lora_repo: Optional[str] +) -> LLMEngine: """Initialize the LLMEngine.""" - engine_args = EngineArgs(model=model, - quantization=quantization, - enable_lora=True, - max_lora_rank=64, - max_loras=4) + engine_args = EngineArgs( + model=model, + quantization=quantization, + enable_lora=True, + max_lora_rank=64, + max_loras=4, + ) return LLMEngine.from_engine_args(engine_args) @@ -90,32 +98,30 @@ def main(): # QLoRA (https://arxiv.org/abs/2305.14314) { "name": "qlora_inference_example", - 'model': "huggyllama/llama-7b", - 'quantization': "bitsandbytes", - 'lora_repo': 'timdettmers/qlora-flan-7b' + "model": "huggyllama/llama-7b", + "quantization": "bitsandbytes", + "lora_repo": "timdettmers/qlora-flan-7b", }, { "name": "AWQ_inference_with_lora_example", - 'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ', - 'quantization': "awq", - 'lora_repo': 'jashing/tinyllama-colorist-lora' + "model": "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", + "quantization": "awq", + "lora_repo": "jashing/tinyllama-colorist-lora", }, { "name": "GPTQ_inference_with_lora_example", - 'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ', - 'quantization': "gptq", - 'lora_repo': 'jashing/tinyllama-colorist-lora' - } + "model": "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", + "quantization": "gptq", + "lora_repo": "jashing/tinyllama-colorist-lora", + }, 
] for test_config in test_configs: - print( - f"~~~~~~~~~~~~~~~~ Running: {test_config['name']} ~~~~~~~~~~~~~~~~" + print(f"~~~~~~~~~~~~~~~~ Running: {test_config['name']} ~~~~~~~~~~~~~~~~") + engine = initialize_engine( + test_config["model"], test_config["quantization"], test_config["lora_repo"] ) - engine = initialize_engine(test_config['model'], - test_config['quantization'], - test_config['lora_repo']) - lora_path = snapshot_download(repo_id=test_config['lora_repo']) + lora_path = snapshot_download(repo_id=test_config["lora_repo"]) test_prompts = create_test_prompts(lora_path) process_requests(engine, test_prompts) @@ -125,5 +131,5 @@ def main(): torch.cuda.empty_cache() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/offline_inference/metrics.py b/examples/offline_inference/metrics.py new file mode 100644 index 0000000000000..7927f758cb575 --- /dev/null +++ b/examples/offline_inference/metrics.py @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm import LLM, SamplingParams +from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Vector + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + +def main(): + # Create an LLM. + llm = LLM(model="facebook/opt-125m", disable_log_stats=False) + + # Generate texts from the prompts. + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. 
+ print("-" * 50) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) + + # Dump all metrics + for metric in llm.get_metrics(): + if isinstance(metric, Gauge): + print(f"{metric.name} (gauge) = {metric.value}") + elif isinstance(metric, Counter): + print(f"{metric.name} (counter) = {metric.value}") + elif isinstance(metric, Vector): + print(f"{metric.name} (vector) = {metric.values}") + elif isinstance(metric, Histogram): + print(f"{metric.name} (histogram)") + print(f" sum = {metric.sum}") + print(f" count = {metric.count}") + for bucket_le, value in metric.buckets.items(): + print(f" {bucket_le} = {value}") + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py index 37c3181dc5faf..98fef2648f6bb 100644 --- a/examples/offline_inference/mistral-small.py +++ b/examples/offline_inference/mistral-small.py @@ -74,19 +74,10 @@ def run_simple_demo(args: argparse.Namespace): messages = [ { - "role": - "user", + "role": "user", "content": [ - { - "type": "text", - "text": prompt - }, - { - "type": "image_url", - "image_url": { - "url": image_url - } - }, + {"type": "text", "text": prompt}, + {"type": "image_url", "image_url": {"url": image_url}}, ], }, ] @@ -121,25 +112,11 @@ def run_advanced_demo(args: argparse.Namespace): messages = [ { - "role": - "user", + "role": "user", "content": [ - { - "type": "text", - "text": prompt - }, - { - "type": "image_url", - "image_url": { - "url": url_1 - } - }, - { - "type": "image_url", - "image_url": { - "url": url_2 - } - }, + {"type": "text", "text": prompt}, + {"type": "image_url", "image_url": {"url": url_1}}, + {"type": "image_url", "image_url": {"url": url_2}}, ], }, { @@ -153,12 +130,7 @@ def run_advanced_demo(args: argparse.Namespace): { "role": "user", "content": [ - { - "type": "image_url", - "image_url": { - 
"url": url_3 - } - }, + {"type": "image_url", "image_url": {"url": url_3}}, ], }, ] @@ -171,7 +143,8 @@ def run_advanced_demo(args: argparse.Namespace): def parse_args(): parser = argparse.ArgumentParser( - description="Run a demo in simple or advanced mode.") + description="Run a demo in simple or advanced mode." + ) parser.add_argument( "mode", @@ -179,15 +152,18 @@ def parse_args(): help="Specify the demo mode: 'simple' or 'advanced'", ) - parser.add_argument('--format', - choices=["mistral", "hf"], - default="mistral", - help='Specify the format of the model to load.') + parser.add_argument( + "--format", + choices=["mistral", "hf"], + default="mistral", + help="Specify the format of the model to load.", + ) parser.add_argument( - '--disable-mm-preprocessor-cache', - action='store_true', - help='If True, disables caching of multi-modal preprocessor/mapper.') + "--disable-mm-preprocessor-cache", + action="store_true", + help="If True, disables caching of multi-modal preprocessor/mapper.", + ) return parser.parse_args() diff --git a/examples/offline_inference/mlpspeculator.py b/examples/offline_inference/mlpspeculator.py index 53c58a76d9dc1..b750397f45b8d 100644 --- a/examples/offline_inference/mlpspeculator.py +++ b/examples/offline_inference/mlpspeculator.py @@ -13,8 +13,9 @@ import time from vllm import LLM, SamplingParams -def time_generation(llm: LLM, prompts: list[str], - sampling_params: SamplingParams, title: str): +def time_generation( + llm: LLM, prompts: list[str], sampling_params: SamplingParams, title: str +): # Generate texts from the prompts. The output is a list of RequestOutput # objects that contain the prompt, generated text, and other information. 
# Warmup first @@ -25,8 +26,7 @@ def time_generation(llm: LLM, prompts: list[str], end = time.time() print("-" * 50) print(title) - print("time: ", - (end - start) / sum(len(o.outputs[0].token_ids) for o in outputs)) + print("time: ", (end - start) / sum(len(o.outputs[0].token_ids) for o in outputs)) # Print the outputs. for output in outputs: generated_text = output.outputs[0].text @@ -38,7 +38,8 @@ def main(): template = ( "Below is an instruction that describes a task. Write a response " "that appropriately completes the request.\n\n### Instruction:\n{}" - "\n\n### Response:\n") + "\n\n### Response:\n" + ) # Sample prompts. prompts = [ diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py index de409740292a8..1fa2f16f82a8a 100644 --- a/examples/offline_inference/multilora_inference.py +++ b/examples/offline_inference/multilora_inference.py @@ -15,7 +15,7 @@ from vllm.lora.request import LoRARequest def create_test_prompts( - lora_path: str + lora_path: str, ) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]: """Create a list of test prompts with their sampling parameters. @@ -26,38 +26,49 @@ def create_test_prompts( first adapter have finished. 
""" return [ - ("A robot may not injure a human being", - SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, - max_tokens=128), None), - ("To be or not to be,", - SamplingParams(temperature=0.8, - top_k=5, - presence_penalty=0.2, - max_tokens=128), None), + ( + "A robot may not injure a human being", + SamplingParams( + temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128 + ), + None, + ), + ( + "To be or not to be,", + SamplingParams( + temperature=0.8, top_k=5, presence_penalty=0.2, max_tokens=128 + ), + None, + ), ( "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 - SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, - max_tokens=128, - stop_token_ids=[32003]), - LoRARequest("sql-lora", 1, lora_path)), + SamplingParams( + temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128, + stop_token_ids=[32003], + ), + LoRARequest("sql-lora", 1, lora_path), + ), ( "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 - SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, - max_tokens=128, - stop_token_ids=[32003]), - LoRARequest("sql-lora2", 2, lora_path)), + SamplingParams( + temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128, + stop_token_ids=[32003], + ), + LoRARequest("sql-lora2", 2, lora_path), + ), ] -def process_requests(engine: LLMEngine, - test_prompts: list[tuple[str, SamplingParams, - Optional[LoRARequest]]]): +def process_requests( + engine: LLMEngine, + test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]], +): """Continuously process a list of prompts and handle the 
outputs.""" request_id = 0 @@ -65,10 +76,9 @@ def process_requests(engine: LLMEngine, while test_prompts or engine.has_unfinished_requests(): if test_prompts: prompt, sampling_params, lora_request = test_prompts.pop(0) - engine.add_request(str(request_id), - prompt, - sampling_params, - lora_request=lora_request) + engine.add_request( + str(request_id), prompt, sampling_params, lora_request=lora_request + ) request_id += 1 request_outputs: list[RequestOutput] = engine.step() @@ -88,12 +98,14 @@ def initialize_engine() -> LLMEngine: # numbers will cause higher memory usage. If you know that all LoRAs will # use the same rank, it is recommended to set this as low as possible. # max_cpu_loras: controls the size of the CPU LoRA cache. - engine_args = EngineArgs(model="meta-llama/Llama-2-7b-hf", - enable_lora=True, - max_loras=1, - max_lora_rank=8, - max_cpu_loras=2, - max_num_seqs=256) + engine_args = EngineArgs( + model="meta-llama/Llama-2-7b-hf", + enable_lora=True, + max_loras=1, + max_lora_rank=8, + max_cpu_loras=2, + max_num_seqs=256, + ) return LLMEngine.from_engine_args(engine_args) @@ -105,5 +117,5 @@ def main(): process_requests(engine, test_prompts) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/offline_inference/neuron.py b/examples/offline_inference/neuron.py index 5906c7b2c6b30..f2d7698f22d7c 100644 --- a/examples/offline_inference/neuron.py +++ b/examples/offline_inference/neuron.py @@ -30,7 +30,8 @@ def main(): # The device argument can be either unspecified for automated detection, # or explicitly assigned. device="neuron", - tensor_parallel_size=2) + tensor_parallel_size=2, + ) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. 
outputs = llm.generate(prompts, sampling_params) diff --git a/examples/offline_inference/neuron_eagle.py b/examples/offline_inference/neuron_eagle.py index 4f63f1a2fb3c8..5d7fb819d3477 100644 --- a/examples/offline_inference/neuron_eagle.py +++ b/examples/offline_inference/neuron_eagle.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example shows how to run offline inference with an EAGLE speculative +This example shows how to run offline inference with an EAGLE speculative decoding model on neuron. To use EAGLE speculative decoding, you must use a draft model that is specifically fine-tuned for EAGLE speculation. Additionally, to use EAGLE with NxD Inference, the draft model must include @@ -15,40 +15,46 @@ prompts = [ "What is annapurna labs?", ] -# Create a sampling params object. -sampling_params = SamplingParams(top_k=1, max_tokens=500, ignore_eos=True) -# Create an LLM. -llm = LLM( - model="/home/ubuntu/model_hf/Meta-Llama-3.1-70B-Instruct", - speculative_config={ - "model": "/home/ubuntu/model_hf/Llama-3.1-70B-Instruct-EAGLE-Draft", - "num_speculative_tokens": 5, - "max_model_len": 2048 - }, - max_num_seqs=4, - # The max_model_len and block_size arguments are required to be same as - # max sequence length when targeting neuron device. - # Currently, this is a known limitation in continuous batching support - # in neuronx-distributed-inference. - max_model_len=2048, - block_size=2048, - # The device can be automatically detected when AWS Neuron SDK is installed. - # The device argument can be either unspecified for automated detection, - # or explicitly assigned. - device="neuron", - tensor_parallel_size=32, - override_neuron_config={ - "enable_eagle_speculation": True, - "enable_fused_speculation": True - }, -) +def main(): + # Create a sampling params object. + sampling_params = SamplingParams(top_k=1, max_tokens=500, ignore_eos=True) -# Generate texts from the prompts. 
The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, \n\n\n\ Generated text: {generated_text!r}") + # Create an LLM. + llm = LLM( + model="/home/ubuntu/model_hf/Meta-Llama-3.1-70B-Instruct", + speculative_config={ + "model": "/home/ubuntu/model_hf/Llama-3.1-70B-Instruct-EAGLE-Draft", + "num_speculative_tokens": 5, + "max_model_len": 2048, + }, + max_num_seqs=4, + # The max_model_len and block_size arguments are required to be same as + # max sequence length when targeting neuron device. + # Currently, this is a known limitation in continuous batching support + # in neuronx-distributed-inference. + max_model_len=2048, + block_size=2048, + # The device can be automatically detected when AWS Neuron SDK is installed. + # The device argument can be either unspecified for automated detection, + # or explicitly assigned. + device="neuron", + tensor_parallel_size=32, + override_neuron_config={ + "enable_eagle_speculation": True, + "enable_fused_speculation": True, + }, + ) + + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. 
+ for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, \n\n\n\ Generated text: {generated_text!r}") + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py index af21274a3a5b8..ec38525b9daf2 100644 --- a/examples/offline_inference/neuron_int8_quantization.py +++ b/examples/offline_inference/neuron_int8_quantization.py @@ -5,12 +5,12 @@ import os from vllm import LLM, SamplingParams # creates XLA hlo graphs for all the context length buckets. -os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048" +os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048" # creates XLA hlo graphs for all the token gen buckets. -os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048" +os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048" # Quantizes neuron model weight to int8 , # The default config for quantization is int8 dtype. -os.environ['NEURON_QUANT_DTYPE'] = "s8" +os.environ["NEURON_QUANT_DTYPE"] = "s8" # Sample prompts. prompts = [ @@ -44,7 +44,8 @@ def main(): override_neuron_config={ "cast_logits_dtype": "bfloat16", }, - tensor_parallel_size=2) + tensor_parallel_size=2, + ) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. 
outputs = llm.generate(prompts, sampling_params) diff --git a/examples/offline_inference/neuron_multimodal.py b/examples/offline_inference/neuron_multimodal.py new file mode 100644 index 0000000000000..a9478650b16f1 --- /dev/null +++ b/examples/offline_inference/neuron_multimodal.py @@ -0,0 +1,105 @@ +# SPDX-License-Identifier: Apache-2.0 +import requests +import torch +from neuronx_distributed_inference.models.mllama.utils import add_instruct +from PIL import Image + +from vllm import LLM, SamplingParams, TextPrompt + + +def get_image(image_url): + image = Image.open(requests.get(image_url, stream=True).raw) + return image + + +# Model Inputs +PROMPTS = [ + "What is in this image? Tell me a story", + "What is the recipe of mayonnaise in two sentences?", + "Describe this image", + "What is the capital of Italy famous for?", +] +IMAGES = [ + get_image( + "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500" + ), + None, + get_image( + "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500" + ), + None, +] +SAMPLING_PARAMS = [ + dict(top_k=1, temperature=1.0, top_p=1.0, max_tokens=16) + for _ in range(len(PROMPTS)) +] + + +def get_VLLM_mllama_model_inputs(prompt, single_image, sampling_params): + # Prepare all inputs for mllama generation, including: + # 1. put text prompt into instruct chat template + # 2. compose single text and single image prompt into Vllm's prompt class + # 3. 
prepare sampling parameters + input_image = single_image + has_image = torch.tensor([1]) + if isinstance(single_image, torch.Tensor) and single_image.numel() == 0: + has_image = torch.tensor([0]) + + instruct_prompt = add_instruct(prompt, has_image) + inputs = TextPrompt(prompt=instruct_prompt) + + if input_image is not None: + inputs["multi_modal_data"] = {"image": input_image} + + sampling_params = SamplingParams(**sampling_params) + return inputs, sampling_params + + +def print_outputs(outputs): + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + +if __name__ == "__main__": + assert ( + len(PROMPTS) == len(IMAGES) == len(SAMPLING_PARAMS) + ), f"""Text, image prompts and sampling parameters should have the + same batch size; but got {len(PROMPTS)}, {len(IMAGES)}, + and {len(SAMPLING_PARAMS)}""" + + # Create an LLM. + llm = LLM( + model="meta-llama/Llama-3.2-11B-Vision-Instruct", + max_num_seqs=1, + max_model_len=4096, + block_size=4096, + device="neuron", + tensor_parallel_size=32, + override_neuron_config={ + "sequence_parallel_enabled": False, + "skip_warmup": True, + "save_sharded_checkpoint": True, + "on_device_sampling_config": { + "global_topk": 1, + "dynamic": False, + "deterministic": False, + }, + }, + ) + + batched_inputs = [] + batched_sample_params = [] + for pmpt, img, params in zip(PROMPTS, IMAGES, SAMPLING_PARAMS): + inputs, sampling_params = get_VLLM_mllama_model_inputs(pmpt, img, params) + # test batch-size = 1 + outputs = llm.generate(inputs, sampling_params) + print_outputs(outputs) + batched_inputs.append(inputs) + batched_sample_params.append(sampling_params) + + # test batch-size = 4 + outputs = llm.generate(batched_inputs, batched_sample_params) + print_outputs(outputs) diff --git a/examples/offline_inference/neuron_speculation.py b/examples/offline_inference/neuron_speculation.py index 
bef434bae5bac..ecacbab771c2a 100644 --- a/examples/offline_inference/neuron_speculation.py +++ b/examples/offline_inference/neuron_speculation.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example shows how to run offline inference with a speculative +This example shows how to run offline inference with a speculative decoding model on neuron. """ @@ -19,9 +19,9 @@ prompts = [ def config_buckets(): """Configure context length and token gen buckets.""" # creates XLA hlo graphs for all the context length buckets. - os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048" + os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048" # creates XLA hlo graphs for all the token gen buckets. - os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048" + os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048" def initialize_model(): @@ -31,7 +31,7 @@ def initialize_model(): speculative_config={ "model": "openlm-research/open_llama_3b", "num_speculative_tokens": 4, - "max_model_len": 2048 + "max_model_len": 2048, }, max_num_seqs=4, max_model_len=2048, @@ -60,5 +60,5 @@ def main(): process_requests(model, sampling_params) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/offline_inference/openai_batch/README.md b/examples/offline_inference/openai_batch/README.md index 42a19f71e9de3..ce75297821221 100644 --- a/examples/offline_inference/openai_batch/README.md +++ b/examples/offline_inference/openai_batch/README.md @@ -48,7 +48,19 @@ The batch running tool is designed to be used from the command line. 
You can run the batch with the following command, which will write its results to a file called `results.jsonl` ```console -python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai_batch/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch \ + -i offline_inference/openai_batch/openai_example_batch.jsonl \ + -o results.jsonl \ + --model meta-llama/Meta-Llama-3-8B-Instruct +``` + +or use the `vllm` command-line tool: + +```console +vllm run-batch \ + -i offline_inference/openai_batch/openai_example_batch.jsonl \ + -o results.jsonl \ + --model meta-llama/Meta-Llama-3-8B-Instruct ``` ### Step 3: Check your results @@ -68,7 +80,19 @@ The batch runner supports remote input and output urls that are accessible via h For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl`, you can run ```console -python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch \ + -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \ + -o results.jsonl \ + --model meta-llama/Meta-Llama-3-8B-Instruct +``` + +or use the `vllm` command-line tool: + +```console +vllm run-batch \ + -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \ + -o results.jsonl \ + --model meta-llama/Meta-Llama-3-8B-Instruct ``` ## Example 3: Integrating with AWS S3 @@ -164,6 +188,15 @@ python -m vllm.entrypoints.openai.run_batch \ --model --model meta-llama/Meta-Llama-3-8B-Instruct ``` +or use the `vllm` command-line tool: + +```console +vllm run-batch \ + -i
"https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \ + -o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \ + --model meta-llama/Meta-Llama-3-8B-Instruct +``` + ### Step 4: View your results Your results are now on S3. You can view them in your terminal by running diff --git a/examples/offline_inference/prefix_caching.py b/examples/offline_inference/prefix_caching.py index f0bec387d3a9b..d3dad24956a69 100644 --- a/examples/offline_inference/prefix_caching.py +++ b/examples/offline_inference/prefix_caching.py @@ -16,7 +16,8 @@ prefix = ( "teaching role. They have 5 years of previous teaching experience " "as an assistant teacher at a co-ed, public school with experience " "in middle school math teaching. Based on these information, fulfill " - "the following paragraph: ") + "the following paragraph: " +) # Sample prompts. prompts = [ @@ -58,9 +59,11 @@ def main(): cleanup_dist_env_and_memory() # Create an LLM with prefix caching enabled. - prefix_cached_llm = LLM(model="facebook/opt-125m", - enable_prefix_caching=True, - gpu_memory_utilization=0.4) + prefix_cached_llm = LLM( + model="facebook/opt-125m", + enable_prefix_caching=True, + gpu_memory_utilization=0.4, + ) # Warmup so that the shared prompt's KV cache is computed.
prefix_cached_llm.generate(generating_prompts[0], sampling_params) @@ -81,10 +84,12 @@ def main(): print("-" * 50) # Compare the results and display the speedup - generated_same = all([ - regular_generated_texts[i] == cached_generated_texts[i] - for i in range(len(prompts)) - ]) + generated_same = all( + [ + regular_generated_texts[i] == cached_generated_texts[i] + for i in range(len(prompts)) + ] + ) print(f"Generated answers are the same: {generated_same}") diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py index f97a1f32e6210..21f7668adc863 100644 --- a/examples/offline_inference/prithvi_geospatial_mae.py +++ b/examples/offline_inference/prithvi_geospatial_mae.py @@ -16,16 +16,17 @@ The requirements for running this script are: Run the example: python prithvi_geospatial_mae.py -""" # noqa: E501 +""" # noqa: E501 + import argparse import datetime import os -import re from typing import Union import albumentations import numpy as np import rasterio +import regex as re import torch from einops import rearrange from terratorch.datamodules import Sen1Floods11NonGeoDataModule @@ -110,77 +111,67 @@ model_config = """{ # Temporarily creating the "config.json" for the model. 
# This is going to disappear once the correct config.json is available on HF -with open(os.path.join(os.path.dirname(__file__), "./model/config.json"), - 'w') as config_file: +with open( + os.path.join(os.path.dirname(__file__), "./model/config.json"), "w" +) as config_file: config_file.write(model_config) datamodule_config = { - 'bands': ['BLUE', 'GREEN', 'RED', 'NIR_NARROW', 'SWIR_1', 'SWIR_2'], - 'batch_size': - 16, - 'constant_scale': - 0.0001, - 'data_root': - '/dccstor/geofm-finetuning/datasets/sen1floods11', - 'drop_last': - True, - 'no_data_replace': - 0.0, - 'no_label_replace': - -1, - 'num_workers': - 8, - 'test_transform': [ - albumentations.Resize(always_apply=False, - height=448, - interpolation=1, - p=1, - width=448), - albumentations.pytorch.ToTensorV2(transpose_mask=False, - always_apply=True, - p=1.0) + "bands": ["BLUE", "GREEN", "RED", "NIR_NARROW", "SWIR_1", "SWIR_2"], + "batch_size": 16, + "constant_scale": 0.0001, + "data_root": "/dccstor/geofm-finetuning/datasets/sen1floods11", + "drop_last": True, + "no_data_replace": 0.0, + "no_label_replace": -1, + "num_workers": 8, + "test_transform": [ + albumentations.Resize( + always_apply=False, height=448, interpolation=1, p=1, width=448 + ), + albumentations.pytorch.ToTensorV2( + transpose_mask=False, always_apply=True, p=1.0 + ), ], } class PrithviMAE: - def __init__(self): print("Initializing PrithviMAE model") - self.model = LLM(model=os.path.join(os.path.dirname(__file__), - "./model"), - skip_tokenizer_init=True, - dtype="float32") + self.model = LLM( + model=os.path.join(os.path.dirname(__file__), "./model"), + skip_tokenizer_init=True, + dtype="float32", + ) def run(self, input_data, location_coords): print("################ Running inference on vLLM ##############") # merge the inputs into one data structure mm_data = { - "pixel_values": - torch.empty(0) if input_data is None else input_data, - "location_coords": - torch.empty(0) if location_coords is None else location_coords + 
"pixel_values": torch.empty(0) if input_data is None else input_data, + "location_coords": torch.empty(0) + if location_coords is None + else location_coords, } prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data} outputs = self.model.encode(prompt, use_tqdm=False) - print( - "################ Inference done (it took seconds) ##############" - ) + print("################ Inference done (it took seconds) ##############") return outputs[0].outputs.data def generate_datamodule(): datamodule = Sen1Floods11NonGeoDataModule( - data_root=datamodule_config['data_root'], + data_root=datamodule_config["data_root"], batch_size=datamodule_config["batch_size"], num_workers=datamodule_config["num_workers"], bands=datamodule_config["bands"], drop_last=datamodule_config["drop_last"], - test_transform=datamodule_config["test_transform" - ""]) + test_transform=datamodule_config["test_transform"], + ) return datamodule @@ -204,8 +195,7 @@ def process_channel_group(orig_img, channels): max_value = max(3000, np.percentile(orig_img[valid_mask], PERCENTILE)) min_value = OFFSET - orig_img = torch.clamp((orig_img - min_value) / (max_value - min_value), 0, - 1) + orig_img = torch.clamp((orig_img - min_value) / (max_value - min_value), 0, 1) # No data as zeros orig_img[~valid_mask] = 0 @@ -300,18 +290,21 @@ def load_example( location_coords.append(coords) try: - match = re.search(r'(\d{7,8}T\d{6})', file) + match = re.search(r"(\d{7,8}T\d{6})", file) if match: year = int(match.group(1)[:4]) - julian_day = match.group(1).split('T')[0][4:] + julian_day = match.group(1).split("T")[0][4:] if len(julian_day) == 3: julian_day = int(julian_day) else: - julian_day = datetime.datetime.strptime( - julian_day, '%m%d').timetuple().tm_yday + julian_day = ( + datetime.datetime.strptime(julian_day, "%m%d") + .timetuple() + .tm_yday + ) temporal_coords.append([year, julian_day]) except Exception as e: - print(f'Could not extract timestamp for {file} ({e})') + print(f"Could not extract timestamp 
for {file} ({e})") imgs = np.stack(imgs, axis=0) # num_frames, H, W, C imgs = np.moveaxis(imgs, -1, 0).astype("float32") @@ -320,50 +313,44 @@ def load_example( return imgs, temporal_coords, location_coords, metas -def run_model(input_data, - temporal_coords, - location_coords, - model, - datamodule, - img_size, - lightning_model=None): +def run_model( + input_data, + temporal_coords, + location_coords, + model, + datamodule, + img_size, + lightning_model=None, +): # Reflect pad if not divisible by img_size original_h, original_w = input_data.shape[-2:] pad_h = (img_size - (original_h % img_size)) % img_size pad_w = (img_size - (original_w % img_size)) % img_size - input_data = np.pad(input_data, - ((0, 0), (0, 0), (0, 0), (0, pad_h), (0, pad_w)), - mode="reflect") + input_data = np.pad( + input_data, ((0, 0), (0, 0), (0, 0), (0, pad_h), (0, pad_w)), mode="reflect" + ) # Build sliding window batch_size = 1 batch = torch.tensor(input_data, device="cpu") - windows = (batch.unfold(3, img_size, - img_size).unfold(4, img_size, img_size)) + windows = batch.unfold(3, img_size, img_size).unfold(4, img_size, img_size) h1, w1 = windows.shape[3:5] - windows = rearrange(windows, - "b c t h1 w1 h w -> (b h1 w1) c t h w", - h=img_size, - w=img_size) + windows = rearrange( + windows, "b c t h1 w1 h w -> (b h1 w1) c t h w", h=img_size, w=img_size + ) # Split into batches if number of windows > batch_size - num_batches = windows.shape[0] // batch_size if windows.shape[ - 0] > batch_size else 1 + num_batches = windows.shape[0] // batch_size if windows.shape[0] > batch_size else 1 windows = torch.tensor_split(windows, num_batches, dim=0) - if torch.cuda.is_available(): - device = torch.device('cuda') - else: - device = torch.device('cpu') + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") if temporal_coords: - temporal_coords = torch.tensor(temporal_coords, - device=device).unsqueeze(0) + temporal_coords = torch.tensor(temporal_coords, 
device=device).unsqueeze(0) else: temporal_coords = None if location_coords: - location_coords = torch.tensor(location_coords[0], - device=device).unsqueeze(0) + location_coords = torch.tensor(location_coords[0], device=device).unsqueeze(0) else: location_coords = None @@ -371,26 +358,24 @@ def run_model(input_data, pred_imgs = [] for x in windows: # Apply standardization - x = datamodule.test_transform( - image=x.squeeze().numpy().transpose(1, 2, 0)) - x = datamodule.aug(x)['image'] + x = datamodule.test_transform(image=x.squeeze().numpy().transpose(1, 2, 0)) + x = datamodule.aug(x)["image"] with torch.no_grad(): x = x.to(device) pred = model.run(x, location_coords=location_coords) if lightning_model: pred_lightning = lightning_model( - x, - temporal_coords=temporal_coords, - location_coords=location_coords) + x, temporal_coords=temporal_coords, location_coords=location_coords + ) pred_lightning = pred_lightning.output.detach().cpu() if not torch.equal(pred, pred_lightning): print("Inference output is not equal") y_hat = pred.argmax(dim=1) - y_hat = torch.nn.functional.interpolate(y_hat.unsqueeze(1).float(), - size=img_size, - mode="nearest") + y_hat = torch.nn.functional.interpolate( + y_hat.unsqueeze(1).float(), size=img_size, mode="nearest" + ) pred_imgs.append(y_hat) @@ -437,8 +422,7 @@ def parse_args(): default=[1, 2, 3, 8, 11, 12], type=int, nargs="+", - help= - "0-based indices of the six Prithvi channels to be selected from the " + help="0-based indices of the six Prithvi channels to be selected from the " "input. 
By default selects [1,2,3,8,11,12] for S2L1C data.", ) parser.add_argument( @@ -478,17 +462,18 @@ def main( # Running model ------------------------------------------------------------ channels = [ - datamodule_config['bands'].index(b) for b in ["RED", "GREEN", "BLUE"] + datamodule_config["bands"].index(b) for b in ["RED", "GREEN", "BLUE"] ] # BGR -> RGB - pred = run_model(input_data, temporal_coords, location_coords, model_obj, - datamodule, img_size) + pred = run_model( + input_data, temporal_coords, location_coords, model_obj, datamodule, img_size + ) # Save pred meta_data.update(count=1, dtype="uint8", compress="lzw", nodata=0) pred_file = os.path.join( - output_dir, - f"pred_{os.path.splitext(os.path.basename(data_file))[0]}.tiff") + output_dir, f"pred_{os.path.splitext(os.path.basename(data_file))[0]}.tiff" + ) save_geotiff(_convert_np_uint8(pred), pred_file, meta_data) # Save image + pred @@ -502,13 +487,13 @@ def main( channels=channels, ) - pred[pred == 0.] = np.nan + pred[pred == 0.0] = np.nan img_pred = rgb_orig * 0.7 + pred * 0.3 img_pred[img_pred.isnan()] = rgb_orig[img_pred.isnan()] img_pred_file = os.path.join( - output_dir, - f"rgb_pred_{os.path.splitext(os.path.basename(data_file))[0]}.tiff") + output_dir, f"rgb_pred_{os.path.splitext(os.path.basename(data_file))[0]}.tiff" + ) save_geotiff( image=_convert_np_uint8(img_pred), output_path=img_pred_file, @@ -518,8 +503,9 @@ def main( # Save image rgb if rgb_outputs: rgb_file = os.path.join( - output_dir, "original_rgb_" - f"{os.path.splitext(os.path.basename(data_file))[0]}.tiff") + output_dir, + f"original_rgb_{os.path.splitext(os.path.basename(data_file))[0]}.tiff", + ) save_geotiff( image=_convert_np_uint8(rgb_orig), output_path=rgb_file, @@ -528,7 +514,6 @@ def main( if __name__ == "__main__": - args = parse_args() main(**vars(args)) diff --git a/examples/offline_inference/profiling.py b/examples/offline_inference/profiling.py index 3cf0c340d6705..244a64b891c96 100644 --- 
a/examples/offline_inference/profiling.py +++ b/examples/offline_inference/profiling.py @@ -44,14 +44,17 @@ def get_dtype(dtype: str): OutputLen_NumReqs_Map: TypeAlias = dict[int, int] -def compute_request_output_lengths(batch_size: int, step_requests: list[int]) \ - -> OutputLen_NumReqs_Map: + + +def compute_request_output_lengths( + batch_size: int, step_requests: list[int] +) -> OutputLen_NumReqs_Map: """ Given the number of requests, batch_size, and the number of requests that each engine-step should process, step_requests, determine the output lengths of the requests such that step_request is honoured. - Example: + Example: if batch size = 128 and step_request = [128, 128, 96, 64, 32, 1] then return, {2 : 32, 3 : 32, 4 : 32, 5 : 31, 6 : 1}, meaning, @@ -100,17 +103,19 @@ def compute_request_output_lengths(batch_size: int, step_requests: list[int]) \ output_length -= 1 # sanity checks. - assert sum(ol_nr.values()) == batch_size, \ - ("Number of requests in output-length assignment does not match " - f"batch-size.\n batch size {batch_size} - " - f"step requests {step_requests} - assignments {ol_nr}") + assert sum(ol_nr.values()) == batch_size, ( + "Number of requests in output-length assignment does not match " + f"batch-size.\n batch size {batch_size} - " + f"step requests {step_requests} - assignments {ol_nr}" + ) # Check that the output-length is in [1, num-steps]. Output length must be # at least 1 as all requests must participate in the prefill-step. 
- assert all(ol >= 1 and ol <= num_steps for ol in ol_nr), \ - ("Output lengths of requests should be in range " - f"[1, num-engine-steps].\n batch size {batch_size} - " - f"step requests {step_requests} - assignments {ol_nr}") + assert all(ol >= 1 and ol <= num_steps for ol in ol_nr), ( + "Output lengths of requests should be in range " + f"[1, num-engine-steps].\n batch size {batch_size} - " + f"step requests {step_requests} - assignments {ol_nr}" + ) return ol_nr @@ -131,7 +136,7 @@ def determine_requests_per_step(context: ProfileContext) -> list[int]: context: ProfileContext object. Returns: - list[int]: Number of requests to process for all engine-steps. + list[int]: Number of requests to process for all engine-steps. output[i], contains the number of requests that the ith step should process. """ @@ -140,10 +145,13 @@ def determine_requests_per_step(context: ProfileContext) -> list[int]: # that their output lengths must be equal to num_engine_steps. return [context.batch_size] * context.num_steps - assert context.complete_num_requests_per_step and \ - context.complete_num_requests_per_step > 0, \ - (f"Expected a positive complete_num_requests_per_step argument." - f"Instead got {context.complete_num_requests_per_step}") + assert ( + context.complete_num_requests_per_step + and context.complete_num_requests_per_step > 0 + ), ( + f"Expected a positive complete_num_requests_per_step argument." + f"Instead got {context.complete_num_requests_per_step}" + ) # We start dropping after the first decode step. 
step_requests = [ @@ -165,8 +173,9 @@ def determine_requests_per_step(context: ProfileContext) -> list[int]: return step_requests -def run_profile(context: ProfileContext, csv_output: Optional[str], - json_output: Optional[str]): +def run_profile( + context: ProfileContext, csv_output: Optional[str], json_output: Optional[str] +): print("Run profile with:") for key, value in asdict(context).items(): print(f" {key} = {value}") @@ -174,7 +183,8 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], requests_per_step: list[int] = determine_requests_per_step(context) ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths( - context.batch_size, requests_per_step) + context.batch_size, requests_per_step + ) num_steps_to_profile: int = len(requests_per_step) max_output_len: int = max(ol_nr.keys()) @@ -186,7 +196,8 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], top_p=0.95, # max_tokens is set on a per-request basis. max_tokens=None, - ignore_eos=True) + ignore_eos=True, + ) # Create LLM llm = LLM(**asdict(context.engine_args)) @@ -199,31 +210,37 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], max_num_seqs = scheduler_config.max_num_seqs if batch_size * prompt_len > max_num_batched_tokens: - print(f"ERROR: chosen batch_size * prompt_len " - f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is " - f"larger than max_num_batched_tokens ({max_num_batched_tokens}) " - f"and therefore cannot be run in a single profile step, please " - f"choose a smaller batch size or prompt length, or increase " - f"--max-num-batched-tokens") + print( + f"ERROR: chosen batch_size * prompt_len " + f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is " + f"larger than max_num_batched_tokens ({max_num_batched_tokens}) " + f"and therefore cannot be run in a single profile step, please " + f"choose a smaller batch size or prompt length, or increase " + f"--max-num-batched-tokens" + ) sys.exit(-1) if batch_size > 
max_num_seqs: print( f"ERROR: chosen batch_size ({batch_size}) is larger than " f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a " - f"single profile step, please choose a smaller batch size") + f"single profile step, please choose a smaller batch size" + ) sys.exit(-1) - print("llm.llm_engine.model_config.max_model_len: ", - llm.llm_engine.model_config.max_model_len) + print( + "llm.llm_engine.model_config.max_model_len: ", + llm.llm_engine.model_config.max_model_len, + ) if prompt_len + max_output_len > llm.llm_engine.model_config.max_model_len: - print(f"ERROR: chosen prompt_len + max_output_len ({prompt_len} + " - f"{max_output_len} = {prompt_len + max_output_len}) is larger " - f"than the model's max_model_len ({max_model_len}), please " - f"choose a smaller prompt_len or max_output_len, or increase " - f"--max-model-len") + print( + f"ERROR: chosen prompt_len + max_output_len ({prompt_len} + " + f"{max_output_len} = {prompt_len + max_output_len}) is larger " + f"than the model's max_model_len ({max_model_len}), please " + f"choose a smaller prompt_len or max_output_len, or increase " + f"--max-model-len" + ) sys.exit(-1) def add_requests(): - def get_output_len_generator() -> Generator[int, Any, Any]: for output_len, num_reqs in ol_nr.items(): for _ in range(num_reqs): @@ -234,13 +251,15 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], sampling_params.max_tokens = next(output_len_generator) assert isinstance(sampling_params.max_tokens, int) - prompt_token_ids = torch.randint(llm.get_tokenizer().vocab_size, - size=(prompt_len, )).tolist() + prompt_token_ids = torch.randint( + llm.get_tokenizer().vocab_size, size=(prompt_len,) + ).tolist() llm.llm_engine.add_request( request_id=f"seq{i}", - prompt={'prompt_token_ids': prompt_token_ids}, - params=sampling_params) + prompt={"prompt_token_ids": prompt_token_ids}, + params=sampling_params, + ) def abort_requests(): for i in range(batch_size): @@ -261,10 +280,8 @@ def 
run_profile(context: ProfileContext, csv_output: Optional[str], decode_profs = [] for _ in tqdm.tqdm(range(num_steps_to_profile - 1)): - num_running_seqs = llm.llm_engine.scheduler[ - 0].get_num_unfinished_seq_groups() - with layerwise_profile( - num_running_seqs=num_running_seqs) as decode_prof: + num_running_seqs = llm.llm_engine.scheduler[0].get_num_unfinished_seq_groups() + with layerwise_profile(num_running_seqs=num_running_seqs) as decode_prof: llm.llm_engine.step() decode_profs.append(decode_prof) @@ -274,8 +291,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], LINE_WIDTH = 80 print("=" * LINE_WIDTH) - print(f"= Prefill Model Table " - f"(prompt_len={prompt_len}, batch_size={batch_size})") + print(f"= Prefill Model Table (prompt_len={prompt_len}, batch_size={batch_size})") print("=" * LINE_WIDTH) print() prefill_results.print_model_table() @@ -283,16 +299,17 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], if has_decode: print() print("=" * LINE_WIDTH) - print(f"= First Decode Step Model Table " - f"(prompt_len={prompt_len}, batch_size={batch_size})") + print( + f"= First Decode Step Model Table " + f"(prompt_len={prompt_len}, batch_size={batch_size})" + ) print("=" * LINE_WIDTH) print() decode_results_list[0].print_model_table() print() print("=" * LINE_WIDTH) - print(f"= Prefill Summary Table " - f"(prompt_len={prompt_len}, batch_size={batch_size})") + print(f"= Prefill Summary Table (prompt_len={prompt_len}, batch_size={batch_size})") print("=" * LINE_WIDTH) print() prefill_results.print_summary_table() @@ -300,25 +317,32 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], if has_decode: print() print("=" * LINE_WIDTH) - print(f"= First Decode Step Summary Table " - f"(prompt_len={prompt_len}, batch_size={batch_size})") + print( + f"= First Decode Step Summary Table " + f"(prompt_len={prompt_len}, batch_size={batch_size})" + ) print("=" * LINE_WIDTH) print() 
decode_results_list[0].print_summary_table() if csv_output: - csv_filename_base = csv_output[:-4] \ - if csv_output.endswith('.csv') else csv_output + csv_filename_base = ( + csv_output[:-4] if csv_output.endswith(".csv") else csv_output + ) prefill_results.export_model_stats_table_csv( - csv_filename_base + "_prefill_model_table.csv") + csv_filename_base + "_prefill_model_table.csv" + ) prefill_results.export_summary_stats_table_csv( - csv_filename_base + "_prefill_summary_table.csv") + csv_filename_base + "_prefill_summary_table.csv" + ) if has_decode: - decode_results_list[0].export_model_stats_table_csv(\ - csv_filename_base + "_decode_model_table.csv") + decode_results_list[0].export_model_stats_table_csv( + csv_filename_base + "_decode_model_table.csv" + ) decode_results_list[0].export_summary_stats_table_csv( - csv_filename_base + "_decode_summary_table.csv") + csv_filename_base + "_decode_summary_table.csv" + ) if json_output: cuda_devices = [ @@ -332,7 +356,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], "torch_version": f"{torch.__version__}", "torch_cuda_version": f"{torch.version.cuda}", "cuda_devices": f"{cuda_devices}", - **asdict(context) + **asdict(context), }, "prefill": prefill_results.convert_stats_to_dict(), } @@ -342,8 +366,9 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict() # Add .json to json_output filename if it doesn't exist already. 
- json_output_file = json_output if json_output.endswith( - '.json') else json_output + '.json' + json_output_file = ( + json_output if json_output.endswith(".json") else json_output + ".json" + ) with open(json_output_file, "w+") as f: json.dump(json_dict, f, indent=2) pass @@ -351,16 +376,21 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], if context.save_chrome_traces_folder is not None: os.makedirs(context.save_chrome_traces_folder, exist_ok=True) prefill_prof.profiler.export_chrome_trace( - context.save_chrome_traces_folder + "/prefill.json") + context.save_chrome_traces_folder + "/prefill.json" + ) for idx, decode_prof in enumerate(decode_profs): decode_prof.profiler.export_chrome_trace( - context.save_chrome_traces_folder + f"/decode_{idx + 1}.json") - print("Traces saved as prefill.json and decode_1.json, etc." - f" in folder {context.save_chrome_traces_folder}") + context.save_chrome_traces_folder + f"/decode_{idx + 1}.json" + ) + print( + "Traces saved as prefill.json and decode_1.json, etc." + f" in folder {context.save_chrome_traces_folder}" + ) def parse_args(): - parser = FlexibleArgumentParser(description=""" + parser = FlexibleArgumentParser( + description=""" Profile a model example: @@ -384,7 +414,8 @@ Profile a model --output-directory profile_breakdown --plot-metric pct_cuda_time ``` """, - formatter_class=RawTextHelpFormatter) + formatter_class=RawTextHelpFormatter, + ) parser.add_argument( "--csv", type=str, @@ -393,59 +424,68 @@ Profile a model "filename, will create <filename>_prefill_model_table.csv, " "<filename>_prefill_summary_table.csv, " "<filename>_decode_model_table.csv, and " - "<filename>_decode_summary_table.csv") + "<filename>_decode_summary_table.csv", + ) parser.add_argument( "--json", type=str, default=None, - help="Export the results as a json file. 
This should be the filename") - parser.add_argument("--save-chrome-traces-folder", - type=str, - help="Save chrome traces for the prefill and decode " - "will save traces as prefill.json and decode_1.json, " - "etc. inside this folder") + help="Export the results as a json file. This should be the filename", + ) + parser.add_argument( + "--save-chrome-traces-folder", + type=str, + help="Save chrome traces for the prefill and decode " + "will save traces as prefill.json and decode_1.json, " + "etc. inside this folder", + ) parser.add_argument( "--prompt-len", type=int, default=PROMPT_LEN_DEFAULT, help=f"Length of the random prompt to use when profiling, all batched " - f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}") - parser.add_argument("--batch-size", - type=int, - default=BATCH_SIZE_DEFAULT, - help=f"Number of requests to run as a single batch, " - f"default={BATCH_SIZE_DEFAULT}") + f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}", + ) + parser.add_argument( + "--batch-size", + type=int, + default=BATCH_SIZE_DEFAULT, + help=f"Number of requests to run as a single batch, " + f"default={BATCH_SIZE_DEFAULT}", + ) subparsers = parser.add_subparsers(dest="cmd") run_num_steps_parser = subparsers.add_parser( - "run_num_steps", - help="This variation profiles n engine.step() invocations.") + "run_num_steps", help="This variation profiles n engine.step() invocations." 
+ ) run_num_steps_parser.add_argument( - '-n', - '--num-steps', + "-n", + "--num-steps", type=int, help="Number of engine steps to profile.\n" "Setting it to 1, profiles only the prefill step.\n" "Setting it to 2, profiles the prefill and first decode step\n" "Setting it to 3, profiles the prefill, 1st and 2nd decode steps\n" - "and so on ...") + "and so on ...", + ) run_to_completion_parser = subparsers.add_parser( "run_to_completion", help="This variation profiles all the engine.step() invocations" - "until the engine exhausts all submitted requests.") + "until the engine exhausts all submitted requests.", + ) run_to_completion_parser.add_argument( - '-n', - '--complete-num-requests-per-step', + "-n", + "--complete-num-requests-per-step", type=int, - help= - "Complete complete_num_requests_per_step requests every decode step." + help="Complete complete_num_requests_per_step requests every decode step." "For e.g., with batch_size 128 and complete_num_requests_per_step 32," "the profiler is run for 6 engine steps, with the steps processing, " "128, 128, 96, 64, 32, 1 requests respectively.\n" "Note that we tack-on a one-request step at the end as it is often " - "useful.") + "useful.", + ) EngineArgs.add_cli_args(parser) @@ -459,7 +499,8 @@ def main(args): k: v for k, v in vars(args).items() if k in inspect.signature(ProfileContext).parameters - }) + }, + ) run_profile(context, csv_output=args.csv, json_output=args.json) diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py index 61da4705e18e8..82737d538df4f 100644 --- a/examples/offline_inference/profiling_tpu/profiling.py +++ b/examples/offline_inference/profiling_tpu/profiling.py @@ -31,18 +31,16 @@ def main(args: argparse.Namespace): max_tokens=args.output_len, ) print(sampling_params) - dummy_prompt_token_ids = np.random.randint(10000, - size=(args.batch_size, - args.input_len)) - dummy_prompts: list[PromptType] = [{ - "prompt_token_ids": 
batch - } for batch in dummy_prompt_token_ids.tolist()] + dummy_prompt_token_ids = np.random.randint( + 10000, size=(args.batch_size, args.input_len) + ) + dummy_prompts: list[PromptType] = [ + {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist() + ] def run_to_completion(): start_time = time.perf_counter() - llm.generate(dummy_prompts, - sampling_params=sampling_params, - use_tqdm=False) + llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) end_time = time.perf_counter() latency = end_time - start_time return latency @@ -58,10 +56,9 @@ def main(args: argparse.Namespace): profile_dir = args.profile_result_dir print(f"Profiling (results will be saved to '{profile_dir}')...") # Enable tracing on server - xp.trace_detached("localhost:9012", - profile_dir, - delay_ms=DELAY_MS, - duration_ms=DURATION_MS) + xp.trace_detached( + "localhost:9012", profile_dir, delay_ms=DELAY_MS, duration_ms=DURATION_MS + ) if DELAY_MS == 0: time.sleep(1.0) profile_latencies = [] @@ -72,30 +69,36 @@ def main(args: argparse.Namespace): return -if __name__ == '__main__': +if __name__ == "__main__": parser = FlexibleArgumentParser( - description='Benchmark the latency of processing a single batch of ' - 'requests till completion.') - parser.add_argument('--input-len', type=int, default=32) - parser.add_argument('--output-len', type=int, default=128) - parser.add_argument('--batch-size', type=int, default=8) - parser.add_argument('--num-iters-warmup', - type=int, - default=5, - help='Number of iterations to run for warmup.') - parser.add_argument('--num-iters', - type=int, - default=1, - help='Number of iterations to run for profiling.') + description="Benchmark the latency of processing a single batch of " + "requests till completion." 
+ ) + parser.add_argument("--input-len", type=int, default=32) + parser.add_argument("--output-len", type=int, default=128) + parser.add_argument("--batch-size", type=int, default=8) parser.add_argument( - '--profile-result-dir', + "--num-iters-warmup", + type=int, + default=5, + help="Number of iterations to run for warmup.", + ) + parser.add_argument( + "--num-iters", + type=int, + default=1, + help="Number of iterations to run for profiling.", + ) + parser.add_argument( + "--profile-result-dir", type=str, default="profiles", - help= - ('path to save the pytorch profiler output. Can be visualized ' - 'with ui.perfetto.dev or Tensorboard ' - '(https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm).' - )) + help=( + "path to save the pytorch profiler output. Can be visualized " + "with ui.perfetto.dev or Tensorboard " + "(https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm)." + ), + ) parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() diff --git a/examples/offline_inference/prompt_embed_inference.py b/examples/offline_inference/prompt_embed_inference.py index 99c5a682fb277..9f6a602233f8a 100644 --- a/examples/offline_inference/prompt_embed_inference.py +++ b/examples/offline_inference/prompt_embed_inference.py @@ -18,8 +18,7 @@ Run: """ import torch -from transformers import (AutoModelForCausalLM, AutoTokenizer, - PreTrainedTokenizer) +from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer from vllm import LLM @@ -32,27 +31,29 @@ def init_tokenizer_and_llm(model_name: str): return tokenizer, embedding_layer, llm -def get_prompt_embeds(chat: list[dict[str, - str]], tokenizer: PreTrainedTokenizer, - embedding_layer: torch.nn.Module): - token_ids = tokenizer.apply_chat_template(chat, - add_generation_prompt=True, - return_tensors='pt') +def get_prompt_embeds( + chat: list[dict[str, str]], + tokenizer: PreTrainedTokenizer, + embedding_layer: torch.nn.Module, +): + token_ids = 
tokenizer.apply_chat_template( + chat, add_generation_prompt=True, return_tensors="pt" + ) prompt_embeds = embedding_layer(token_ids).squeeze(0) return prompt_embeds -def single_prompt_inference(llm: LLM, tokenizer: PreTrainedTokenizer, - embedding_layer: torch.nn.Module): - chat = [{ - "role": "user", - "content": "Please tell me about the capital of France." - }] +def single_prompt_inference( + llm: LLM, tokenizer: PreTrainedTokenizer, embedding_layer: torch.nn.Module +): + chat = [{"role": "user", "content": "Please tell me about the capital of France."}] prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer) - outputs = llm.generate({ - "prompt_embeds": prompt_embeds, - }) + outputs = llm.generate( + { + "prompt_embeds": prompt_embeds, + } + ) print("\n[Single Inference Output]") print("-" * 30) @@ -61,34 +62,26 @@ def single_prompt_inference(llm: LLM, tokenizer: PreTrainedTokenizer, print("-" * 30) -def batch_prompt_inference(llm: LLM, tokenizer: PreTrainedTokenizer, - embedding_layer: torch.nn.Module): - chats = [[{ - "role": "user", - "content": "Please tell me about the capital of France." - }], - [{ - "role": "user", - "content": "When is the day longest during the year?" - }], - [{ - "role": "user", - "content": "Where is bigger, the moon or the sun?" 
- }]] +def batch_prompt_inference( + llm: LLM, tokenizer: PreTrainedTokenizer, embedding_layer: torch.nn.Module +): + chats = [ + [{"role": "user", "content": "Please tell me about the capital of France."}], + [{"role": "user", "content": "When is the day longest during the year?"}], + [{"role": "user", "content": "Where is bigger, the moon or the sun?"}], + ] prompt_embeds_list = [ get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats ] - outputs = llm.generate([{ - "prompt_embeds": embeds - } for embeds in prompt_embeds_list]) + outputs = llm.generate([{"prompt_embeds": embeds} for embeds in prompt_embeds_list]) print("\n[Batch Inference Outputs]") print("-" * 30) for i, o in enumerate(outputs): - print(f"Q{i+1}: {chats[i][0]['content']}") - print(f"A{i+1}: {o.outputs[0].text}\n") + print(f"Q{i + 1}: {chats[i][0]['content']}") + print(f"A{i + 1}: {o.outputs[0].text}\n") print("-" * 30) diff --git a/examples/offline_inference/qwen2_5_omni/README.md b/examples/offline_inference/qwen2_5_omni/README.md index c30541a598cee..16d44cbadbc98 100644 --- a/examples/offline_inference/qwen2_5_omni/README.md +++ b/examples/offline_inference/qwen2_5_omni/README.md @@ -6,14 +6,19 @@ This folder provides several example scripts on how to inference Qwen2.5-Omni of ```bash # Audio + image + video -python examples/offline_inference/qwen2_5_omni/only_thinker.py -q mixed_modalities +python examples/offline_inference/qwen2_5_omni/only_thinker.py \ + -q mixed_modalities # Read vision and audio inputs from a single video file # NOTE: V1 engine does not support interleaved modalities yet. 
-VLLM_USE_V1=0 python examples/offline_inference/qwen2_5_omni/only_thinker.py -q use_audio_in_video +VLLM_USE_V1=0 \ +python examples/offline_inference/qwen2_5_omni/only_thinker.py \ + -q use_audio_in_video # Multiple audios -VLLM_USE_V1=0 python examples/offline_inference/qwen2_5_omni/only_thinker.py -q multi_audios +VLLM_USE_V1=0 \ +python examples/offline_inference/qwen2_5_omni/only_thinker.py \ + -q multi_audios ``` This script will run the thinker part of Qwen2.5-Omni, and generate text response. @@ -22,11 +27,16 @@ You can also test Qwen2.5-Omni on a single modality: ```bash # Process audio inputs -python examples/offline_inference/audio_language.py --model-type qwen2_5_omni +python examples/offline_inference/audio_language.py \ + --model-type qwen2_5_omni # Process image inputs -python examples/offline_inference/vision_language.py --modality image --model-type qwen2_5_omni +python examples/offline_inference/vision_language.py \ + --modality image \ + --model-type qwen2_5_omni # Process video inputs -python examples/offline_inference/vision_language.py --modality video --model-type qwen2_5_omni +python examples/offline_inference/vision_language.py \ + --modality video \ + --model-type qwen2_5_omni ``` diff --git a/examples/offline_inference/qwen2_5_omni/only_thinker.py b/examples/offline_inference/qwen2_5_omni/only_thinker.py index 52b6e977eaa2a..6482490d1a93a 100644 --- a/examples/offline_inference/qwen2_5_omni/only_thinker.py +++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example shows how to use vLLM for running offline inference +This example shows how to use vLLM for running offline inference with the correct prompt format on Qwen2.5-Omni (thinker only). 
""" @@ -11,6 +11,7 @@ from vllm import LLM, SamplingParams from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset +from vllm.multimodal.image import convert_image_mode from vllm.utils import FlexibleArgumentParser @@ -26,50 +27,55 @@ class QueryResult(NamedTuple): default_system = ( "You are Qwen, a virtual human developed by the Qwen Team, Alibaba " "Group, capable of perceiving auditory and visual inputs, as well as " - "generating text and speech.") + "generating text and speech." +) def get_mixed_modalities_query() -> QueryResult: - question = ("What is recited in the audio? " - "What is the content of this image? Why is this video funny?") - prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n" - "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>" - "<|vision_bos|><|IMAGE|><|vision_eos|>" - "<|vision_bos|><|VIDEO|><|vision_eos|>" - f"{question}<|im_end|>\n" - f"<|im_start|>assistant\n") + question = ( + "What is recited in the audio? " + "What is the content of this image? Why is this video funny?" 
+ ) + prompt = ( + f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>" + "<|vision_bos|><|IMAGE|><|vision_eos|>" + "<|vision_bos|><|VIDEO|><|vision_eos|>" + f"{question}<|im_end|>\n" + f"<|im_start|>assistant\n" + ) return QueryResult( inputs={ "prompt": prompt, "multi_modal_data": { - "audio": - AudioAsset("mary_had_lamb").audio_and_sample_rate, - "image": - ImageAsset("cherry_blossom").pil_image.convert("RGB"), - "video": - VideoAsset(name="baby_reading", num_frames=16).np_ndarrays, + "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate, + "image": convert_image_mode( + ImageAsset("cherry_blossom").pil_image, "RGB" + ), + "video": VideoAsset(name="baby_reading", num_frames=16).np_ndarrays, }, }, - limit_mm_per_prompt={ - "audio": 1, - "image": 1, - "video": 1 - }, + limit_mm_per_prompt={"audio": 1, "image": 1, "video": 1}, ) def get_use_audio_in_video_query() -> QueryResult: - question = ("Describe the content of the video, " - "then convert what the baby say into text.") - prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n" - "<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>" - f"{question}<|im_end|>\n" - f"<|im_start|>assistant\n") + question = ( + "Describe the content of the video, then convert what the baby say into text." + ) + prompt = ( + f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>" + f"{question}<|im_end|>\n" + f"<|im_start|>assistant\n" + ) asset = VideoAsset(name="baby_reading", num_frames=16) audio = asset.get_audio(sampling_rate=16000) - assert not envs.VLLM_USE_V1, ("V1 does not support use_audio_in_video. " - "Please launch this example with " - "`VLLM_USE_V1=0`.") + assert not envs.VLLM_USE_V1, ( + "V1 does not support use_audio_in_video. " + "Please launch this example with " + "`VLLM_USE_V1=0`." 
+ ) return QueryResult( inputs={ "prompt": prompt, @@ -81,20 +87,19 @@ def get_use_audio_in_video_query() -> QueryResult: "use_audio_in_video": True, }, }, - limit_mm_per_prompt={ - "audio": 1, - "video": 1 - }, + limit_mm_per_prompt={"audio": 1, "video": 1}, ) def get_multi_audios_query() -> QueryResult: question = "Are these two audio clips the same?" - prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n" - "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>" - "<|audio_bos|><|AUDIO|><|audio_eos|>" - f"{question}<|im_end|>\n" - f"<|im_start|>assistant\n") + prompt = ( + f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>" + "<|audio_bos|><|AUDIO|><|audio_eos|>" + f"{question}<|im_end|>\n" + f"<|im_start|>assistant\n" + ) return QueryResult( inputs={ "prompt": prompt, @@ -122,18 +127,19 @@ def main(args): model_name = "Qwen/Qwen2.5-Omni-7B" query_result = query_map[args.query_type]() - llm = LLM(model=model_name, - max_model_len=5632, - max_num_seqs=5, - limit_mm_per_prompt=query_result.limit_mm_per_prompt, - seed=args.seed) + llm = LLM( + model=model_name, + max_model_len=5632, + max_num_seqs=5, + limit_mm_per_prompt=query_result.limit_mm_per_prompt, + seed=args.seed, + ) # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. 
sampling_params = SamplingParams(temperature=0.2, max_tokens=64) - outputs = llm.generate(query_result.inputs, - sampling_params=sampling_params) + outputs = llm.generate(query_result.inputs, sampling_params=sampling_params) for o in outputs: generated_text = o.outputs[0].text @@ -142,18 +148,23 @@ def main(args): def parse_args(): parser = FlexibleArgumentParser( - description='Demo on using vLLM for offline inference with ' - 'audio language models') - parser.add_argument('--query-type', - '-q', - type=str, - default="mixed_modalities", - choices=query_map.keys(), - help='Query type.') - parser.add_argument("--seed", - type=int, - default=None, - help="Set the seed when initializing `vllm.LLM`.") + description="Demo on using vLLM for offline inference with " + "audio language models" + ) + parser.add_argument( + "--query-type", + "-q", + type=str, + default="mixed_modalities", + choices=query_map.keys(), + help="Query type.", + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.", + ) return parser.parse_args() diff --git a/examples/offline_inference/qwen_1m.py b/examples/offline_inference/qwen_1m.py index 64a1f4c54b670..856a35b0e59be 100644 --- a/examples/offline_inference/qwen_1m.py +++ b/examples/offline_inference/qwen_1m.py @@ -17,10 +17,10 @@ def load_prompt() -> str: # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/1m.txt with urlopen( - "https://qianwen-res.oss-cn-beijing.aliyuncs.com" - "/Qwen2.5-1M/test-data/600k.txt", - timeout=5) as response: - prompt = response.read().decode('utf-8') + "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/600k.txt", + timeout=5, + ) as response: + prompt = response.read().decode("utf-8") return prompt @@ -41,18 +41,22 @@ def process_requests(llm: LLM, prompts: list[str]) -> None: for output in outputs: prompt_token_ids = output.prompt_token_ids generated_text = output.outputs[0].text - print(f"Prompt length: 
{len(prompt_token_ids)}, " - f"Generated text: {generated_text!r}") + print( + f"Prompt length: {len(prompt_token_ids)}, " + f"Generated text: {generated_text!r}" + ) # Create an LLM. def initialize_engine() -> LLM: - llm = LLM(model="Qwen/Qwen2.5-7B-Instruct-1M", - max_model_len=1048576, - tensor_parallel_size=4, - enforce_eager=True, - enable_chunked_prefill=True, - max_num_batched_tokens=131072) + llm = LLM( + model="Qwen/Qwen2.5-7B-Instruct-1M", + max_model_len=1048576, + tensor_parallel_size=4, + enforce_eager=True, + enable_chunked_prefill=True, + max_num_batched_tokens=131072, + ) return llm @@ -62,5 +66,5 @@ def main(): process_requests(llm, [prompt]) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/offline_inference/reproducibility.py b/examples/offline_inference/reproducibility.py index b2be117d1a0a9..6d048986e7109 100644 --- a/examples/offline_inference/reproducibility.py +++ b/examples/offline_inference/reproducibility.py @@ -1,24 +1,22 @@ # SPDX-License-Identifier: Apache-2.0 +""" +Demonstrates how to achieve reproducibility in vLLM. + +Main article: https://docs.vllm.ai/en/latest/usage/reproducibility.html +""" + import os +import random from vllm import LLM, SamplingParams -# vLLM does not guarantee the reproducibility of the results by default, -# for the sake of performance. You need to do the following to achieve -# reproducible results: -# 1. Turn off multiprocessing to make the scheduling deterministic. -# NOTE(woosuk): This is not needed and will be ignored for V0. +# V1 only: Turn off multiprocessing to make the scheduling deterministic. os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" -# 2. Fix the global seed for reproducibility. The default seed is None, which is + +# V0 only: Set the global seed. The default seed is None, which is # not reproducible. 
SEED = 42 -# NOTE(woosuk): Even with the above two settings, vLLM only provides -# reproducibility when it runs on the same hardware and the same vLLM version. -# Also, the online serving API (`vllm serve`) does not support reproducibility -# because it is almost impossible to make the scheduling deterministic in the -# online serving setting. - prompts = [ "Hello, my name is", "The president of the United States is", @@ -38,6 +36,11 @@ def main(): print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") print("-" * 50) + # Try generating random numbers outside vLLM + # The same number is output across runs, meaning that the random state + # in the user code has been updated by vLLM + print(random.randint(0, 100)) + if __name__ == "__main__": main() diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py index e0ed0ac49754b..a8f6977e29a49 100644 --- a/examples/offline_inference/rlhf.py +++ b/examples/offline_inference/rlhf.py @@ -12,6 +12,7 @@ inference instance. In practice, there could be multiple training instances and multiple inference instances. For the full implementation, please refer to the OpenRLHF framework. """ + import os import ray @@ -26,7 +27,6 @@ from vllm.utils import get_ip, get_open_port class MyLLM(LLM): - def __init__(self, *args, **kwargs): # a hack to make the script work. 
# stop ray from manipulating CUDA_VISIBLE_DEVICES @@ -89,8 +89,7 @@ print("-" * 50) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}\n" - f"Generated text: {generated_text!r}") + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") print("-" * 50) # set up the communication between the training process @@ -98,11 +97,13 @@ for output in outputs: master_address = get_ip() master_port = get_open_port() -handle = llm.collective_rpc.remote("init_weight_update_group", - args=(master_address, master_port, 1, 3)) +handle = llm.collective_rpc.remote( + "init_weight_update_group", args=(master_address, master_port, 1, 3) +) -model_update_group = stateless_init_process_group(master_address, master_port, - 0, 3, torch.device("cuda:0")) +model_update_group = stateless_init_process_group( + master_address, master_port, 0, 3, torch.device("cuda:0") +) ray.get(handle) # simulate training, modify the weights of the model. @@ -111,8 +112,7 @@ for name, p in train_model.named_parameters(): # sync weight from the training process to the inference engine. 
for name, p in train_model.named_parameters(): - handle = llm.collective_rpc.remote("update_weight", - args=(name, p.dtype, p.shape)) + handle = llm.collective_rpc.remote("update_weight", args=(name, p.dtype, p.shape)) model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream()) ray.get(handle) @@ -126,6 +126,5 @@ print("-" * 50) for output in outputs_updated: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}\n" - f"Generated text: {generated_text!r}") + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") print("-" * 50) diff --git a/examples/offline_inference/rlhf_colocate.py b/examples/offline_inference/rlhf_colocate.py index 3ceac0fa2e203..76eafdca1f6c7 100644 --- a/examples/offline_inference/rlhf_colocate.py +++ b/examples/offline_inference/rlhf_colocate.py @@ -9,6 +9,7 @@ The key points: - Use cuda-ipc to pass tensors, since NCCL does not work when we have multiple processes on the same GPU. """ + import os import ray @@ -20,7 +21,6 @@ from vllm import LLM class MyLLM(LLM): - def __init__(self, *args, bundle_indices: list, **kwargs): # a hack to make the script work. # stop ray from manipulating CUDA_VISIBLE_DEVICES @@ -29,17 +29,16 @@ class MyLLM(LLM): # every worker will use 0.4 GPU, so that we can schedule # 2 instances on the same GPUs. 
os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4" - os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join( - map(str, bundle_indices)) + os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join(map(str, bundle_indices)) print(f"creating LLM with bundle_indices={bundle_indices}") super().__init__(*args, **kwargs) class RayTrainingActor: - def __init__(self): # ray will set CUDA_VISIBLE_DEVICES to the assigned GPUs from transformers import AutoModelForCausalLM + self.model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") self.model.to("cuda:0") for name, p in self.model.named_parameters(): @@ -48,6 +47,7 @@ class RayTrainingActor: # the argument for get_device_uuid is the index # of the GPU in the visible devices. from vllm.platforms import current_platform + self.device_uuid = current_platform.get_device_uuid(0) def report_device_id(self) -> str: @@ -55,6 +55,7 @@ class RayTrainingActor: def get_weight_ipc_handles(self): from torch.multiprocessing.reductions import reduce_tensor + data = {} for name, p in self.model.named_parameters(): # the training actor might only have a subset of the weights @@ -101,7 +102,7 @@ for bundle_index, training_actor in enumerate(training_actors): print(f"training actor {bundle_index} is on {device_id}") training_actor_device_ids.append(device_id) -for (i, bundle_indices) in enumerate([[0, 1], [2, 3]]): +for i, bundle_indices in enumerate([[0, 1], [2, 3]]): # IMPORTANT: when creating vLLM instances, we need to # make sure there are no GPU activities on the target GPUs, # otherwise, they will interfere with the vLLM memory profiling, @@ -128,7 +129,8 @@ for (i, bundle_indices) in enumerate([[0, 1], [2, 3]]): for i, llm in enumerate(inference_engines): inference_engine_device_ids.append( - ray.get(llm.collective_rpc.remote("report_device_id", args=tuple()))) + ray.get(llm.collective_rpc.remote("report_device_id", args=tuple())) + ) print(f"inference engine {i} is on {inference_engine_device_ids[-1]}") # check the placement @@ -147,9 +149,10 
@@ for actor in training_actors: print("update the weights of the inference engines") for llm in inference_engines: ray.get( - llm.collective_rpc.remote("update_weights_from_ipc_handles", - args=(ipc_handles, ))) + llm.collective_rpc.remote( + "update_weights_from_ipc_handles", args=(ipc_handles,) + ) + ) print("check if the weights are updated") for llm in inference_engines: - assert ray.get( - llm.collective_rpc.remote("check_weights_changed", args=tuple())) + assert ray.get(llm.collective_rpc.remote("check_weights_changed", args=tuple())) diff --git a/examples/offline_inference/rlhf_utils.py b/examples/offline_inference/rlhf_utils.py index 11b73b7c4a0ab..3461af707eba8 100644 --- a/examples/offline_inference/rlhf_utils.py +++ b/examples/offline_inference/rlhf_utils.py @@ -2,21 +2,20 @@ import torch -def stateless_init_process_group(master_address, master_port, rank, world_size, - device): +def stateless_init_process_group(master_address, master_port, rank, world_size, device): """ vLLM provides `StatelessProcessGroup` to create a process group without considering the global process group in torch.distributed. It is recommended to create `StatelessProcessGroup`, and then initialize - the data-plane communication (NCCL) between external (train processes) + the data-plane communication (NCCL) between external (train processes) and vLLM workers. """ from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator from vllm.distributed.utils import StatelessProcessGroup - pg = StatelessProcessGroup.create(host=master_address, - port=master_port, - rank=rank, - world_size=world_size) + + pg = StatelessProcessGroup.create( + host=master_address, port=master_port, rank=rank, world_size=world_size + ) pynccl = PyNcclCommunicator(pg, device=device) return pynccl @@ -31,9 +30,11 @@ class WorkerExtension: should pass the full qualified name as `worker_extension_cls` argument. 
""" - def init_weight_update_group(self, master_address, master_port, - rank_offset, world_size): + def init_weight_update_group( + self, master_address, master_port, rank_offset, world_size + ): from vllm.distributed.parallel_state import get_world_group + rank = get_world_group().rank + rank_offset self.model_update_group = stateless_init_process_group( master_address, @@ -45,9 +46,9 @@ class WorkerExtension: def update_weight(self, name, dtype, shape): weight = torch.empty(shape, dtype=dtype, device="cuda") - self.model_update_group.broadcast(weight, - src=0, - stream=torch.cuda.current_stream()) + self.model_update_group.broadcast( + weight, src=0, stream=torch.cuda.current_stream() + ) self.model_runner.model.load_weights(weights=[(name, weight)]) @@ -59,8 +60,7 @@ class WorkerExtension: """ weights_updated = True for name, p in self.model_runner.model.named_parameters(): - weights_updated = weights_updated and torch.allclose( - p, torch.zeros_like(p)) + weights_updated = weights_updated and torch.allclose(p, torch.zeros_like(p)) return weights_updated @@ -76,6 +76,7 @@ class ColocateWorkerExtension: def report_device_id(self) -> str: from vllm.platforms import current_platform + self.device_uuid = current_platform.get_device_uuid(self.device.index) return self.device_uuid @@ -100,6 +101,5 @@ class ColocateWorkerExtension: """ weights_updated = True for name, p in self.model_runner.model.named_parameters(): - weights_updated = weights_updated and torch.allclose( - p, torch.zeros_like(p)) + weights_updated = weights_updated and torch.allclose(p, torch.zeros_like(p)) return weights_updated diff --git a/examples/offline_inference/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py index 338380cc96841..860fe2b5fe067 100644 --- a/examples/offline_inference/save_sharded_state.py +++ b/examples/offline_inference/save_sharded_state.py @@ -21,6 +21,7 @@ llm = LLM( tensor_parallel_size=8, ) """ + import dataclasses import os import shutil @@ -33,18 
+34,18 @@ from vllm.utils import FlexibleArgumentParser def parse_args(): parser = FlexibleArgumentParser() EngineArgs.add_cli_args(parser) - parser.add_argument("--output", - "-o", - required=True, - type=str, - help="path to output checkpoint") - parser.add_argument("--file-pattern", - type=str, - help="string pattern of saved filenames") - parser.add_argument("--max-file-size", - type=str, - default=5 * 1024**3, - help="max size (in bytes) of each safetensors file") + parser.add_argument( + "--output", "-o", required=True, type=str, help="path to output checkpoint" + ) + parser.add_argument( + "--file-pattern", type=str, help="string pattern of saved filenames" + ) + parser.add_argument( + "--max-file-size", + type=str, + default=5 * 1024**3, + help="max size (in bytes) of each safetensors file", + ) return parser.parse_args() @@ -68,23 +69,23 @@ def main(args): # For V1 engine, we need to use engine_core.save_sharded_state print("Using V1 engine save path") llm.llm_engine.engine_core.save_sharded_state( - path=args.output, - pattern=args.file_pattern, - max_size=args.max_file_size) + path=args.output, pattern=args.file_pattern, max_size=args.max_file_size + ) else: # For V0 engine print("Using V0 engine save path") model_executor = llm.llm_engine.model_executor - model_executor.save_sharded_state(path=args.output, - pattern=args.file_pattern, - max_size=args.max_file_size) + model_executor.save_sharded_state( + path=args.output, pattern=args.file_pattern, max_size=args.max_file_size + ) # Copy metadata files to output directory for file in os.listdir(model_path): if os.path.splitext(file)[1] not in (".bin", ".pt", ".safetensors"): if os.path.isdir(os.path.join(model_path, file)): - shutil.copytree(os.path.join(model_path, file), - os.path.join(args.output, file)) + shutil.copytree( + os.path.join(model_path, file), os.path.join(args.output, file) + ) else: shutil.copy(os.path.join(model_path, file), args.output) diff --git 
a/examples/offline_inference/structured_outputs.py b/examples/offline_inference/structured_outputs.py index 363b500e0adf8..9ed7299606b7e 100644 --- a/examples/offline_inference/structured_outputs.py +++ b/examples/offline_inference/structured_outputs.py @@ -1,9 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 """ -This file demonstrates the example usage of guided decoding -to generate structured outputs using vLLM. It shows how to apply -different guided decoding techniques such as Choice, Regex, JSON schema, -and Grammar to produce structured and formatted results +This file demonstrates the example usage of guided decoding +to generate structured outputs using vLLM. It shows how to apply +different guided decoding techniques such as Choice, Regex, JSON schema, +and Grammar to produce structured and formatted results based on specific prompts. """ @@ -15,20 +15,20 @@ from vllm import LLM, SamplingParams from vllm.sampling_params import GuidedDecodingParams # Guided decoding by Choice (list of possible options) -guided_decoding_params_choice = GuidedDecodingParams( - choice=["Positive", "Negative"]) -sampling_params_choice = SamplingParams( - guided_decoding=guided_decoding_params_choice) +guided_decoding_params_choice = GuidedDecodingParams(choice=["Positive", "Negative"]) +sampling_params_choice = SamplingParams(guided_decoding=guided_decoding_params_choice) prompt_choice = "Classify this sentiment: vLLM is wonderful!" # Guided decoding by Regex guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n") sampling_params_regex = SamplingParams( - guided_decoding=guided_decoding_params_regex, stop=["\n"]) + guided_decoding=guided_decoding_params_regex, stop=["\n"] +) prompt_regex = ( "Generate an email address for Alan Turing, who works in Enigma." "End in .com and new line. 
Example result:" - "alan.turing@enigma.com\n") + "alan.turing@enigma.com\n" +) # Guided decoding by JSON using Pydantic schema @@ -47,10 +47,11 @@ class CarDescription(BaseModel): json_schema = CarDescription.model_json_schema() guided_decoding_params_json = GuidedDecodingParams(json=json_schema) -sampling_params_json = SamplingParams( - guided_decoding=guided_decoding_params_json) -prompt_json = ("Generate a JSON with the brand, model and car_type of" - "the most iconic car from the 90's") +sampling_params_json = SamplingParams(guided_decoding=guided_decoding_params_json) +prompt_json = ( + "Generate a JSON with the brand, model and car_type of" + "the most iconic car from the 90's" +) # Guided decoding by Grammar simplified_sql_grammar = """ @@ -61,12 +62,11 @@ table ::= "table_1 " | "table_2 " condition ::= column "= " number number ::= "1 " | "2 " """ -guided_decoding_params_grammar = GuidedDecodingParams( - grammar=simplified_sql_grammar) -sampling_params_grammar = SamplingParams( - guided_decoding=guided_decoding_params_grammar) -prompt_grammar = ("Generate an SQL query to show the 'username' and 'email'" - "from the 'users' table.") +guided_decoding_params_grammar = GuidedDecodingParams(grammar=simplified_sql_grammar) +sampling_params_grammar = SamplingParams(guided_decoding=guided_decoding_params_grammar) +prompt_grammar = ( + "Generate an SQL query to show the 'username' and 'email'from the 'users' table." 
+) def format_output(title: str, output: str): @@ -90,8 +90,7 @@ def main(): json_output = generate_output(prompt_json, sampling_params_json, llm) format_output("Guided decoding by JSON", json_output) - grammar_output = generate_output(prompt_grammar, sampling_params_grammar, - llm) + grammar_output = generate_output(prompt_grammar, sampling_params_grammar, llm) format_output("Guided decoding by Grammar", grammar_output) diff --git a/examples/offline_inference/torchrun_example.py b/examples/offline_inference/torchrun_example.py index bb61a0a29e322..2fa49c0835e32 100644 --- a/examples/offline_inference/torchrun_example.py +++ b/examples/offline_inference/torchrun_example.py @@ -45,8 +45,7 @@ if dist.get_rank() == 0: for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}\n" - f"Generated text: {generated_text!r}\n") + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n") print("-" * 50) """ Further tips: diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py index 71cd88f2788ad..e4a75b3f93803 100644 --- a/examples/offline_inference/tpu.py +++ b/examples/offline_inference/tpu.py @@ -20,10 +20,12 @@ sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16) def main(): # Set `enforce_eager=True` to avoid ahead-of-time compilation. # In real workloads, `enforace_eager` should be `False`. 
- llm = LLM(model="Qwen/Qwen2-1.5B-Instruct", - max_num_batched_tokens=64, - max_num_seqs=4, - max_model_len=128) + llm = LLM( + model="Qwen/Qwen2-1.5B-Instruct", + max_num_batched_tokens=64, + max_num_seqs=4, + max_model_len=128, + ) outputs = llm.generate(prompts, sampling_params) print("-" * 50) for output, answer in zip(outputs, answers): diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index c54f328c7a382..f0504501639d2 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -6,6 +6,7 @@ the correct prompt format on vision language models for text generation. For most models, the prompt format should follow corresponding examples on HuggingFace model repository. """ + import os import random from contextlib import contextmanager @@ -19,6 +20,7 @@ from vllm import LLM, EngineArgs, SamplingParams from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset from vllm.lora.request import LoRARequest +from vllm.multimodal.image import convert_image_mode from vllm.utils import FlexibleArgumentParser @@ -48,9 +50,13 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData: limit_mm_per_prompt={modality: 1}, ) - prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}" - "<|im_end|>\n<|im_start|>assistant\n") - for question in questions] + prompts = [ + ( + f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}" + "<|im_end|>\n<|im_start|>assistant\n" + ) + for question in questions + ] stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] @@ -134,8 +140,7 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData: ) prompts = [ - f"<|User|>: <image>\n{question}\n\n<|Assistant|>:" - for question in questions + f"<|User|>: <image>\n{question}\n\n<|Assistant|>:" for question in questions ] return ModelRequestData( @@ -197,9 +202,14 @@ def run_gemma3(questions: 
list[str], modality: str) -> ModelRequestData: limit_mm_per_prompt={modality: 1}, ) - prompts = [("<bos><start_of_turn>user\n" - f"<start_of_image>{question}<end_of_turn>\n" - "<start_of_turn>model\n") for question in questions] + prompts = [ + ( + "<bos><start_of_turn>user\n" + f"<start_of_image>{question}<end_of_turn>\n" + "<start_of_turn>model\n" + ) + for question in questions + ] return ModelRequestData( engine_args=engine_args, @@ -224,7 +234,8 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData: prompts = [ f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\ - {question}<|assistant|>" for question in questions + {question}<|assistant|>" + for question in questions ] stop_token_ids = [151329, 151336, 151338] @@ -249,15 +260,13 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData: limit_mm_per_prompt={modality: 1}, ) - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - messages = [[{ - 'role': 'user', - 'content': f"<image>\n{question}" - }] for question in questions] - prompts = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + messages = [ + [{"role": "user", "content": f"<image>\n{question}"}] for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) # Stop tokens for H2OVL-Mississippi # https://huggingface.co/h2oai/h2ovl-mississippi-800m @@ -283,15 +292,14 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: # if you are running out of memory, you can reduce the "longest_edge". 
# see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations mm_processor_kwargs={ - "size": { - "longest_edge": 3 * 364 - }, + "size": {"longest_edge": 3 * 364}, }, limit_mm_per_prompt={modality: 1}, ) - prompts = [( - f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:" - ) for question in questions] + prompts = [ + (f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:") + for question in questions + ] return ModelRequestData( engine_args=engine_args, @@ -310,9 +318,7 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData: max_num_seqs=2, enforce_eager=True, mm_processor_kwargs={ - "max_image_size": { - "longest_edge": 384 - }, + "max_image_size": {"longest_edge": 384}, }, limit_mm_per_prompt={modality: 1}, ) @@ -329,26 +335,28 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData: # InternVL def run_internvl(questions: list[str], modality: str) -> ModelRequestData: - assert modality == "image" - - model_name = "OpenGVLab/InternVL2-2B" + model_name = "OpenGVLab/InternVL3-2B" engine_args = EngineArgs( model=model_name, trust_remote_code=True, - max_model_len=4096, + max_model_len=8192, limit_mm_per_prompt={modality: 1}, ) - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - messages = [[{ - 'role': 'user', - 'content': f"<image>\n{question}" - }] for question in questions] - prompts = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + if modality == "image": + placeholder = "<image>" + elif modality == "video": + placeholder = "<video>" + + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + messages = [ + [{"role": "user", "content": f"{placeholder}\n{question}"}] + for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) # Stop tokens for InternVL # models variants may have different stop 
tokens @@ -356,6 +364,7 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData: # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None] return ModelRequestData( engine_args=engine_args, @@ -371,7 +380,8 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData: prompts = [ "<|im_user|>user<|im_middle|><|media_start|>image<|media_content|>" f"<|media_pad|><|media_end|>{question}<|im_end|>" - "<|im_assistant|>assistant<|im_middle|>" for question in questions + "<|im_assistant|>assistant<|im_middle|>" + for question in questions ] engine_args = EngineArgs( @@ -391,9 +401,7 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData: def run_llava(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" - prompts = [ - f"USER: <image>\n{question}\nASSISTANT:" for question in questions - ] + prompts = [f"USER: <image>\n{question}\nASSISTANT:" for question in questions] engine_args = EngineArgs( model="llava-hf/llava-1.5-7b-hf", @@ -426,13 +434,10 @@ def run_llava_next(questions: list[str], modality: str) -> ModelRequestData: # LlaVA-NeXT-Video # Currently only support for video input -def run_llava_next_video(questions: list[str], - modality: str) -> ModelRequestData: +def run_llava_next_video(questions: list[str], modality: str) -> ModelRequestData: assert modality == "video" - prompts = [ - f"USER: <video>\n{question} ASSISTANT:" for question in questions - ] + prompts = [f"USER: <video>\n{question} ASSISTANT:" for question in questions] engine_args = EngineArgs( model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192, @@ -447,19 +452,19 @@ def run_llava_next_video(questions: list[str], # LLaVA-OneVision -def 
run_llava_onevision(questions: list[str], - modality: str) -> ModelRequestData: - +def run_llava_onevision(questions: list[str], modality: str) -> ModelRequestData: if modality == "video": prompts = [ f"<|im_start|>user <video>\n{question}<|im_end|> \ - <|im_start|>assistant\n" for question in questions + <|im_start|>assistant\n" + for question in questions ] elif modality == "image": prompts = [ f"<|im_start|>user <image>\n{question}<|im_end|> \ - <|im_start|>assistant\n" for question in questions + <|im_start|>assistant\n" + for question in questions ] engine_args = EngineArgs( @@ -478,11 +483,8 @@ def run_llava_onevision(questions: list[str], def run_mantis(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" - llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa: E501 - prompts = [ - llama3_template.format(f"{question}\n<image>") - for question in questions - ] + llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" # noqa: E501 + prompts = [llama3_template.format(f"{question}\n<image>") for question in questions] engine_args = EngineArgs( model="TIGER-Lab/Mantis-8B-siglip-llama3", @@ -522,8 +524,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name): # 2.6: image, video # o2.6: image, video, audio # model_name = "openbmb/MiniCPM-o-2_6" - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) engine_args = EngineArgs( model=model_name, max_model_len=4096, @@ -539,7 +540,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name): # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id] # 2.6 / o2.6 - stop_tokens = ['<|im_end|>', '<|endoftext|>'] + stop_tokens = ["<|im_end|>", "<|endoftext|>"] stop_token_ids = 
[tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] modality_placeholder = { @@ -549,12 +550,16 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name): prompts = [ tokenizer.apply_chat_template( - [{ - 'role': 'user', - 'content': f"{modality_placeholder[modality]}\n{question}" - }], + [ + { + "role": "user", + "content": f"{modality_placeholder[modality]}\n{question}", + } + ], tokenize=False, - add_generation_prompt=True) for question in questions + add_generation_prompt=True, + ) + for question in questions ] return ModelRequestData( @@ -614,19 +619,18 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData: ) tokenizer = AutoTokenizer.from_pretrained(model_name) - messages = [[{ - "role": - "user", - "content": [{ - "type": "image" - }, { - "type": "text", - "text": question - }] - }] for question in questions] - prompts = tokenizer.apply_chat_template(messages, - add_generation_prompt=True, - tokenize=False) + messages = [ + [ + { + "role": "user", + "content": [{"type": "image"}, {"type": "text", "text": question}], + } + ] + for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False + ) return ModelRequestData( engine_args=engine_args, @@ -649,19 +653,18 @@ def run_llama4(questions: list[str], modality: str) -> ModelRequestData: ) tokenizer = AutoTokenizer.from_pretrained(model_name) - messages = [[{ - "role": - "user", - "content": [{ - "type": "image" - }, { - "type": "text", - "text": f"{question}" - }] - }] for question in questions] - prompts = tokenizer.apply_chat_template(messages, - add_generation_prompt=True, - tokenize=False) + messages = [ + [ + { + "role": "user", + "content": [{"type": "image"}, {"type": "text", "text": f"{question}"}], + } + ] + for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False + ) stop_token_ids = None return ModelRequestData( 
engine_args=engine_args, @@ -685,7 +688,8 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData: prompts = [ f"<|im_start|>user <image>\n{question}<|im_end|> \ - <|im_start|>assistant\n" for question in questions + <|im_start|>assistant\n" + for question in questions ] return ModelRequestData( @@ -709,15 +713,13 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData: limit_mm_per_prompt={modality: 1}, ) - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - messages = [[{ - 'role': 'user', - 'content': f"<image>\n{question}" - }] for question in questions] - prompts = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + messages = [ + [{"role": "user", "content": f"<image>\n{question}"}] for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) return ModelRequestData( engine_args=engine_args, @@ -740,15 +742,13 @@ def run_ovis(questions: list[str], modality: str) -> ModelRequestData: limit_mm_per_prompt={modality: 1}, ) - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - messages = [[{ - 'role': 'user', - 'content': f"<image>\n{question}" - }] for question in questions] - prompts = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + messages = [ + [{"role": "user", "content": f"<image>\n{question}"}] for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) return ModelRequestData( engine_args=engine_args, @@ -839,8 +839,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData: # we have to manually specify the path of the lora weights. 
vision_lora_path = os.path.join(model_path, "vision-lora") prompts = [ - f"<|user|><|image_1|>{question}<|end|><|assistant|>" - for question in questions + f"<|user|><|image_1|>{question}<|end|><|assistant|>" for question in questions ] engine_args = EngineArgs( model=model_path, @@ -907,7 +906,6 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData: # Qwen2-VL def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData: - model_name = "Qwen/Qwen2-VL-7B-Instruct" engine_args = EngineArgs( @@ -928,10 +926,13 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData: placeholder = "<|video_pad|>" prompts = [ - ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" - f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" - f"{question}<|im_end|>\n" - "<|im_start|>assistant\n") for question in questions + ( + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n" + ) + for question in questions ] return ModelRequestData( @@ -942,7 +943,6 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData: # Qwen2.5-VL def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData: - model_name = "Qwen/Qwen2.5-VL-3B-Instruct" engine_args = EngineArgs( @@ -963,10 +963,13 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData: placeholder = "<|video_pad|>" prompts = [ - ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" - f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" - f"{question}<|im_end|>\n" - "<|im_start|>assistant\n") for question in questions + ( + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n" + ) + for question in questions ] return ModelRequestData( 
@@ -999,12 +1002,18 @@ def run_qwen2_5_omni(questions: list[str], modality: str): default_system = ( "You are Qwen, a virtual human developed by the Qwen Team, Alibaba " "Group, capable of perceiving auditory and visual inputs, as well as " - "generating text and speech.") + "generating text and speech." + ) - prompts = [(f"<|im_start|>system\n{default_system}<|im_end|>\n" - f"<|im_start|>user\n<|vision_bos|>{placeholder}<|vision_eos|>" - f"{question}<|im_end|>\n" - "<|im_start|>assistant\n") for question in questions] + prompts = [ + ( + f"<|im_start|>system\n{default_system}<|im_end|>\n" + f"<|im_start|>user\n<|vision_bos|>{placeholder}<|vision_eos|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n" + ) + for question in questions + ] return ModelRequestData( engine_args=engine_args, prompts=prompts, @@ -1024,15 +1033,13 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: limit_mm_per_prompt={modality: 1}, ) - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - messages = [[{ - 'role': 'user', - 'content': f"<image>\n{question}" - }] for question in questions] - prompts = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + messages = [ + [{"role": "user", "content": f"<image>\n{question}"}] for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) # Stop tokens for SkyworkR1V # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py @@ -1096,8 +1103,7 @@ def get_multi_modal_input(args): """ if args.modality == "image": # Input image and question - image = ImageAsset("cherry_blossom") \ - .pil_image.convert("RGB") + image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB") img_questions = [ "What is the content of this image?", "Describe the content of this image in detail.", @@ 
-1112,8 +1118,7 @@ def get_multi_modal_input(args): if args.modality == "video": # Input video and question - video = VideoAsset(name="baby_reading", - num_frames=args.num_frames).np_ndarrays + video = VideoAsset(name="baby_reading", num_frames=args.num_frames).np_ndarrays vid_questions = ["Why is this video funny?"] return { @@ -1125,12 +1130,13 @@ def get_multi_modal_input(args): raise ValueError(msg) -def apply_image_repeat(image_repeat_prob, num_prompts, data, - prompts: list[str], modality): - """Repeats images with provided probability of "image_repeat_prob". +def apply_image_repeat( + image_repeat_prob, num_prompts, data, prompts: list[str], modality +): + """Repeats images with provided probability of "image_repeat_prob". Used to simulate hit/miss for the MM preprocessor cache. """ - assert (image_repeat_prob <= 1.0 and image_repeat_prob >= 0) + assert image_repeat_prob <= 1.0 and image_repeat_prob >= 0 no_yes = [0, 1] probs = [1.0 - image_repeat_prob, image_repeat_prob] @@ -1145,12 +1151,12 @@ def apply_image_repeat(image_repeat_prob, num_prompts, data, new_val = (i // 256 // 256, i // 256, i % 256) cur_image.putpixel((0, 0), new_val) - inputs.append({ - "prompt": prompts[i % len(prompts)], - "multi_modal_data": { - modality: cur_image + inputs.append( + { + "prompt": prompts[i % len(prompts)], + "multi_modal_data": {modality: cur_image}, } - }) + ) return inputs @@ -1159,6 +1165,7 @@ def apply_image_repeat(image_repeat_prob, num_prompts, data, def time_counter(enable: bool): if enable: import time + start_time = time.time() yield elapsed_time = time.time() - start_time @@ -1171,54 +1178,65 @@ def time_counter(enable: bool): def parse_args(): parser = FlexibleArgumentParser( - description='Demo on using vLLM for offline inference with ' - 'vision language models for text generation') - parser.add_argument('--model-type', - '-m', - type=str, - default="llava", - choices=model_example_map.keys(), - help='Huggingface "model_type".') - 
parser.add_argument('--num-prompts', - type=int, - default=4, - help='Number of prompts to run.') - parser.add_argument('--modality', - type=str, - default="image", - choices=['image', 'video'], - help='Modality of the input.') - parser.add_argument('--num-frames', - type=int, - default=16, - help='Number of frames to extract from the video.') - parser.add_argument("--seed", - type=int, - default=None, - help="Set the seed when initializing `vllm.LLM`.") + description="Demo on using vLLM for offline inference with " + "vision language models for text generation" + ) + parser.add_argument( + "--model-type", + "-m", + type=str, + default="llava", + choices=model_example_map.keys(), + help='Huggingface "model_type".', + ) + parser.add_argument( + "--num-prompts", type=int, default=4, help="Number of prompts to run." + ) + parser.add_argument( + "--modality", + type=str, + default="image", + choices=["image", "video"], + help="Modality of the input.", + ) + parser.add_argument( + "--num-frames", + type=int, + default=16, + help="Number of frames to extract from the video.", + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.", + ) parser.add_argument( - '--image-repeat-prob', + "--image-repeat-prob", type=float, default=None, - help='Simulates the hit-ratio for multi-modal preprocessor cache' - ' (if enabled)') + help="Simulates the hit-ratio for multi-modal preprocessor cache (if enabled)", + ) parser.add_argument( - '--disable-mm-preprocessor-cache', - action='store_true', - help='If True, disables caching of multi-modal preprocessor/mapper.') + "--disable-mm-preprocessor-cache", + action="store_true", + help="If True, disables caching of multi-modal preprocessor/mapper.", + ) parser.add_argument( - '--time-generate', - action='store_true', - help='If True, then print the total generate() call time') + "--time-generate", + action="store_true", + help="If True, then print the total generate() call 
time", + ) parser.add_argument( - '--use-different-prompt-per-request', - action='store_true', - help='If True, then use different prompt (with the same multi-modal ' - 'data) for each request.') + "--use-different-prompt-per-request", + action="store_true", + help="If True, then use different prompt (with the same multi-modal " + "data) for each request.", + ) return parser.parse_args() @@ -1237,7 +1255,8 @@ def main(args): # Disable other modalities to save memory default_limits = {"image": 0, "video": 0, "audio": 0} req_data.engine_args.limit_mm_per_prompt = default_limits | dict( - req_data.engine_args.limit_mm_per_prompt or {}) + req_data.engine_args.limit_mm_per_prompt or {} + ) engine_args = asdict(req_data.engine_args) | { "seed": args.seed, @@ -1246,44 +1265,46 @@ def main(args): llm = LLM(**engine_args) # Don't want to check the flag multiple times, so just hijack `prompts`. - prompts = req_data.prompts if args.use_different_prompt_per_request else [ - req_data.prompts[0] - ] + prompts = ( + req_data.prompts + if args.use_different_prompt_per_request + else [req_data.prompts[0]] + ) # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. 
- sampling_params = SamplingParams(temperature=0.2, - max_tokens=64, - stop_token_ids=req_data.stop_token_ids) + sampling_params = SamplingParams( + temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids + ) assert args.num_prompts > 0 if args.num_prompts == 1: # Single inference inputs = { "prompt": prompts[0], - "multi_modal_data": { - modality: data - }, + "multi_modal_data": {modality: data}, } else: # Batch inference if args.image_repeat_prob is not None: # Repeat images with specified probability of "image_repeat_prob" - inputs = apply_image_repeat(args.image_repeat_prob, - args.num_prompts, data, prompts, - modality) + inputs = apply_image_repeat( + args.image_repeat_prob, args.num_prompts, data, prompts, modality + ) else: # Use the same image for all prompts - inputs = [{ - "prompt": prompts[i % len(prompts)], - "multi_modal_data": { - modality: data - }, - } for i in range(args.num_prompts)] + inputs = [ + { + "prompt": prompts[i % len(prompts)], + "multi_modal_data": {modality: data}, + } + for i in range(args.num_prompts) + ] # Add LoRA request if applicable - lora_request = (req_data.lora_requests * - args.num_prompts if req_data.lora_requests else None) + lora_request = ( + req_data.lora_requests * args.num_prompts if req_data.lora_requests else None + ) with time_counter(args.time_generate): outputs = llm.generate( diff --git a/examples/offline_inference/vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py index 2637949551a1a..cee02d06c607c 100644 --- a/examples/offline_inference/vision_language_embedding.py +++ b/examples/offline_inference/vision_language_embedding.py @@ -6,6 +6,7 @@ the correct prompt format on vision language models for multimodal embedding. For most models, the prompt format should follow corresponding examples on HuggingFace model repository. 
""" + from argparse import Namespace from dataclasses import asdict from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args @@ -44,19 +45,17 @@ class ModelRequestData(NamedTuple): def run_e5_v(query: Query) -> ModelRequestData: - llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501 + llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501 if query["modality"] == "text": text = query["text"] - prompt = llama3_template.format( - f"{text}\nSummary above sentence in one word: ") + prompt = llama3_template.format(f"{text}\nSummary above sentence in one word: ") image = None elif query["modality"] == "image": - prompt = llama3_template.format( - "<image>\nSummary above image in one word: ") + prompt = llama3_template.format("<image>\nSummary above image in one word: ") image = query["image"] else: - modality = query['modality'] + modality = query["modality"] raise ValueError(f"Unsupported query modality: '{modality}'") engine_args = EngineArgs( @@ -83,10 +82,12 @@ def run_vlm2vec(query: Query) -> ModelRequestData: image = query["image"] elif query["modality"] == "text+image": text = query["text"] - prompt = f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501 + prompt = ( + f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501 + ) image = query["image"] else: - modality = query['modality'] + modality = query["modality"] raise ValueError(f"Unsupported query modality: '{modality}'") engine_args = EngineArgs( @@ -136,7 +137,8 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]): # Disable other modalities to save memory default_limits = {"image": 0, "video": 0, "audio": 0} req_data.engine_args.limit_mm_per_prompt = default_limits | dict( - req_data.engine_args.limit_mm_per_prompt or {}) 
+ req_data.engine_args.limit_mm_per_prompt or {} + ) engine_args = asdict(req_data.engine_args) | {"seed": seed} llm = LLM(**engine_args) @@ -145,10 +147,12 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]): if req_data.image is not None: mm_data["image"] = req_data.image - outputs = llm.embed({ - "prompt": req_data.prompt, - "multi_modal_data": mm_data, - }) + outputs = llm.embed( + { + "prompt": req_data.prompt, + "multi_modal_data": mm_data, + } + ) print("-" * 50) for output in outputs: @@ -164,23 +168,30 @@ model_example_map = { def parse_args(): parser = FlexibleArgumentParser( - description='Demo on using vLLM for offline inference with ' - 'vision language models for multimodal embedding') - parser.add_argument('--model-name', - '-m', - type=str, - default="vlm2vec", - choices=model_example_map.keys(), - help='The name of the embedding model.') - parser.add_argument('--modality', - type=str, - default="image", - choices=get_args(QueryModality), - help='Modality of the input.') - parser.add_argument("--seed", - type=int, - default=None, - help="Set the seed when initializing `vllm.LLM`.") + description="Demo on using vLLM for offline inference with " + "vision language models for multimodal embedding" + ) + parser.add_argument( + "--model-name", + "-m", + type=str, + default="vlm2vec", + choices=model_example_map.keys(), + help="The name of the embedding model.", + ) + parser.add_argument( + "--modality", + type=str, + default="image", + choices=get_args(QueryModality), + help="Modality of the input.", + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.", + ) return parser.parse_args() diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 20a8e635e322f..e776ff7fe6aec 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ 
b/examples/offline_inference/vision_language_multi_image.py @@ -4,6 +4,7 @@ This example shows how to use vLLM for running offline inference with multi-image input on vision language models for text generation, using the chat template defined by the model. """ + import os from argparse import Namespace from dataclasses import asdict @@ -59,8 +60,9 @@ def load_aria(question: str, image_urls: list[str]) -> ModelRequestData: limit_mm_per_prompt={"image": len(image_urls)}, ) placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls) - prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n" - "<|im_start|>assistant\n") + prompt = ( + f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n<|im_start|>assistant\n" + ) stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] return ModelRequestData( @@ -81,23 +83,21 @@ def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData: ) placeholders = [{"type": "image", "image": url} for url in image_urls] - messages = [{ - "role": - "user", - "content": [ - *placeholders, - { - "type": "text", - "text": question - }, - ], - }] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + } + ] processor = AutoProcessor.from_pretrained(model_name) - prompt = processor.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) return ModelRequestData( engine_args=engine_args, @@ -106,8 +106,7 @@ def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData: ) -def load_deepseek_vl2(question: str, - image_urls: list[str]) -> ModelRequestData: +def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "deepseek-ai/deepseek-vl2-tiny" engine_args = EngineArgs( @@ -118,8 +117,9 @@ def load_deepseek_vl2(question: str, limit_mm_per_prompt={"image": len(image_urls)}, ) - 
placeholder = "".join(f"image_{i}:<image>\n" - for i, _ in enumerate(image_urls, start=1)) + placeholder = "".join( + f"image_{i}:<image>\n" for i, _ in enumerate(image_urls, start=1) + ) prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:" return ModelRequestData( @@ -140,23 +140,21 @@ def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData: ) placeholders = [{"type": "image", "image": url} for url in image_urls] - messages = [{ - "role": - "user", - "content": [ - *placeholders, - { - "type": "text", - "text": question - }, - ], - }] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + } + ] processor = AutoProcessor.from_pretrained(model_name) - prompt = processor.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) return ModelRequestData( engine_args=engine_args, @@ -176,15 +174,15 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData: mm_processor_kwargs={"max_dynamic_patch": 4}, ) - placeholders = "\n".join(f"Image-{i}: <image>\n" - for i, _ in enumerate(image_urls, start=1)) - messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}] + placeholders = "\n".join( + f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1) + ) + messages = [{"role": "user", "content": f"{placeholders}\n{question}"}] - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + prompt = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) # Stop tokens for H2OVL-Mississippi # https://huggingface.co/h2oai/h2ovl-mississippi-800m @@ -211,14 +209,13 @@ def load_idefics3(question: str, 
image_urls: list[str]) -> ModelRequestData: # if you are running out of memory, you can reduce the "longest_edge". # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations mm_processor_kwargs={ - "size": { - "longest_edge": 2 * 364 - }, + "size": {"longest_edge": 2 * 364}, }, ) - placeholders = "\n".join(f"Image-{i}: <image>\n" - for i, _ in enumerate(image_urls, start=1)) + placeholders = "\n".join( + f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1) + ) prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501 return ModelRequestData( engine_args=engine_args, @@ -238,15 +235,16 @@ def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData: enforce_eager=True, limit_mm_per_prompt={"image": len(image_urls)}, mm_processor_kwargs={ - "max_image_size": { - "longest_edge": 384 - }, + "max_image_size": {"longest_edge": 384}, }, ) - placeholders = "\n".join(f"Image-{i}: <image>\n" - for i, _ in enumerate(image_urls, start=1)) - prompt = f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501 + placeholders = "\n".join( + f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1) + ) + prompt = ( + f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501 + ) return ModelRequestData( engine_args=engine_args, prompt=prompt, @@ -265,15 +263,15 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: mm_processor_kwargs={"max_dynamic_patch": 4}, ) - placeholders = "\n".join(f"Image-{i}: <image>\n" - for i, _ in enumerate(image_urls, start=1)) - messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}] + placeholders = "\n".join( + f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1) + ) + messages = [{"role": "user", "content": f"{placeholders}\n{question}"}] - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - prompt 
= tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + prompt = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) # Stop tokens for InternVL # models variants may have different stop tokens @@ -301,23 +299,21 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData: ) placeholders = [{"type": "image", "image": url} for url in image_urls] - messages = [{ - "role": - "user", - "content": [ - *placeholders, - { - "type": "text", - "text": question - }, - ], - }] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + } + ] processor = AutoProcessor.from_pretrained(model_name) - prompt = processor.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) return ModelRequestData( engine_args=engine_args, @@ -338,24 +334,21 @@ def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData: ) placeholders = [{"type": "image", "image": url} for url in image_urls] - messages = [{ - "role": - "user", - "content": [ - *placeholders, - { - "type": "text", - "text": question - }, - ], - }] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + } + ] - processor = AutoProcessor.from_pretrained(model_name, - trust_remote_code=True) + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) - prompt = processor.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) return ModelRequestData( engine_args=engine_args, @@ -419,15 +412,15 @@ def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData: 
mm_processor_kwargs={"max_dynamic_patch": 4}, ) - placeholders = "\n".join(f"Image-{i}: <image>\n" - for i, _ in enumerate(image_urls, start=1)) - messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}] + placeholders = "\n".join( + f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1) + ) + messages = [{"role": "user", "content": f"{placeholders}\n{question}"}] - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + prompt = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) return ModelRequestData( engine_args=engine_args, @@ -449,15 +442,15 @@ def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData: limit_mm_per_prompt={"image": len(image_urls)}, ) - placeholders = "\n".join(f"Image-{i}: <image>\n" - for i, _ in enumerate(image_urls, start=1)) - messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}] + placeholders = "\n".join( + f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1) + ) + messages = [{"role": "user", "content": f"{placeholders}\n{question}"}] - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + prompt = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) return ModelRequestData( engine_args=engine_args, @@ -509,8 +502,9 @@ def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData: limit_mm_per_prompt={"image": len(image_urls)}, mm_processor_kwargs={"num_crops": 4}, ) - placeholders = "\n".join(f"<|image_{i}|>" - for i, _ in enumerate(image_urls, start=1)) + placeholders = 
"\n".join( + f"<|image_{i}|>" for i, _ in enumerate(image_urls, start=1) + ) prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n" return ModelRequestData( @@ -542,8 +536,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData: mm_processor_kwargs={"dynamic_hd": 4}, ) - placeholders = "".join(f"<|image_{i}|>" - for i, _ in enumerate(image_urls, start=1)) + placeholders = "".join(f"<|image_{i}|>" for i, _ in enumerate(image_urls, start=1)) prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>" return ModelRequestData( @@ -554,8 +547,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData: ) -def load_qwen_vl_chat(question: str, - image_urls: list[str]) -> ModelRequestData: +def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "Qwen/Qwen-VL-Chat" engine_args = EngineArgs( model=model_name, @@ -565,24 +557,26 @@ def load_qwen_vl_chat(question: str, hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}, limit_mm_per_prompt={"image": len(image_urls)}, ) - placeholders = "".join(f"Picture {i}: <img></img>\n" - for i, _ in enumerate(image_urls, start=1)) + placeholders = "".join( + f"Picture {i}: <img></img>\n" for i, _ in enumerate(image_urls, start=1) + ) # This model does not have a chat_template attribute on its tokenizer, # so we need to explicitly pass it. 
We use ChatML since it's used in the # generation utils of the model: # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265 - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" # noqa: E501 - messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}] - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True, - chat_template=chat_template) + messages = [{"role": "user", "content": f"{placeholders}\n{question}"}] + prompt = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + chat_template=chat_template, + ) stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] @@ -600,9 +594,11 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData: try: from qwen_vl_utils import process_vision_info except ModuleNotFoundError: - print('WARNING: `qwen-vl-utils` not installed, input images will not ' - 'be automatically resized. You can enable this functionality by ' - '`pip install qwen-vl-utils`.') + print( + "WARNING: `qwen-vl-utils` not installed, input images will not " + "be automatically resized. You can enable this functionality by " + "`pip install qwen-vl-utils`." 
+ ) process_vision_info = None model_name = "Qwen/Qwen2-VL-7B-Instruct" @@ -616,26 +612,22 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData: ) placeholders = [{"type": "image", "image": url} for url in image_urls] - messages = [{ - "role": "system", - "content": "You are a helpful assistant." - }, { - "role": - "user", - "content": [ - *placeholders, - { - "type": "text", - "text": question - }, - ], - }] + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + }, + ] processor = AutoProcessor.from_pretrained(model_name) - prompt = processor.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) if process_vision_info is None: image_data = [fetch_image(url) for url in image_urls] @@ -653,9 +645,11 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: try: from qwen_vl_utils import process_vision_info except ModuleNotFoundError: - print('WARNING: `qwen-vl-utils` not installed, input images will not ' - 'be automatically resized. You can enable this functionality by ' - '`pip install qwen-vl-utils`.') + print( + "WARNING: `qwen-vl-utils` not installed, input images will not " + "be automatically resized. You can enable this functionality by " + "`pip install qwen-vl-utils`." + ) process_vision_info = None model_name = "Qwen/Qwen2.5-VL-3B-Instruct" @@ -668,32 +662,27 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: ) placeholders = [{"type": "image", "image": url} for url in image_urls] - messages = [{ - "role": "system", - "content": "You are a helpful assistant." 
- }, { - "role": - "user", - "content": [ - *placeholders, - { - "type": "text", - "text": question - }, - ], - }] + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + }, + ] processor = AutoProcessor.from_pretrained(model_name) - prompt = processor.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) if process_vision_info is None: image_data = [fetch_image(url) for url in image_urls] else: - image_data, _ = process_vision_info(messages, - return_video_kwargs=False) + image_data, _ = process_vision_info(messages, return_video_kwargs=False) return ModelRequestData( engine_args=engine_args, @@ -726,23 +715,20 @@ model_example_map = { } -def run_generate(model, question: str, image_urls: list[str], - seed: Optional[int]): +def run_generate(model, question: str, image_urls: list[str], seed: Optional[int]): req_data = model_example_map[model](question, image_urls) engine_args = asdict(req_data.engine_args) | {"seed": args.seed} llm = LLM(**engine_args) - sampling_params = SamplingParams(temperature=0.0, - max_tokens=256, - stop_token_ids=req_data.stop_token_ids) + sampling_params = SamplingParams( + temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids + ) outputs = llm.generate( { "prompt": req_data.prompt, - "multi_modal_data": { - "image": req_data.image_data - }, + "multi_modal_data": {"image": req_data.image_data}, }, sampling_params=sampling_params, lora_request=req_data.lora_requests, @@ -755,38 +741,40 @@ def run_generate(model, question: str, image_urls: list[str], print("-" * 50) -def run_chat(model: str, question: str, image_urls: list[str], - seed: Optional[int]): +def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]): req_data = 
model_example_map[model](question, image_urls) # Disable other modalities to save memory default_limits = {"image": 0, "video": 0, "audio": 0} req_data.engine_args.limit_mm_per_prompt = default_limits | dict( - req_data.engine_args.limit_mm_per_prompt or {}) + req_data.engine_args.limit_mm_per_prompt or {} + ) engine_args = asdict(req_data.engine_args) | {"seed": seed} llm = LLM(**engine_args) - sampling_params = SamplingParams(temperature=0.0, - max_tokens=256, - stop_token_ids=req_data.stop_token_ids) + sampling_params = SamplingParams( + temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids + ) outputs = llm.chat( - [{ - "role": - "user", - "content": [ - { - "type": "text", - "text": question, - }, - *({ - "type": "image_url", - "image_url": { - "url": image_url + [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": question, }, - } for image_url in image_urls), - ], - }], + *( + { + "type": "image_url", + "image_url": {"url": image_url}, + } + for image_url in image_urls + ), + ], + } + ], sampling_params=sampling_params, chat_template=req_data.chat_template, lora_request=req_data.lora_requests, @@ -801,32 +789,39 @@ def run_chat(model: str, question: str, image_urls: list[str], def parse_args(): parser = FlexibleArgumentParser( - description='Demo on using vLLM for offline inference with ' - 'vision language models that support multi-image input for text ' - 'generation') - parser.add_argument('--model-type', - '-m', - type=str, - default="phi3_v", - choices=model_example_map.keys(), - help='Huggingface "model_type".') - parser.add_argument("--method", - type=str, - default="generate", - choices=["generate", "chat"], - help="The method to run in `vllm.LLM`.") - parser.add_argument("--seed", - type=int, - default=None, - help="Set the seed when initializing `vllm.LLM`.") + description="Demo on using vLLM for offline inference with " + "vision language models that support multi-image input for text " + "generation" + ) + 
parser.add_argument( + "--model-type", + "-m", + type=str, + default="phi3_v", + choices=model_example_map.keys(), + help='Huggingface "model_type".', + ) + parser.add_argument( + "--method", + type=str, + default="generate", + choices=["generate", "chat"], + help="The method to run in `vllm.LLM`.", + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.", + ) parser.add_argument( "--num-images", "-n", type=int, - choices=list(range(1, - len(IMAGE_URLS) + 1)), # the max number of images + choices=list(range(1, len(IMAGE_URLS) + 1)), # the max number of images default=2, - help="Number of images to use for the demo.") + help="Number of images to use for the demo.", + ) return parser.parse_args() @@ -835,7 +830,7 @@ def main(args: Namespace): method = args.method seed = args.seed - image_urls = IMAGE_URLS[:args.num_images] + image_urls = IMAGE_URLS[: args.num_images] if method == "generate": run_generate(model, QUESTION, image_urls, seed) diff --git a/examples/online_serving/api_client.py b/examples/online_serving/api_client.py index 36079ff11d07e..cc190e91c141d 100644 --- a/examples/online_serving/api_client.py +++ b/examples/online_serving/api_client.py @@ -17,16 +17,15 @@ import requests def clear_line(n: int = 1) -> None: - LINE_UP = '\033[1A' - LINE_CLEAR = '\x1b[2K' + LINE_UP = "\033[1A" + LINE_CLEAR = "\x1b[2K" for _ in range(n): print(LINE_UP, end=LINE_CLEAR, flush=True) -def post_http_request(prompt: str, - api_url: str, - n: int = 1, - stream: bool = False) -> requests.Response: +def post_http_request( + prompt: str, api_url: str, n: int = 1, stream: bool = False +) -> requests.Response: headers = {"User-Agent": "Test Client"} pload = { "prompt": prompt, @@ -35,17 +34,14 @@ def post_http_request(prompt: str, "max_tokens": 16, "stream": stream, } - response = requests.post(api_url, - headers=headers, - json=pload, - stream=stream) + response = requests.post(api_url, headers=headers, json=pload, 
stream=stream) return response def get_streaming_response(response: requests.Response) -> Iterable[list[str]]: - for chunk in response.iter_lines(chunk_size=8192, - decode_unicode=False, - delimiter=b"\n"): + for chunk in response.iter_lines( + chunk_size=8192, decode_unicode=False, delimiter=b"\n" + ): if chunk: data = json.loads(chunk.decode("utf-8")) output = data["text"] diff --git a/examples/online_serving/cohere_rerank_client.py b/examples/online_serving/cohere_rerank_client.py index c2d4ef08ddbbe..e57b94e8805f9 100644 --- a/examples/online_serving/cohere_rerank_client.py +++ b/examples/online_serving/cohere_rerank_client.py @@ -6,6 +6,7 @@ Note that `pip install cohere` is needed to run this example. run: vllm serve BAAI/bge-reranker-base """ + from typing import Union import cohere @@ -16,28 +17,28 @@ model = "BAAI/bge-reranker-base" query = "What is the capital of France?" documents = [ - "The capital of France is Paris", "Reranking is fun!", - "vLLM is an open-source framework for fast AI serving" + "The capital of France is Paris", + "Reranking is fun!", + "vLLM is an open-source framework for fast AI serving", ] -def cohere_rerank(client: Union[Client, ClientV2], model: str, query: str, - documents: list[str]) -> dict: +def cohere_rerank( + client: Union[Client, ClientV2], model: str, query: str, documents: list[str] +) -> dict: return client.rerank(model=model, query=query, documents=documents) def main(): # cohere v1 client - cohere_v1 = cohere.Client(base_url="http://localhost:8000", - api_key="sk-fake-key") + cohere_v1 = cohere.Client(base_url="http://localhost:8000", api_key="sk-fake-key") rerank_v1_result = cohere_rerank(cohere_v1, model, query, documents) print("-" * 50) print("rerank_v1_result:\n", rerank_v1_result) print("-" * 50) # or the v2 - cohere_v2 = cohere.ClientV2("sk-fake-key", - base_url="http://localhost:8000") + cohere_v2 = cohere.ClientV2("sk-fake-key", base_url="http://localhost:8000") rerank_v2_result = cohere_rerank(cohere_v2, 
model, query, documents) print("rerank_v2_result:\n", rerank_v2_result) print("-" * 50) diff --git a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py index c6d26778ee497..2ffba4a7ed3f9 100644 --- a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py +++ b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py @@ -13,6 +13,7 @@ launch this proxy demo through: Note: This demo will be removed once the PDController implemented in PR 15343 (https://github.com/vllm-project/vllm/pull/15343) supports XpYd. """ + import argparse import ipaddress import itertools @@ -26,8 +27,7 @@ from typing import Callable, Optional import aiohttp import requests import uvicorn -from fastapi import (APIRouter, Depends, FastAPI, Header, HTTPException, - Request, status) +from fastapi import APIRouter, Depends, FastAPI, Header, HTTPException, Request, status from fastapi.responses import JSONResponse, StreamingResponse AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) @@ -36,24 +36,24 @@ logging.basicConfig(level=logging.INFO) class SchedulingPolicy(ABC): - @abstractmethod def schedule(self, cycler: itertools.cycle): raise NotImplementedError("Scheduling Proxy is not set.") class Proxy: - def __init__( self, prefill_instances: list[str], decode_instances: list[str], model: str, scheduling_policy: SchedulingPolicy, - custom_create_completion: Optional[Callable[[Request], - StreamingResponse]] = None, - custom_create_chat_completion: Optional[Callable[ - [Request], StreamingResponse]] = None, + custom_create_completion: Optional[ + Callable[[Request], StreamingResponse] + ] = None, + custom_create_chat_completion: Optional[ + Callable[[Request], StreamingResponse] + ] = None, ): self.prefill_instances = prefill_instances self.decode_instances = decode_instances @@ -68,30 +68,30 @@ class Proxy: def setup_routes(self): self.router.post( - "/v1/completions", - dependencies=[ 
- Depends(self.validate_json_request) - ])(self.custom_create_completion if self. - custom_create_completion else self.create_completion) + "/v1/completions", dependencies=[Depends(self.validate_json_request)] + )( + self.custom_create_completion + if self.custom_create_completion + else self.create_completion + ) self.router.post( - "/v1/chat/completions", - dependencies=[ - Depends(self.validate_json_request) - ])(self.custom_create_chat_completion if self. - custom_create_chat_completion else self.create_chat_completion) - self.router.get("/status", - response_class=JSONResponse)(self.get_status) - self.router.post("/instances/add", - dependencies=[Depends(self.api_key_authenticate) - ])(self.add_instance_endpoint) + "/v1/chat/completions", dependencies=[Depends(self.validate_json_request)] + )( + self.custom_create_chat_completion + if self.custom_create_chat_completion + else self.create_chat_completion + ) + self.router.get("/status", response_class=JSONResponse)(self.get_status) + self.router.post( + "/instances/add", dependencies=[Depends(self.api_key_authenticate)] + )(self.add_instance_endpoint) async def validate_json_request(self, raw_request: Request): content_type = raw_request.headers.get("content-type", "").lower() if content_type != "application/json": raise HTTPException( status_code=415, - detail= - "Unsupported Media Type: Only 'application/json' is allowed", + detail="Unsupported Media Type: Only 'application/json' is allowed", ) def api_key_authenticate(self, x_api_key: str = Header(...)): @@ -103,8 +103,7 @@ class Proxy: detail="Server configuration error.", ) if x_api_key != expected_api_key: - logger.warning("Unauthorized access attempt with API Key: %s", - x_api_key) + logger.warning("Unauthorized access attempt with API Key: %s", x_api_key) raise HTTPException( status_code=status.HTTP_403_FORBIDDEN, detail="Forbidden: Invalid API Key.", @@ -113,8 +112,7 @@ class Proxy: async def validate_instance(self, instance: str) -> bool: url = 
f"http://{instance}/v1/models" try: - async with aiohttp.ClientSession( - timeout=AIOHTTP_TIMEOUT) as client: + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as client: logger.info("Verifying %s ...", instance) async with client.get(url) as response: if response.status == 200: @@ -122,12 +120,15 @@ class Proxy: if "data" in data and len(data["data"]) > 0: model_cur = data["data"][0].get("id", "") if model_cur == self.model: - logger.info("Instance: %s could be added.", - instance) + logger.info("Instance: %s could be added.", instance) return True else: - logger.warning("Mismatch model %s : %s != %s", - instance, model_cur, self.model) + logger.warning( + "Mismatch model %s : %s != %s", + instance, + model_cur, + self.model, + ) return False else: return False @@ -147,48 +148,47 @@ class Proxy: instance_type = data.get("type") instance = data.get("instance") if instance_type not in ["prefill", "decode"]: - raise HTTPException(status_code=400, - detail="Invalid instance type.") + raise HTTPException(status_code=400, detail="Invalid instance type.") if not instance or ":" not in instance: - raise HTTPException(status_code=400, - detail="Invalid instance format.") + raise HTTPException(status_code=400, detail="Invalid instance format.") host, port_str = instance.split(":") try: if host != "localhost": ipaddress.ip_address(host) port = int(port_str) if not (0 < port < 65536): - raise HTTPException(status_code=400, - detail="Invalid port number.") + raise HTTPException(status_code=400, detail="Invalid port number.") except Exception as e: - raise HTTPException(status_code=400, - detail="Invalid instance address.") from e + raise HTTPException( + status_code=400, detail="Invalid instance address." + ) from e is_valid = await self.validate_instance(instance) if not is_valid: - raise HTTPException(status_code=400, - detail="Instance validation failed.") + raise HTTPException( + status_code=400, detail="Instance validation failed." 
+ ) if instance_type == "prefill": if instance not in self.prefill_instances: self.prefill_instances.append(instance) - self.prefill_cycler = itertools.cycle( - self.prefill_instances) + self.prefill_cycler = itertools.cycle(self.prefill_instances) else: - raise HTTPException(status_code=400, - detail="Instance already exists.") + raise HTTPException( + status_code=400, detail="Instance already exists." + ) else: if instance not in self.decode_instances: self.decode_instances.append(instance) self.decode_cycler = itertools.cycle(self.decode_instances) else: - raise HTTPException(status_code=400, - detail="Instance already exists.") + raise HTTPException( + status_code=400, detail="Instance already exists." + ) - return JSONResponse(content={ - "message": - f"Added {instance} to {instance_type}_instances." - }) + return JSONResponse( + content={"message": f"Added {instance} to {instance_type}_instances."} + ) except HTTPException as http_exc: raise http_exc except Exception as e: @@ -197,16 +197,16 @@ class Proxy: async def forward_request(self, url, data, use_chunked=True): async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" - } + headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} try: - async with session.post(url=url, json=data, - headers=headers) as response: + async with session.post( + url=url, json=data, headers=headers + ) as response: if 200 <= response.status < 300 or 400 <= response.status < 500: # noqa: E501 if use_chunked: async for chunk_bytes in response.content.iter_chunked( # noqa: E501 - 1024): + 1024 + ): yield chunk_bytes else: content = await response.read() @@ -217,20 +217,21 @@ class Proxy: error_content = json.loads(error_content) except json.JSONDecodeError: error_content = error_content - logger.error("Request failed with status %s: %s", - response.status, error_content) + logger.error( + "Request failed with status %s: 
%s", + response.status, + error_content, + ) raise HTTPException( status_code=response.status, - detail= - f"Request failed with status {response.status}: " + detail=f"Request failed with status {response.status}: " f"{error_content}", ) except aiohttp.ClientError as e: logger.error("ClientError occurred: %s", str(e)) raise HTTPException( status_code=502, - detail= - "Bad Gateway: Error communicating with upstream server.", + detail="Bad Gateway: Error communicating with upstream server.", ) from e except Exception as e: logger.error("Unexpected error: %s", str(e)) @@ -258,8 +259,8 @@ class Proxy: prefill_instance = self.schedule(self.prefill_cycler) try: async for _ in self.forward_request( - f"http://{prefill_instance}/v1/completions", - kv_prepare_request): + f"http://{prefill_instance}/v1/completions", kv_prepare_request + ): continue except HTTPException as http_exc: self.remove_instance_endpoint("prefill", prefill_instance) @@ -270,7 +271,8 @@ class Proxy: try: generator = self.forward_request( - f"http://{decode_instance}/v1/completions", request) + f"http://{decode_instance}/v1/completions", request + ) except HTTPException as http_exc: self.remove_instance_endpoint("decode", decode_instance) raise http_exc @@ -295,8 +297,8 @@ class Proxy: prefill_instance = self.schedule(self.prefill_cycler) try: async for _ in self.forward_request( - f"http://{prefill_instance}/v1/chat/completions", - kv_prepare_request): + f"http://{prefill_instance}/v1/chat/completions", kv_prepare_request + ): continue except HTTPException as http_exc: self.remove_instance_endpoint("prefill", prefill_instance) @@ -306,8 +308,8 @@ class Proxy: try: generator = self.forward_request( - "http://" + decode_instance + "/v1/chat/completions", - request) + "http://" + decode_instance + "/v1/chat/completions", request + ) except HTTPException as http_exc: self.remove_instance_endpoint("decode", decode_instance) raise http_exc @@ -318,20 +320,20 @@ class Proxy: error_messages = [str(e) for e in 
exc_info if e] print("Error occurred in disagg proxy server") print(error_messages) - return StreamingResponse(content=iter(error_messages), - media_type="text/event-stream") + return StreamingResponse( + content=iter(error_messages), media_type="text/event-stream" + ) def remove_instance_endpoint(self, instance_type, instance): - if (instance_type == "decode" and instance in self.decode_instances): + if instance_type == "decode" and instance in self.decode_instances: self.decode_instances.remove(instance) self.decode_cycler = itertools.cycle(self.decode_instances) - if (instance_type == "prefill" and instance in self.decode_instances): + if instance_type == "prefill" and instance in self.decode_instances: self.prefill_instances.remove(instance) self.prefill_cycler = itertools.cycle(self.decode_instances) class RoundRobinSchedulingPolicy(SchedulingPolicy): - def __init__(self): super().__init__() @@ -340,15 +342,12 @@ class RoundRobinSchedulingPolicy(SchedulingPolicy): class ProxyServer: - def __init__( self, args: argparse.Namespace, scheduling_policy: Optional[SchedulingPolicy] = None, - create_completion: Optional[Callable[[Request], - StreamingResponse]] = None, - create_chat_completion: Optional[Callable[[Request], - StreamingResponse]] = None, + create_completion: Optional[Callable[[Request], StreamingResponse]] = None, + create_chat_completion: Optional[Callable[[Request], StreamingResponse]] = None, ): self.validate_parsed_serve_args(args) self.port = args.port @@ -356,8 +355,11 @@ class ProxyServer: prefill_instances=[] if args.prefill is None else args.prefill, decode_instances=[] if args.decode is None else args.decode, model=args.model, - scheduling_policy=(scheduling_policy if scheduling_policy - is not None else RoundRobinSchedulingPolicy()), + scheduling_policy=( + scheduling_policy + if scheduling_policy is not None + else RoundRobinSchedulingPolicy() + ), custom_create_completion=create_completion, 
custom_create_chat_completion=create_chat_completion, ) @@ -382,11 +384,9 @@ class ProxyServer: ipaddress.ip_address(host) port = int(port) if not (0 < port < 65536): - raise ValueError( - f"Invalid port number in instance: {instance}") + raise ValueError(f"Invalid port number in instance: {instance}") except Exception as e: - raise ValueError( - f"Invalid instance {instance}: {str(e)}") from e + raise ValueError(f"Invalid instance {instance}: {str(e)}") from e def verify_model_config(self, instances: list, model: str) -> None: model_suffix = model.split("/")[-1] @@ -399,12 +399,14 @@ class ProxyServer: if model_cur_suffix != model_suffix: raise ValueError( f"{instance} serves a different model: " - f"{model_cur} != {model}") + f"{model_cur} != {model}" + ) else: raise ValueError(f"Cannot get model id from {instance}!") except requests.RequestException as e: raise ValueError( - f"Error communicating with {instance}: {str(e)}") from e + f"Error communicating with {instance}: {str(e)}" + ) from e def run_server(self): app = FastAPI() @@ -417,11 +419,7 @@ class ProxyServer: def parse_args(): # Todo: allow more config parser = argparse.ArgumentParser("vLLM disaggregated proxy server.") - parser.add_argument("--model", - "-m", - type=str, - required=True, - help="Model name") + parser.add_argument("--model", "-m", type=str, required=True, help="Model name") parser.add_argument( "--prefill", diff --git a/examples/online_serving/gradio_openai_chatbot_webserver.py b/examples/online_serving/gradio_openai_chatbot_webserver.py index 314f1c5b73951..3f2a3d01b4563 100644 --- a/examples/online_serving/gradio_openai_chatbot_webserver.py +++ b/examples/online_serving/gradio_openai_chatbot_webserver.py @@ -17,6 +17,7 @@ you can install it manually by following these steps: 2. Rename the downloaded file to: frpc_linux_amd64_v0.3 3. 
Move the file to this location: /home/user/.cache/huggingface/gradio/frpc """ + import argparse import gradio as gr @@ -24,16 +25,12 @@ from openai import OpenAI def format_history_to_openai(history): - history_openai_format = [{ - "role": "system", - "content": "You are a great AI assistant." - }] + history_openai_format = [ + {"role": "system", "content": "You are a great AI assistant."} + ] for human, assistant in history: history_openai_format.append({"role": "user", "content": human}) - history_openai_format.append({ - "role": "assistant", - "content": assistant - }) + history_openai_format.append({"role": "assistant", "content": assistant}) return history_openai_format @@ -49,17 +46,17 @@ def predict(message, history, client, model_name, temp, stop_token_ids): temperature=temp, stream=True, extra_body={ - 'repetition_penalty': - 1, - 'stop_token_ids': - [int(id.strip()) - for id in stop_token_ids.split(',')] if stop_token_ids else [] - }) + "repetition_penalty": 1, + "stop_token_ids": [int(id.strip()) for id in stop_token_ids.split(",")] + if stop_token_ids + else [], + }, + ) # Collect all chunks and concatenate them into a full message full_message = "" for chunk in stream: - full_message += (chunk.choices[0].delta.content or "") + full_message += chunk.choices[0].delta.content or "" # Return the full message as a single response return full_message @@ -67,38 +64,34 @@ def predict(message, history, client, model_name, temp, stop_token_ids): def parse_args(): parser = argparse.ArgumentParser( - description='Chatbot Interface with Customizable Parameters') - parser.add_argument('--model-url', - type=str, - default='http://localhost:8000/v1', - help='Model URL') - parser.add_argument('-m', - '--model', - type=str, - required=True, - help='Model name for the chatbot') - parser.add_argument('--temp', - type=float, - default=0.8, - help='Temperature for text generation') - parser.add_argument('--stop-token-ids', - type=str, - default='', - help='Comma-separated 
stop token IDs') + description="Chatbot Interface with Customizable Parameters" + ) + parser.add_argument( + "--model-url", type=str, default="http://localhost:8000/v1", help="Model URL" + ) + parser.add_argument( + "-m", "--model", type=str, required=True, help="Model name for the chatbot" + ) + parser.add_argument( + "--temp", type=float, default=0.8, help="Temperature for text generation" + ) + parser.add_argument( + "--stop-token-ids", type=str, default="", help="Comma-separated stop token IDs" + ) parser.add_argument("--host", type=str, default=None) parser.add_argument("--port", type=int, default=8001) return parser.parse_args() def build_gradio_interface(client, model_name, temp, stop_token_ids): - def chat_predict(message, history): - return predict(message, history, client, model_name, temp, - stop_token_ids) + return predict(message, history, client, model_name, temp, stop_token_ids) - return gr.ChatInterface(fn=chat_predict, - title="Chatbot Interface", - description="A simple chatbot powered by vLLM") + return gr.ChatInterface( + fn=chat_predict, + title="Chatbot Interface", + description="A simple chatbot powered by vLLM", + ) def main(): @@ -113,12 +106,13 @@ def main(): client = OpenAI(api_key=openai_api_key, base_url=openai_api_base) # Define the Gradio chatbot interface using the predict function - gradio_interface = build_gradio_interface(client, args.model, args.temp, - args.stop_token_ids) + gradio_interface = build_gradio_interface( + client, args.model, args.temp, args.stop_token_ids + ) - gradio_interface.queue().launch(server_name=args.host, - server_port=args.port, - share=True) + gradio_interface.queue().launch( + server_name=args.host, server_port=args.port, share=True + ) if __name__ == "__main__": diff --git a/examples/online_serving/gradio_webserver.py b/examples/online_serving/gradio_webserver.py index 2e7c2a0c5838c..fd341ff493b56 100644 --- a/examples/online_serving/gradio_webserver.py +++ 
b/examples/online_serving/gradio_webserver.py @@ -17,6 +17,7 @@ you can install it manually by following these steps: 2. Rename the downloaded file to: frpc_linux_amd64_v0.3 3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc """ + import argparse import json @@ -31,14 +32,11 @@ def http_bot(prompt): "stream": True, "max_tokens": 128, } - response = requests.post(args.model_url, - headers=headers, - json=pload, - stream=True) + response = requests.post(args.model_url, headers=headers, json=pload, stream=True) - for chunk in response.iter_lines(chunk_size=8192, - decode_unicode=False, - delimiter=b"\n"): + for chunk in response.iter_lines( + chunk_size=8192, decode_unicode=False, delimiter=b"\n" + ): if chunk: data = json.loads(chunk.decode("utf-8")) output = data["text"][0] @@ -48,10 +46,10 @@ def http_bot(prompt): def build_demo(): with gr.Blocks() as demo: gr.Markdown("# vLLM text completion demo\n") - inputbox = gr.Textbox(label="Input", - placeholder="Enter text and press ENTER") - outputbox = gr.Textbox(label="Output", - placeholder="Generated result from the model") + inputbox = gr.Textbox(label="Input", placeholder="Enter text and press ENTER") + outputbox = gr.Textbox( + label="Output", placeholder="Generated result from the model" + ) inputbox.submit(http_bot, [inputbox], [outputbox]) return demo @@ -60,17 +58,15 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default=None) parser.add_argument("--port", type=int, default=8001) - parser.add_argument("--model-url", - type=str, - default="http://localhost:8000/generate") + parser.add_argument( + "--model-url", type=str, default="http://localhost:8000/generate" + ) return parser.parse_args() def main(args): demo = build_demo() - demo.queue().launch(server_name=args.host, - server_port=args.port, - share=True) + demo.queue().launch(server_name=args.host, server_port=args.port, share=True) if __name__ == "__main__": diff --git 
a/examples/online_serving/jinaai_rerank_client.py b/examples/online_serving/jinaai_rerank_client.py index 3076bba765ce5..7eb3d2193f41b 100644 --- a/examples/online_serving/jinaai_rerank_client.py +++ b/examples/online_serving/jinaai_rerank_client.py @@ -5,6 +5,7 @@ Jina and Cohere https://jina.ai/reranker run: vllm serve BAAI/bge-reranker-base """ + import json import requests @@ -14,14 +15,13 @@ url = "http://127.0.0.1:8000/rerank" headers = {"accept": "application/json", "Content-Type": "application/json"} data = { - "model": - "BAAI/bge-reranker-base", - "query": - "What is the capital of France?", + "model": "BAAI/bge-reranker-base", + "query": "What is the capital of France?", "documents": [ "The capital of Brazil is Brasilia.", - "The capital of France is Paris.", "Horses and cows are both animals" - ] + "The capital of France is Paris.", + "Horses and cows are both animals", + ], } diff --git a/examples/online_serving/kv_events_subscriber.py b/examples/online_serving/kv_events_subscriber.py index 88bbbebd74787..65d74dccab807 100644 --- a/examples/online_serving/kv_events_subscriber.py +++ b/examples/online_serving/kv_events_subscriber.py @@ -9,17 +9,14 @@ from msgspec.msgpack import Decoder # # Types copied from vllm.distributed.kv_events # -class EventBatch(msgspec.Struct, array_like=True, omit_defaults=True, - gc=False): +class EventBatch(msgspec.Struct, array_like=True, omit_defaults=True, gc=False): ts: float events: list[Any] -class KVCacheEvent(msgspec.Struct, - array_like=True, - omit_defaults=True, - gc=False, - tag=True): +class KVCacheEvent( + msgspec.Struct, array_like=True, omit_defaults=True, gc=False, tag=True +): """Base class for all KV cache-related events""" @@ -77,8 +74,9 @@ def main(): if last_seq >= 0 and seq > last_seq + 1: missed = seq - last_seq - 1 - print(f"Missed {missed} messages" - f" (last: {last_seq}, current: {seq})") + print( + f"Missed {missed} messages (last: {last_seq}, current: {seq})" + ) replay.send((last_seq + 
1).to_bytes(8, "big")) diff --git a/examples/online_serving/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py index 74e0c045d6214..2856e3be3e2dd 100644 --- a/examples/online_serving/openai_chat_completion_client.py +++ b/examples/online_serving/openai_chat_completion_client.py @@ -3,28 +3,35 @@ NOTE: start a supported chat completion model server with `vllm serve`, e.g. vllm serve meta-llama/Llama-2-7b-chat-hf """ + +import argparse + from openai import OpenAI # Modify OpenAI's API key and API base to use vLLM's API server. openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -messages = [{ - "role": "system", - "content": "You are a helpful assistant." -}, { - "role": "user", - "content": "Who won the world series in 2020?" -}, { - "role": "assistant", - "content": "The Los Angeles Dodgers won the World Series in 2020." -}, { - "role": "user", - "content": "Where was it played?" -}] +messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Who won the world series in 2020?"}, + { + "role": "assistant", + "content": "The Los Angeles Dodgers won the World Series in 2020.", + }, + {"role": "user", "content": "Where was it played?"}, +] -def main(): +def parse_args(): + parser = argparse.ArgumentParser(description="Client for vLLM API server") + parser.add_argument( + "--stream", action="store_true", help="Enable streaming response" + ) + return parser.parse_args() + + +def main(args): client = OpenAI( # defaults to os.environ.get("OPENAI_API_KEY") api_key=openai_api_key, @@ -34,16 +41,23 @@ def main(): models = client.models.list() model = models.data[0].id + # Chat Completion API chat_completion = client.chat.completions.create( messages=messages, model=model, + stream=args.stream, ) print("-" * 50) print("Chat completion results:") - print(chat_completion) + if args.stream: + for c in chat_completion: + print(c) + else: + print(chat_completion) print("-" * 
50) if __name__ == "__main__": - main() + args = parse_args() + main(args) diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py index 2707d46f46e2a..8c3c6ecdd4b01 100644 --- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -43,7 +43,7 @@ def encode_base64_content_from_url(content_url: str) -> str: with requests.get(content_url) as response: response.raise_for_status() - result = base64.b64encode(response.content).decode('utf-8') + result = base64.b64encode(response.content).decode("utf-8") return result @@ -51,10 +51,7 @@ def encode_base64_content_from_url(content_url: str) -> str: # Text-only inference def run_text_only(model: str) -> None: chat_completion = client.chat.completions.create( - messages=[{ - "role": "user", - "content": "What's the capital of France?" - }], + messages=[{"role": "user", "content": "What's the capital of France?"}], model=model, max_completion_tokens=64, ) @@ -65,26 +62,21 @@ def run_text_only(model: str) -> None: # Single-image input inference def run_single_image(model: str) -> None: - ## Use image url in the payload image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" chat_completion_from_url = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's in this image?" 
- }, - { - "type": "image_url", - "image_url": { - "url": image_url + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + { + "type": "image_url", + "image_url": {"url": image_url}, }, - }, - ], - }], + ], + } + ], model=model, max_completion_tokens=64, ) @@ -95,22 +87,18 @@ def run_single_image(model: str) -> None: ## Use base64 encoded image in the payload image_base64 = encode_base64_content_from_url(image_url) chat_completion_from_base64 = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{image_base64}" + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}, }, - }, - ], - }], + ], + } + ], model=model, max_completion_tokens=64, ) @@ -124,28 +112,22 @@ def run_multi_image(model: str) -> None: image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" chat_completion_from_url = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What are the animals in these images?" 
- }, - { - "type": "image_url", - "image_url": { - "url": image_url_duck + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What are the animals in these images?"}, + { + "type": "image_url", + "image_url": {"url": image_url_duck}, }, - }, - { - "type": "image_url", - "image_url": { - "url": image_url_lion + { + "type": "image_url", + "image_url": {"url": image_url_lion}, }, - }, - ], - }], + ], + } + ], model=model, max_completion_tokens=64, ) @@ -161,22 +143,18 @@ def run_video(model: str) -> None: ## Use video url in the payload chat_completion_from_url = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's in this video?" - }, - { - "type": "video_url", - "video_url": { - "url": video_url + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this video?"}, + { + "type": "video_url", + "video_url": {"url": video_url}, }, - }, - ], - }], + ], + } + ], model=model, max_completion_tokens=64, ) @@ -186,22 +164,18 @@ def run_video(model: str) -> None: ## Use base64 encoded video in the payload chat_completion_from_base64 = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's in this video?" - }, - { - "type": "video_url", - "video_url": { - "url": f"data:video/mp4;base64,{video_base64}" + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this video?"}, + { + "type": "video_url", + "video_url": {"url": f"data:video/mp4;base64,{video_base64}"}, }, - }, - ], - }], + ], + } + ], model=model, max_completion_tokens=64, ) @@ -219,24 +193,22 @@ def run_audio(model: str) -> None: # OpenAI-compatible schema (`input_audio`) chat_completion_from_base64 = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's in this audio?" 
- }, - { - "type": "input_audio", - "input_audio": { - # Any format supported by librosa is supported - "data": audio_base64, - "format": "wav" + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this audio?"}, + { + "type": "input_audio", + "input_audio": { + # Any format supported by librosa is supported + "data": audio_base64, + "format": "wav", + }, }, - }, - ], - }], + ], + } + ], model=model, max_completion_tokens=64, ) @@ -246,23 +218,21 @@ def run_audio(model: str) -> None: # HTTP URL chat_completion_from_url = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's in this audio?" - }, - { - "type": "audio_url", - "audio_url": { - # Any format supported by librosa is supported - "url": audio_url + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this audio?"}, + { + "type": "audio_url", + "audio_url": { + # Any format supported by librosa is supported + "url": audio_url + }, }, - }, - ], - }], + ], + } + ], model=model, max_completion_tokens=64, ) @@ -272,23 +242,21 @@ def run_audio(model: str) -> None: # base64 URL chat_completion_from_base64 = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's in this audio?" 
- }, - { - "type": "audio_url", - "audio_url": { - # Any format supported by librosa is supported - "url": f"data:audio/ogg;base64,{audio_base64}" + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this audio?"}, + { + "type": "audio_url", + "audio_url": { + # Any format supported by librosa is supported + "url": f"data:audio/ogg;base64,{audio_base64}" + }, }, - }, - ], - }], + ], + } + ], model=model, max_completion_tokens=64, ) @@ -308,14 +276,17 @@ example_function_map = { def parse_args(): parser = FlexibleArgumentParser( - description='Demo on using OpenAI client for online serving with ' - 'multimodal language models served with vLLM.') - parser.add_argument('--chat-type', - '-c', - type=str, - default="single-image", - choices=list(example_function_map.keys()), - help='Conversation type with multimodal data.') + description="Demo on using OpenAI client for online serving with " + "multimodal language models served with vLLM." + ) + parser.add_argument( + "--chat-type", + "-c", + type=str, + default="single-image", + choices=list(example_function_map.keys()), + help="Conversation type with multimodal data.", + ) return parser.parse_args() diff --git a/examples/online_serving/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py index 94f9c15705864..a0d7841f644fc 100644 --- a/examples/online_serving/openai_chat_completion_client_with_tools.py +++ b/examples/online_serving/openai_chat_completion_client_with_tools.py @@ -16,6 +16,7 @@ vllm serve NousResearch/Hermes-2-Pro-Llama-3-8B \ --chat-template examples/tool_chat_template_hermes.jinja \ --enable-auto-tool-choice --tool-call-parser hermes """ + import json from typing import Any @@ -25,55 +26,55 @@ from openai import OpenAI openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -tools = [{ - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current 
weather in a given location", - "parameters": { - "type": "object", - "properties": { - "city": { - "type": - "string", - "description": - "The city to find the weather for, e.g. 'San Francisco'" - }, - "state": { - "type": - "string", - "description": - "the two-letter abbreviation for the state that the city is" - " in, e.g. 'CA' which would mean 'California'" - }, - "unit": { - "type": "string", - "description": "The unit to fetch the temperature in", - "enum": ["celsius", "fahrenheit"] - } +properties = { + "city": { + "type": "string", + "description": "The city to find the weather for, e.g. 'San Francisco'", + }, + "state": { + "type": "string", + "description": "the two-letter abbreviation for the state that the city is" + " in, e.g. 'CA' which would mean 'California'", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, +} + +tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": properties, + "required": ["city", "state", "unit"], }, - "required": ["city", "state", "unit"] - } + }, } -}] +] -messages = [{ - "role": "user", - "content": "Hi! How are you doing today?" -}, { - "role": "assistant", - "content": "I'm doing well! How can I help you?" -}, { - "role": - "user", - "content": - "Can you tell me what the temperate will be in Dallas, in fahrenheit?" -}] +messages = [ + {"role": "user", "content": "Hi! How are you doing today?"}, + {"role": "assistant", "content": "I'm doing well! How can I help you?"}, + { + "role": "user", + "content": ( + "Can you tell me what the temperate will be in Dallas, in fahrenheit?" + ), + }, +] -def get_current_weather(city: str, state: str, unit: 'str'): - return ("The weather in Dallas, Texas is 85 degrees fahrenheit. 
It is " - "partly cloudly, with highs in the 90's.") +def get_current_weather(city: str, state: str, unit: "str"): + return ( + "The weather in Dallas, Texas is 85 degrees fahrenheit. It is " + "partly cloudly, with highs in the 90's." + ) def handle_tool_calls_stream( @@ -82,10 +83,9 @@ def handle_tool_calls_stream( model: str, tools: list[dict[str, Any]], ) -> list[Any]: - tool_calls_stream = client.chat.completions.create(messages=messages, - model=model, - tools=tools, - stream=True) + tool_calls_stream = client.chat.completions.create( + messages=messages, model=model, tools=tools, stream=True + ) chunks = [] print("chunks: ") for chunk in tool_calls_stream: @@ -106,8 +106,7 @@ def handle_tool_calls_arguments(chunks: list[Any]) -> list[str]: tool_call = chunk.choices[0].delta.tool_calls[0] if tool_call.index != tool_call_idx: if tool_call_idx >= 0: - print(f"streamed tool call arguments: " - f"{arguments[tool_call_idx]}") + print(f"streamed tool call arguments: {arguments[tool_call_idx]}") tool_call_idx = chunk.choices[0].delta.tool_calls[0].index arguments.append("") if tool_call.id: @@ -115,8 +114,7 @@ def handle_tool_calls_arguments(chunks: list[Any]) -> list[str]: if tool_call.function: if tool_call.function.name: - print( - f"streamed tool call name: {tool_call.function.name}") + print(f"streamed tool call name: {tool_call.function.name}") if tool_call.function.arguments: arguments[tool_call_idx] += tool_call.function.arguments @@ -136,9 +134,9 @@ def main(): models = client.models.list() model = models.data[0].id - chat_completion = client.chat.completions.create(messages=messages, - model=model, - tools=tools) + chat_completion = client.chat.completions.create( + messages=messages, model=model, tools=tools + ) print("-" * 70) print("Chat completion results:") @@ -158,10 +156,12 @@ def main(): print("-" * 70) # Add tool call results to the conversation - messages.append({ - "role": "assistant", - "tool_calls": 
chat_completion.choices[0].message.tool_calls - }) + messages.append( + { + "role": "assistant", + "tool_calls": chat_completion.choices[0].message.tool_calls, + } + ) # Now, simulate a tool call available_tools = {"get_current_weather": get_current_weather} @@ -172,17 +172,18 @@ def main(): args = json.loads(call.function.arguments) result = tool_to_call(**args) print("tool_to_call result: ", result) - messages.append({ - "role": "tool", - "content": result, - "tool_call_id": call.id, - "name": call.function.name - }) + messages.append( + { + "role": "tool", + "content": result, + "tool_call_id": call.id, + "name": call.function.name, + } + ) - chat_completion_2 = client.chat.completions.create(messages=messages, - model=model, - tools=tools, - stream=False) + chat_completion_2 = client.chat.completions.create( + messages=messages, model=model, tools=tools, stream=False + ) print("Chat completion2 results:") print(chat_completion_2) print("-" * 70) diff --git a/examples/online_serving/openai_chat_completion_client_with_tools_required.py b/examples/online_serving/openai_chat_completion_client_with_tools_required.py index 97d900bb75f1a..45c4232fe1dea 100644 --- a/examples/online_serving/openai_chat_completion_client_with_tools_required.py +++ b/examples/online_serving/openai_chat_completion_client_with_tools_required.py @@ -28,18 +28,16 @@ tools = [ "type": "object", "properties": { "city": { - "type": - "string", - "description": - "The city to find the weather for" + "type": "string", + "description": "The city to find the weather for" ", e.g. 'San Francisco'", }, "state": { - "type": - "string", - "description": - "the two-letter abbreviation for the state that the " - "city is in, e.g. 'CA' which would mean 'California'", + "type": "string", + "description": ( + "the two-letter abbreviation for the state that the " + "city is in, e.g. 
'CA' which would mean 'California'" + ), }, "unit": { "type": "string", @@ -60,22 +58,20 @@ tools = [ "type": "object", "properties": { "city": { - "type": - "string", - "description": - "The city to get the forecast for, e.g. 'New York'", + "type": "string", + "description": ( + "The city to get the forecast for, e.g. 'New York'" + ), }, "state": { - "type": - "string", - "description": - "The two-letter abbreviation for the state, e.g. 'NY'", + "type": "string", + "description": ( + "The two-letter abbreviation for the state, e.g. 'NY'" + ), }, "days": { - "type": - "integer", - "description": - "Number of days to get the forecast for (1-7)", + "type": "integer", + "description": "Number of days to get the forecast for (1-7)", }, "unit": { "type": "string", @@ -90,19 +86,11 @@ tools = [ ] messages = [ + {"role": "user", "content": "Hi! How are you doing today?"}, + {"role": "assistant", "content": "I'm doing well! How can I help you?"}, { "role": "user", - "content": "Hi! How are you doing today?" - }, - { - "role": "assistant", - "content": "I'm doing well! How can I help you?" 
- }, - { - "role": - "user", - "content": - "Can you tell me what the current weather is in Dallas \ + "content": "Can you tell me what the current weather is in Dallas \ and the forecast for the next 5 days, in fahrenheit?", }, ] @@ -123,17 +111,16 @@ def main(): model=model, tools=tools, tool_choice="required", - stream=True # Enable streaming response + stream=True, # Enable streaming response ) for chunk in chat_completion: if chunk.choices and chunk.choices[0].delta.tool_calls: print(chunk.choices[0].delta.tool_calls) - chat_completion = client.chat.completions.create(messages=messages, - model=model, - tools=tools, - tool_choice="required") + chat_completion = client.chat.completions.create( + messages=messages, model=model, tools=tools, tool_choice="required" + ) print(chat_completion.choices[0].message.tool_calls) diff --git a/examples/online_serving/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py index 660369e55d40e..a4134ea43c4b3 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs.py +++ b/examples/online_serving/openai_chat_completion_structured_outputs.py @@ -12,15 +12,17 @@ from enum import Enum from openai import BadRequestError, OpenAI from pydantic import BaseModel +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + # Guided decoding by Choice (list of possible options) def guided_choice_completion(client: OpenAI, model: str): completion = client.chat.completions.create( model=model, - messages=[{ - "role": "user", - "content": "Classify this sentiment: vLLM is wonderful!" 
- }], + messages=[ + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + ], extra_body={"guided_choice": ["positive", "negative"]}, ) return completion.choices[0].message.content @@ -28,20 +30,21 @@ def guided_choice_completion(client: OpenAI, model: str): # Guided decoding by Regex def guided_regex_completion(client: OpenAI, model: str): - prompt = ("Generate an email address for Alan Turing, who works in Enigma." - "End in .com and new line. Example result:" - "alan.turing@enigma.com\n") + prompt = ( + "Generate an email address for Alan Turing, who works in Enigma." + "End in .com and new line. Example result:" + "alan.turing@enigma.com\n" + ) completion = client.chat.completions.create( model=model, - messages=[{ - "role": "user", - "content": prompt, - }], - extra_body={ - "guided_regex": r"\w+@\w+\.com\n", - "stop": ["\n"] - }, + messages=[ + { + "role": "user", + "content": prompt, + } + ], + extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]}, ) return completion.choices[0].message.content @@ -63,14 +66,18 @@ class CarDescription(BaseModel): def guided_json_completion(client: OpenAI, model: str): json_schema = CarDescription.model_json_schema() - prompt = ("Generate a JSON with the brand, model and car_type of" - "the most iconic car from the 90's") + prompt = ( + "Generate a JSON with the brand, model and car_type of" + "the most iconic car from the 90's" + ) completion = client.chat.completions.create( model=model, - messages=[{ - "role": "user", - "content": prompt, - }], + messages=[ + { + "role": "user", + "content": prompt, + } + ], extra_body={"guided_json": json_schema}, ) return completion.choices[0].message.content @@ -92,14 +99,18 @@ def guided_grammar_completion(client: OpenAI, model: str): number ::= "1 " | "2 " """ - prompt = ("Generate an SQL query to show the 'username' and 'email'" - "from the 'users' table.") + prompt = ( + "Generate an SQL query to show the 'username' and 'email'" + "from the 'users' 
table." + ) completion = client.chat.completions.create( model=model, - messages=[{ - "role": "user", - "content": prompt, - }], + messages=[ + { + "role": "user", + "content": prompt, + } + ], extra_body={"guided_grammar": simplified_sql_grammar}, ) return completion.choices[0].message.content @@ -107,19 +118,23 @@ def guided_grammar_completion(client: OpenAI, model: str): # Extra backend options def extra_backend_options_completion(client: OpenAI, model: str): - prompt = ("Generate an email address for Alan Turing, who works in Enigma." - "End in .com and new line. Example result:" - "alan.turing@enigma.com\n") + prompt = ( + "Generate an email address for Alan Turing, who works in Enigma." + "End in .com and new line. Example result:" + "alan.turing@enigma.com\n" + ) try: # The guided_decoding_disable_fallback option forces vLLM to use # xgrammar, so when it fails you get a 400 with the reason why completion = client.chat.completions.create( model=model, - messages=[{ - "role": "user", - "content": prompt, - }], + messages=[ + { + "role": "user", + "content": prompt, + } + ], extra_body={ "guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"], @@ -134,8 +149,8 @@ def extra_backend_options_completion(client: OpenAI, model: str): def main(): client: OpenAI = OpenAI( - base_url="http://localhost:8000/v1", - api_key="-", + base_url=openai_api_base, + api_key=openai_api_key, ) model = client.models.list().data[0].id diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py index 42aa12c451c04..c73208abe6005 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py +++ b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py @@ -7,18 +7,20 @@ from openai import OpenAI # to enforce the format of a tool call response, but it could be used for # any structured output within a subset of 
the response. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + def main(): client = OpenAI( - base_url="http://localhost:8000/v1", - api_key="-", + base_url=openai_api_base, + api_key=openai_api_key, ) - messages = [{ - "role": - "user", - "content": - """ + messages = [ + { + "role": "user", + "content": """ You have access to the following function to retrieve the weather in a city: { @@ -55,29 +57,28 @@ You are a helpful assistant. Given the previous instructions, what is the weather in New York City, Boston, and San Francisco? -""" - }] +""", + } + ] response = client.chat.completions.create( model=client.models.list().data[0].id, messages=messages, response_format={ - "type": - "structural_tag", - "structures": [{ - "begin": "<function=get_weather>", - "schema": { - "type": "object", - "properties": { - "city": { - "type": "string" - } - } - }, - "end": "</function>" - }], - "triggers": ["<function="] - }) + "type": "structural_tag", + "structures": [ + { + "begin": "<function=get_weather>", + "schema": { + "type": "object", + "properties": {"city": {"type": "string"}}, + }, + "end": "</function>", + } + ], + "triggers": ["<function="], + }, + ) print(response) diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py index a04f0cdf12f76..1ca61a8d5895f 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py @@ -27,21 +27,22 @@ openai_api_base = "http://localhost:8000/v1" def print_completion_details(completion): - print("reasoning_content: ", - completion.choices[0].message.reasoning_content) + print("reasoning_content: ", completion.choices[0].message.reasoning_content) print("content: ", completion.choices[0].message.content) # Guided decoding by Regex def 
guided_regex_completion(client: OpenAI, model: str): - prompt = ("What is the capital of France?") + prompt = "What is the capital of France?" completion = client.chat.completions.create( model=model, - messages=[{ - "role": "user", - "content": prompt, - }], + messages=[ + { + "role": "user", + "content": prompt, + } + ], extra_body={ "guided_regex": "(Paris|London)", }, @@ -57,13 +58,15 @@ class People(BaseModel): def guided_json_completion(client: OpenAI, model: str): json_schema = People.model_json_schema() - prompt = ("Generate a JSON with the name and age of one random person.") + prompt = "Generate a JSON with the name and age of one random person." completion = client.chat.completions.create( model=model, - messages=[{ - "role": "user", - "content": prompt, - }], + messages=[ + { + "role": "user", + "content": prompt, + } + ], extra_body={"guided_json": json_schema}, ) print_completion_details(completion) @@ -86,14 +89,18 @@ class CarDescription(BaseModel): def guided_car_json_completion(client: OpenAI, model: str): json_schema = CarDescription.model_json_schema() - prompt = ("Generate a JSON with the brand, model and car_type of" - "the most iconic car from the 90's") + prompt = ( + "Generate a JSON with the brand, model and car_type of" + "the most iconic car from the 90's" + ) completion = client.chat.completions.create( model=model, - messages=[{ - "role": "user", - "content": prompt, - }], + messages=[ + { + "role": "user", + "content": prompt, + } + ], extra_body={"guided_json": json_schema}, ) print_completion_details(completion) @@ -116,14 +123,18 @@ def guided_grammar_completion(client: OpenAI, model: str): """ # This may be very slow https://github.com/vllm-project/vllm/issues/12122 - prompt = ("Generate an SQL query to show the 'username' and 'email'" - "from the 'users' table.") + prompt = ( + "Generate an SQL query to show the 'username' and 'email'" + "from the 'users' table." 
+ ) completion = client.chat.completions.create( model=model, - messages=[{ - "role": "user", - "content": prompt, - }], + messages=[ + { + "role": "user", + "content": prompt, + } + ], extra_body={"guided_grammar": simplified_sql_grammar}, ) print_completion_details(completion) diff --git a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py index 9417abd3989a2..a5febad45863b 100644 --- a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py @@ -20,9 +20,11 @@ from openai import OpenAI # Now, simulate a tool call -def get_current_weather(city: str, state: str, unit: 'str'): - return ("The weather in Dallas, Texas is 85 degrees fahrenheit. It is " - "partly cloudly, with highs in the 90's.") +def get_current_weather(city: str, state: str, unit: "str"): + return ( + "The weather in Dallas, Texas is 85 degrees fahrenheit. It is " + "partly cloudly, with highs in the 90's." + ) available_tools = {"get_current_weather": get_current_weather} @@ -31,49 +33,47 @@ available_tools = {"get_current_weather": get_current_weather} openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -tools = [{ - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "city": { - "type": - "string", - "description": - "The city to find the weather for, e.g. 'San Francisco'" - }, - "state": { - "type": - "string", - "description": - "the two-letter abbreviation for the state that the city is" - " in, e.g. 
'CA' which would mean 'California'" - }, - "unit": { - "type": "string", - "description": "The unit to fetch the temperature in", - "enum": ["celsius", "fahrenheit"] - } +properties = { + "city": { + "type": "string", + "description": "The city to find the weather for, e.g. 'San Francisco'", + }, + "state": { + "type": "string", + "description": "the two-letter abbreviation for the state that the city is" + " in, e.g. 'CA' which would mean 'California'", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, +} + +tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": properties, + "required": ["city", "state", "unit"], }, - "required": ["city", "state", "unit"] - } + }, } -}] -messages = [{ - "role": "user", - "content": "Hi! How are you doing today?" -}, { - "role": "assistant", - "content": "I'm doing well! How can I help you?" -}, { - "role": - "user", - "content": - "Can you tell me what the temperate will be in Dallas, in fahrenheit?" -}] +] +messages = [ + {"role": "user", "content": "Hi! How are you doing today?"}, + {"role": "assistant", "content": "I'm doing well! How can I help you?"}, + { + "role": "user", + "content": ( + "Can you tell me what the temperate will be in Dallas, in fahrenheit?" 
+ ), + }, +] def extract_reasoning_and_calls(chunks: list): @@ -110,73 +110,55 @@ def main(): models = client.models.list() model = models.data[0].id - print( - "---------Full Generate With Automatic Function Calling-------------") - tool_calls = client.chat.completions.create(messages=messages, - model=model, - tools=tools) - print( - f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}" + print("---------Full Generate With Automatic Function Calling-------------") + tool_calls = client.chat.completions.create( + messages=messages, model=model, tools=tools ) - print(f"function name: " - f"{tool_calls.choices[0].message.tool_calls[0].function.name}") - print(f"function arguments: " - f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}") - + print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}") + print(f"function name: {tool_calls.choices[0].message.tool_calls[0].function.name}") print( - "----------Stream Generate With Automatic Function Calling-----------") - tool_calls_stream = client.chat.completions.create(messages=messages, - model=model, - tools=tools, - stream=True) + f"function arguments: " + f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}" + ) + + print("----------Stream Generate With Automatic Function Calling-----------") + tool_calls_stream = client.chat.completions.create( + messages=messages, model=model, tools=tools, stream=True + ) chunks = list(tool_calls_stream) - reasoning_content, arguments, function_names = extract_reasoning_and_calls( - chunks) + reasoning_content, arguments, function_names = extract_reasoning_and_calls(chunks) print(f"reasoning_content: {reasoning_content}") print(f"function name: {function_names[0]}") print(f"function arguments: {arguments[0]}") - print( - "----------Full Generate With Named Function Calling-----------------") - tool_calls = client.chat.completions.create(messages=messages, - model=model, - tools=tools, - tool_choice={ - "type": 
"function", - "function": { - "name": - "get_current_weather" - } - }) + print("----------Full Generate With Named Function Calling-----------------") + tool_calls = client.chat.completions.create( + messages=messages, + model=model, + tools=tools, + tool_choice={"type": "function", "function": {"name": "get_current_weather"}}, + ) tool_call = tool_calls.choices[0].message.tool_calls[0].function - print( - f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}" - ) + print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}") print(f"function name: {tool_call.name}") print(f"function arguments: {tool_call.arguments}") - print( - "----------Stream Generate With Named Function Calling--------------") + print("----------Stream Generate With Named Function Calling--------------") tool_calls_stream = client.chat.completions.create( messages=messages, model=model, tools=tools, - tool_choice={ - "type": "function", - "function": { - "name": "get_current_weather" - } - }, - stream=True) + tool_choice={"type": "function", "function": {"name": "get_current_weather"}}, + stream=True, + ) chunks = list(tool_calls_stream) - reasoning_content, arguments, function_names = extract_reasoning_and_calls( - chunks) + reasoning_content, arguments, function_names = extract_reasoning_and_calls(chunks) print(f"reasoning_content: {reasoning_content}") print(f"function name: {function_names[0]}") print(f"function arguments: {arguments[0]}") diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py index 4bf7731cb41e3..f6b8082115f12 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning.py @@ -45,12 +45,12 @@ def main(): # Round 2 messages.append({"role": "assistant", "content": content}) - messages.append({ - "role": - "user", - "content": - "How many Rs are there in the word 
'strawberry'?", - }) + messages.append( + { + "role": "user", + "content": "How many Rs are there in the word 'strawberry'?", + } + ) response = client.chat.completions.create(model=model, messages=messages) reasoning_content = response.choices[0].message.reasoning_content diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py index 9cc0a5f2476b3..f984fbabf24fd 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py @@ -43,9 +43,7 @@ def main(): # ruff: noqa: E501 # For granite: add: `extra_body={"chat_template_kwargs": {"thinking": True}}` - stream = client.chat.completions.create(model=model, - messages=messages, - stream=True) + stream = client.chat.completions.create(model=model, messages=messages, stream=True) print("client: Start streaming chat completions...") printed_reasoning_content = False diff --git a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py index c850b5aa2f800..ee519e555ff7f 100644 --- a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py @@ -14,26 +14,17 @@ def vlm2vec(): response = requests.post( "http://localhost:8000/v1/embeddings", json={ - "model": - "TIGER-Lab/VLM2Vec-Full", - "messages": [{ - "role": - "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": image_url - } - }, - { - "type": "text", - "text": "Represent the given image." 
- }, - ], - }], - "encoding_format": - "float", + "model": "TIGER-Lab/VLM2Vec-Full", + "messages": [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Represent the given image."}, + ], + } + ], + "encoding_format": "float", }, ) response.raise_for_status() @@ -45,19 +36,20 @@ def vlm2vec(): def dse_qwen2_vl(inp: dict): # Embedding an Image if inp["type"] == "image": - messages = [{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": inp["image_url"], - } - }, { - "type": "text", - "text": "What is shown in this image?" - }] - }] + messages = [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": inp["image_url"], + }, + }, + {"type": "text", "text": "What is shown in this image?"}, + ], + } + ] # Embedding a Text Query else: # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image @@ -66,23 +58,21 @@ def dse_qwen2_vl(inp: dict): image_placeholder = Image.new("RGB", (56, 56)) image_placeholder.save(buffer, "png") buffer.seek(0) - image_placeholder = base64.b64encode(buffer.read()).decode('utf-8') - messages = [{ - "role": - "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{image_placeholder}", - } - }, - { - "type": "text", - "text": f"Query: {inp['content']}" - }, - ] - }] + image_placeholder = base64.b64encode(buffer.read()).decode("utf-8") + messages = [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_placeholder}", + }, + }, + {"type": "text", "text": f"Query: {inp['content']}"}, + ], + } + ] response = requests.post( "http://localhost:8000/v1/embeddings", @@ -101,12 +91,15 @@ def dse_qwen2_vl(inp: dict): def parse_args(): parser = argparse.ArgumentParser( "Script to call a specified VLM through the API. 
Make sure to serve " - "the model with --task embed before running this.") - parser.add_argument("--model", - type=str, - choices=["vlm2vec", "dse_qwen2_vl"], - required=True, - help="Which model to call.") + "the model with --task embed before running this." + ) + parser.add_argument( + "--model", + type=str, + choices=["vlm2vec", "dse_qwen2_vl"], + required=True, + help="Which model to call.", + ) return parser.parse_args() @@ -114,16 +107,20 @@ def main(args): if args.model == "vlm2vec": vlm2vec() elif args.model == "dse_qwen2_vl": - dse_qwen2_vl({ - "type": "image", - "image_url": image_url, - }) - dse_qwen2_vl({ - "type": "text", - "content": "What is the weather like today?", - }) + dse_qwen2_vl( + { + "type": "image", + "image_url": image_url, + } + ) + dse_qwen2_vl( + { + "type": "text", + "content": "What is the weather like today?", + } + ) -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() main(args) diff --git a/examples/online_serving/openai_classification_client.py b/examples/online_serving/openai_classification_client.py index 99241346373ea..649cfa5d6686b 100644 --- a/examples/online_serving/openai_classification_client.py +++ b/examples/online_serving/openai_classification_client.py @@ -16,9 +16,7 @@ def parse_args(): parse = argparse.ArgumentParser() parse.add_argument("--host", type=str, default="localhost") parse.add_argument("--port", type=int, default=8000) - parse.add_argument("--model", - type=str, - default="jason9693/Qwen2.5-1.5B-apeach") + parse.add_argument("--model", type=str, default="jason9693/Qwen2.5-1.5B-apeach") return parse.parse_args() diff --git a/examples/online_serving/openai_completion_client.py b/examples/online_serving/openai_completion_client.py index 6ab7619bff192..b1d21b5e4b9f7 100644 --- a/examples/online_serving/openai_completion_client.py +++ b/examples/online_serving/openai_completion_client.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +import argparse + from openai import OpenAI # 
Modify OpenAI's API key and API base to use vLLM's API server. @@ -7,7 +9,15 @@ openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -def main(): +def parse_args(): + parser = argparse.ArgumentParser(description="Client for vLLM API server") + parser.add_argument( + "--stream", action="store_true", help="Enable streaming response" + ) + return parser.parse_args() + + +def main(args): client = OpenAI( # defaults to os.environ.get("OPENAI_API_KEY") api_key=openai_api_key, @@ -18,18 +28,18 @@ def main(): model = models.data[0].id # Completion API - stream = False completion = client.completions.create( model=model, prompt="A robot may not injure a human being", echo=False, n=2, - stream=stream, - logprobs=3) + stream=args.stream, + logprobs=3, + ) print("-" * 50) print("Completion results:") - if stream: + if args.stream: for c in completion: print(c) else: @@ -38,4 +48,5 @@ def main(): if __name__ == "__main__": - main() + args = parse_args() + main(args) diff --git a/examples/online_serving/openai_cross_encoder_score.py b/examples/online_serving/openai_cross_encoder_score.py index 20a64ddb21413..7891e14cb71e2 100644 --- a/examples/online_serving/openai_cross_encoder_score.py +++ b/examples/online_serving/openai_cross_encoder_score.py @@ -4,6 +4,7 @@ Example online usage of Score API. Run `vllm serve <model> --task score` to start up the server in vLLM. """ + import argparse import pprint @@ -38,9 +39,7 @@ def main(args): pprint.pprint(score_response.json()) text_1 = "What is the capital of France?" - text_2 = [ - "The capital of Brazil is Brasilia.", "The capital of France is Paris." 
- ] + text_2 = ["The capital of Brazil is Brasilia.", "The capital of France is Paris."] prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} score_response = post_http_request(prompt=prompt, api_url=api_url) print("\nPrompt when text_1 is string and text_2 is a list:") @@ -48,12 +47,8 @@ def main(args): print("\nScore Response:") pprint.pprint(score_response.json()) - text_1 = [ - "What is the capital of Brazil?", "What is the capital of France?" - ] - text_2 = [ - "The capital of Brazil is Brasilia.", "The capital of France is Paris." - ] + text_1 = ["What is the capital of Brazil?", "What is the capital of France?"] + text_2 = ["The capital of Brazil is Brasilia.", "The capital of France is Paris."] prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} score_response = post_http_request(prompt=prompt, api_url=api_url) print("\nPrompt when text_1 and text_2 are both lists:") diff --git a/examples/online_serving/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py index bc217f7ca7a0b..a055654e91332 100644 --- a/examples/online_serving/openai_embedding_client.py +++ b/examples/online_serving/openai_embedding_client.py @@ -21,7 +21,7 @@ def main(): # ruff: noqa: E501 input=[ "Hello my name is", - "The best thing about vLLM is that it supports many different models" + "The best thing about vLLM is that it supports many different models", ], model=model, ) diff --git a/examples/online_serving/openai_pooling_client.py b/examples/online_serving/openai_pooling_client.py index abcfe27c27699..2620a12320241 100644 --- a/examples/online_serving/openai_pooling_client.py +++ b/examples/online_serving/openai_pooling_client.py @@ -5,6 +5,7 @@ Example online usage of Pooling API. Run `vllm serve <model> --task <embed|classify|reward|score>` to start up the server in vLLM. 
""" + import argparse import pprint @@ -21,9 +22,7 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) - parser.add_argument("--model", - type=str, - default="jason9693/Qwen2.5-1.5B-apeach") + parser.add_argument("--model", type=str, default="jason9693/Qwen2.5-1.5B-apeach") return parser.parse_args() @@ -42,15 +41,13 @@ def main(args): # Input like Chat API prompt = { - "model": - model_name, - "messages": [{ - "role": "user", - "content": [{ - "type": "text", - "text": "vLLM is great!" - }], - }] + "model": model_name, + "messages": [ + { + "role": "user", + "content": [{"type": "text", "text": "vLLM is great!"}], + } + ], } pooling_response = post_http_request(prompt=prompt, api_url=api_url) print("Pooling Response:") diff --git a/examples/online_serving/openai_transcription_client.py b/examples/online_serving/openai_transcription_client.py index 66e622672ef2a..eb501ae72aa9f 100644 --- a/examples/online_serving/openai_transcription_client.py +++ b/examples/online_serving/openai_transcription_client.py @@ -7,8 +7,8 @@ from openai import OpenAI from vllm.assets.audio import AudioAsset -mary_had_lamb = AudioAsset('mary_had_lamb').get_local_path() -winning_call = AudioAsset('winning_call').get_local_path() +mary_had_lamb = AudioAsset("mary_had_lamb").get_local_path() +winning_call = AudioAsset("winning_call").get_local_path() # Modify OpenAI's API key and API base to use vLLM's API server. 
openai_api_key = "EMPTY" @@ -31,7 +31,8 @@ def sync_openai(): extra_body=dict( seed=4419, repetition_penalty=1.3, - )) + ), + ) print("transcription result:", transcription.text) @@ -42,33 +43,30 @@ sync_openai() async def stream_openai_response(): data = { "language": "en", - 'stream': True, + "stream": True, "model": "openai/whisper-large-v3", } url = openai_api_base + "/audio/transcriptions" headers = {"Authorization": f"Bearer {openai_api_key}"} - print("transcription result:", end=' ') + print("transcription result:", end=" ") async with httpx.AsyncClient() as client: with open(str(winning_call), "rb") as f: - async with client.stream('POST', - url, - files={'file': f}, - data=data, - headers=headers) as response: + async with client.stream( + "POST", url, files={"file": f}, data=data, headers=headers + ) as response: async for line in response.aiter_lines(): # Each line is a JSON object prefixed with 'data: ' if line: - if line.startswith('data: '): - line = line[len('data: '):] + if line.startswith("data: "): + line = line[len("data: ") :] # Last chunk, stream ends - if line.strip() == '[DONE]': + if line.strip() == "[DONE]": break # Parse the JSON response chunk = json.loads(line) # Extract and print the content - content = chunk['choices'][0].get('delta', - {}).get('content') - print(content, end='') + content = chunk["choices"][0].get("delta", {}).get("content") + print(content, end="") # Run the asynchronous function diff --git a/examples/online_serving/opentelemetry/dummy_client.py b/examples/online_serving/opentelemetry/dummy_client.py index a8b353090d79b..33d365f0caa56 100644 --- a/examples/online_serving/opentelemetry/dummy_client.py +++ b/examples/online_serving/opentelemetry/dummy_client.py @@ -1,14 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 import requests -from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import ( - OTLPSpanExporter) +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from 
opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import (BatchSpanProcessor, - ConsoleSpanExporter) +from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter from opentelemetry.trace import SpanKind, set_tracer_provider -from opentelemetry.trace.propagation.tracecontext import ( - TraceContextTextMapPropagator) +from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator trace_provider = TracerProvider() set_tracer_provider(trace_provider) diff --git a/examples/online_serving/prometheus_grafana/grafana.json b/examples/online_serving/prometheus_grafana/grafana.json index fbe96b48e7995..3488956a5b24c 100644 --- a/examples/online_serving/prometheus_grafana/grafana.json +++ b/examples/online_serving/prometheus_grafana/grafana.json @@ -577,23 +577,6 @@ "refId": "A", "useBackend": false }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "vllm:num_requests_swapped{model_name=\"$model_name\"}", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Num Swapped", - "range": true, - "refId": "B", - "useBackend": false - }, { "datasource": { "type": "prometheus", @@ -874,19 +857,6 @@ "legendFormat": "GPU Cache Usage", "range": true, "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "expr": "vllm:cpu_cache_usage_perc{model_name=\"$model_name\"}", - "hide": false, - "instant": false, - "legendFormat": "CPU Cache Usage", - "range": true, - "refId": "B" } ], "title": "Cache Utilization", diff --git a/examples/online_serving/prompt_embed_inference_with_openai_client.py b/examples/online_serving/prompt_embed_inference_with_openai_client.py index ea580f1b432b8..85ea2340736e8 100644 --- a/examples/online_serving/prompt_embed_inference_with_openai_client.py +++ 
b/examples/online_serving/prompt_embed_inference_with_openai_client.py @@ -26,6 +26,7 @@ Dependencies: - torch - openai """ + import base64 import io @@ -44,17 +45,13 @@ def main(): # Transformers tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) - transformers_model = transformers.AutoModelForCausalLM.from_pretrained( - model_name) + transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name) # Refer to the HuggingFace repo for the correct format to use - chat = [{ - "role": "user", - "content": "Please tell me about the capital of France." - }] - token_ids = tokenizer.apply_chat_template(chat, - add_generation_prompt=True, - return_tensors='pt') + chat = [{"role": "user", "content": "Please tell me about the capital of France."}] + token_ids = tokenizer.apply_chat_template( + chat, add_generation_prompt=True, return_tensors="pt" + ) embedding_layer = transformers_model.get_input_embeddings() prompt_embeds = embedding_layer(token_ids).squeeze(0) @@ -64,7 +61,7 @@ def main(): torch.save(prompt_embeds, buffer) buffer.seek(0) binary_data = buffer.read() - encoded_embeds = base64.b64encode(binary_data).decode('utf-8') + encoded_embeds = base64.b64encode(binary_data).decode("utf-8") completion = client.completions.create( model=model_name, @@ -75,7 +72,8 @@ def main(): temperature=0.0, # NOTE: The OpenAI client allows passing in extra JSON body via the # `extra_body` argument. 
- extra_body={"prompt_embeds": encoded_embeds}) + extra_body={"prompt_embeds": encoded_embeds}, + ) print("-" * 30) print(completion.choices[0].text) diff --git a/examples/online_serving/ray_serve_deepseek.py b/examples/online_serving/ray_serve_deepseek.py index e2dce107e78a3..a76020130c3ac 100644 --- a/examples/online_serving/ray_serve_deepseek.py +++ b/examples/online_serving/ray_serve_deepseek.py @@ -28,9 +28,7 @@ llm_config = LLMConfig( }, # Change to the accelerator type of the node accelerator_type="H100", - runtime_env={"env_vars": { - "VLLM_USE_V1": "1" - }}, + runtime_env={"env_vars": {"VLLM_USE_V1": "1"}}, # Customize engine arguments as needed (e.g. vLLM engine kwargs) engine_kwargs={ "tensor_parallel_size": 8, diff --git a/examples/online_serving/retrieval_augmented_generation_with_langchain.py b/examples/online_serving/retrieval_augmented_generation_with_langchain.py index 73063065cb36e..37af3b3887f57 100644 --- a/examples/online_serving/retrieval_augmented_generation_with_langchain.py +++ b/examples/online_serving/retrieval_augmented_generation_with_langchain.py @@ -55,7 +55,7 @@ def load_and_split_documents(config: dict[str, Any]): Load and split documents from web URL """ try: - loader = WebBaseLoader(web_paths=(config["url"], )) + loader = WebBaseLoader(web_paths=(config["url"],)) docs = loader.load() text_splitter = RecursiveCharacterTextSplitter( @@ -121,64 +121,71 @@ def create_qa_chain(retriever: Any, llm: ChatOpenAI, prompt: PromptTemplate): """ Set up question answering chain """ - return ({ - "context": retriever | format_docs, - "question": RunnablePassthrough(), - } - | prompt - | llm - | StrOutputParser()) + return ( + { + "context": retriever | format_docs, + "question": RunnablePassthrough(), + } + | prompt + | llm + | StrOutputParser() + ) def get_parser() -> argparse.ArgumentParser: """ Parse command line arguments """ - parser = argparse.ArgumentParser(description='RAG with vLLM and langchain') + parser = 
argparse.ArgumentParser(description="RAG with vLLM and langchain") # Add command line arguments - parser.add_argument('--vllm-api-key', - default="EMPTY", - help='API key for vLLM compatible services') - parser.add_argument('--vllm-embedding-endpoint', - default="http://localhost:8000/v1", - help='Base URL for embedding service') - parser.add_argument('--vllm-chat-endpoint', - default="http://localhost:8001/v1", - help='Base URL for chat service') - parser.add_argument('--uri', - default="./milvus.db", - help='URI for Milvus database') parser.add_argument( - '--url', - default=("https://docs.vllm.ai/en/latest/getting_started/" - "quickstart.html"), - help='URL of the document to process') - parser.add_argument('--embedding-model', - default="ssmits/Qwen2-7B-Instruct-embed-base", - help='Model name for embeddings') - parser.add_argument('--chat-model', - default="qwen/Qwen1.5-0.5B-Chat", - help='Model name for chat') - parser.add_argument('-i', - '--interactive', - action='store_true', - help='Enable interactive Q&A mode') - parser.add_argument('-k', - '--top-k', - type=int, - default=3, - help='Number of top results to retrieve') - parser.add_argument('-c', - '--chunk-size', - type=int, - default=1000, - help='Chunk size for document splitting') - parser.add_argument('-o', - '--chunk-overlap', - type=int, - default=200, - help='Chunk overlap for document splitting') + "--vllm-api-key", default="EMPTY", help="API key for vLLM compatible services" + ) + parser.add_argument( + "--vllm-embedding-endpoint", + default="http://localhost:8000/v1", + help="Base URL for embedding service", + ) + parser.add_argument( + "--vllm-chat-endpoint", + default="http://localhost:8001/v1", + help="Base URL for chat service", + ) + parser.add_argument("--uri", default="./milvus.db", help="URI for Milvus database") + parser.add_argument( + "--url", + default=("https://docs.vllm.ai/en/latest/getting_started/quickstart.html"), + help="URL of the document to process", + ) + 
parser.add_argument( + "--embedding-model", + default="ssmits/Qwen2-7B-Instruct-embed-base", + help="Model name for embeddings", + ) + parser.add_argument( + "--chat-model", default="qwen/Qwen1.5-0.5B-Chat", help="Model name for chat" + ) + parser.add_argument( + "-i", "--interactive", action="store_true", help="Enable interactive Q&A mode" + ) + parser.add_argument( + "-k", "--top-k", type=int, default=3, help="Number of top results to retrieve" + ) + parser.add_argument( + "-c", + "--chunk-size", + type=int, + default=1000, + help="Chunk size for document splitting", + ) + parser.add_argument( + "-o", + "--chunk-overlap", + type=int, + default=200, + help="Chunk overlap for document splitting", + ) return parser @@ -198,7 +205,7 @@ def init_config(args: Namespace): "url": args.url, "chunk_size": args.chunk_size, "chunk_overlap": args.chunk_overlap, - "top_k": args.top_k + "top_k": args.top_k, } @@ -230,7 +237,7 @@ def main(): while True: question = input("\nPlease enter your question: ") - if question.lower() in ['q', 'quit']: + if question.lower() in ["q", "quit"]: print("\nThank you for using! Goodbye!") break @@ -238,7 +245,7 @@ def main(): print(output) else: # Default single question mode - question = ("How to install vLLM?") + question = "How to install vLLM?" 
output = qa_chain.invoke(question) print("-" * 50) print(output) diff --git a/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py b/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py index a8f76dfe4c697..08796b1b3a546 100644 --- a/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py +++ b/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py @@ -35,6 +35,7 @@ Notes: - Default ports: 8000 (embedding), 8001 (chat) - First run may take time to download models """ + import argparse from argparse import Namespace from typing import Any @@ -59,7 +60,7 @@ def init_config(args: Namespace): "db_path": args.db_path, "chunk_size": args.chunk_size, "chunk_overlap": args.chunk_overlap, - "top_k": args.top_k + "top_k": args.top_k, } @@ -117,52 +118,58 @@ def query_document(index: VectorStoreIndex, question: str, top_k: int): def get_parser() -> argparse.ArgumentParser: """Parse command line arguments""" - parser = argparse.ArgumentParser( - description='RAG with vLLM and LlamaIndex') + parser = argparse.ArgumentParser(description="RAG with vLLM and LlamaIndex") # Add command line arguments parser.add_argument( - '--url', - default=("https://docs.vllm.ai/en/latest/getting_started/" - "quickstart.html"), - help='URL of the document to process') - parser.add_argument('--embedding-model', - default="ssmits/Qwen2-7B-Instruct-embed-base", - help='Model name for embeddings') - parser.add_argument('--chat-model', - default="qwen/Qwen1.5-0.5B-Chat", - help='Model name for chat') - parser.add_argument('--vllm-api-key', - default="EMPTY", - help='API key for vLLM compatible services') - parser.add_argument('--embedding-endpoint', - default="http://localhost:8000/v1", - help='Base URL for embedding service') - parser.add_argument('--chat-endpoint', - default="http://localhost:8001/v1", - help='Base URL for chat service') - parser.add_argument('--db-path', - default="./milvus_demo.db", - help='Path to 
Milvus database') - parser.add_argument('-i', - '--interactive', - action='store_true', - help='Enable interactive Q&A mode') - parser.add_argument('-c', - '--chunk-size', - type=int, - default=1000, - help='Chunk size for document splitting') - parser.add_argument('-o', - '--chunk-overlap', - type=int, - default=200, - help='Chunk overlap for document splitting') - parser.add_argument('-k', - '--top-k', - type=int, - default=3, - help='Number of top results to retrieve') + "--url", + default=("https://docs.vllm.ai/en/latest/getting_started/quickstart.html"), + help="URL of the document to process", + ) + parser.add_argument( + "--embedding-model", + default="ssmits/Qwen2-7B-Instruct-embed-base", + help="Model name for embeddings", + ) + parser.add_argument( + "--chat-model", default="qwen/Qwen1.5-0.5B-Chat", help="Model name for chat" + ) + parser.add_argument( + "--vllm-api-key", default="EMPTY", help="API key for vLLM compatible services" + ) + parser.add_argument( + "--embedding-endpoint", + default="http://localhost:8000/v1", + help="Base URL for embedding service", + ) + parser.add_argument( + "--chat-endpoint", + default="http://localhost:8001/v1", + help="Base URL for chat service", + ) + parser.add_argument( + "--db-path", default="./milvus_demo.db", help="Path to Milvus database" + ) + parser.add_argument( + "-i", "--interactive", action="store_true", help="Enable interactive Q&A mode" + ) + parser.add_argument( + "-c", + "--chunk-size", + type=int, + default=1000, + help="Chunk size for document splitting", + ) + parser.add_argument( + "-o", + "--chunk-overlap", + type=int, + default=200, + help="Chunk overlap for document splitting", + ) + parser.add_argument( + "-k", "--top-k", type=int, default=3, help="Number of top results to retrieve" + ) return parser @@ -193,7 +200,7 @@ def main(): question = input("\nEnter your question: ") # Check for exit command - if question.lower() in ['quit', 'exit', 'q']: + if question.lower() in ["quit", "exit", "q"]: 
print("Exiting interactive mode...") break diff --git a/examples/online_serving/streamlit_openai_chatbot_webserver.py b/examples/online_serving/streamlit_openai_chatbot_webserver.py index d8a0f211d44d5..0722aa671f66b 100644 --- a/examples/online_serving/streamlit_openai_chatbot_webserver.py +++ b/examples/online_serving/streamlit_openai_chatbot_webserver.py @@ -26,6 +26,7 @@ Usage: streamlit run streamlit_openai_chatbot_webserver.py \ --logger.level=debug """ + import os from datetime import datetime @@ -33,8 +34,8 @@ import streamlit as st from openai import OpenAI # Get command line arguments from environment variables -openai_api_key = os.getenv('VLLM_API_KEY', "EMPTY") -openai_api_base = os.getenv('VLLM_API_BASE', "http://localhost:8000/v1") +openai_api_key = os.getenv("VLLM_API_KEY", "EMPTY") +openai_api_base = os.getenv("VLLM_API_BASE", "http://localhost:8000/v1") # Initialize session states for managing chat sessions if "sessions" not in st.session_state: @@ -81,9 +82,9 @@ def get_llm_response(messages, model): Streaming response object or error message string """ try: - response = client.chat.completions.create(model=model, - messages=messages, - stream=True) + response = client.chat.completions.create( + model=model, messages=messages, stream=True + ) return response except Exception as e: st.error(f"Error details: {str(e)}") @@ -92,8 +93,9 @@ def get_llm_response(messages, model): # Sidebar - API Settings first st.sidebar.title("API Settings") -new_api_base = st.sidebar.text_input("API Base URL:", - value=st.session_state.api_base_url) +new_api_base = st.sidebar.text_input( + "API Base URL:", value=st.session_state.api_base_url +) if new_api_base != st.session_state.api_base_url: st.session_state.api_base_url = new_api_base st.rerun() @@ -109,16 +111,20 @@ if st.sidebar.button("New Session"): for session_id in sorted(st.session_state.sessions.keys(), reverse=True): # Mark the active session with a pinned button if session_id == 
st.session_state.active_session: - st.sidebar.button(f"๐Ÿ“ {session_id}", - key=session_id, - type="primary", - on_click=switch_to_chat_session, - args=(session_id, )) + st.sidebar.button( + f"๐Ÿ“ {session_id}", + key=session_id, + type="primary", + on_click=switch_to_chat_session, + args=(session_id,), + ) else: - st.sidebar.button(f"Session {session_id}", - key=session_id, - on_click=switch_to_chat_session, - args=(session_id, )) + st.sidebar.button( + f"Session {session_id}", + key=session_id, + on_click=switch_to_chat_session, + args=(session_id,), + ) # Main interface st.title("vLLM Chat Assistant") @@ -145,18 +151,18 @@ for message in st.session_state.messages: if prompt := st.chat_input("Type your message here..."): # Save user message to session st.session_state.messages.append({"role": "user", "content": prompt}) - st.session_state.sessions[ - st.session_state.current_session] = st.session_state.messages + st.session_state.sessions[st.session_state.current_session] = ( + st.session_state.messages + ) # Display user message with st.chat_message("user"): st.write(prompt) # Prepare messages for llm - messages_for_llm = [{ - "role": m["role"], - "content": m["content"] - } for m in st.session_state.messages] + messages_for_llm = [ + {"role": m["role"], "content": m["content"]} for m in st.session_state.messages + ] # Generate and display llm response with st.chat_message("assistant"): @@ -179,7 +185,4 @@ if prompt := st.chat_input("Type your message here..."): message_placeholder.markdown(full_response) # Save llm response to session history - st.session_state.messages.append({ - "role": "assistant", - "content": full_response - }) + st.session_state.messages.append({"role": "assistant", "content": full_response}) diff --git a/examples/online_serving/utils.py b/examples/online_serving/utils.py index 4826e8e205282..0781a27f19c51 100644 --- a/examples/online_serving/utils.py +++ b/examples/online_serving/utils.py @@ -16,10 +16,10 @@ def get_first_model(client: 
OpenAI) -> str: f"{client.base_url} with API key {client.api_key}. Check\n" "1. the server is running\n" "2. the server URL is correct\n" - "3. the API key is correct") from e + "3. the API key is correct" + ) from e if len(models.data) == 0: - raise RuntimeError( - f"No models found on the vLLM server at {client.base_url}") + raise RuntimeError(f"No models found on the vLLM server at {client.base_url}") return models.data[0].id diff --git a/examples/lmcache/README.md b/examples/others/lmcache/README.md similarity index 100% rename from examples/lmcache/README.md rename to examples/others/lmcache/README.md diff --git a/examples/lmcache/cpu_offload_lmcache.py b/examples/others/lmcache/cpu_offload_lmcache.py similarity index 87% rename from examples/lmcache/cpu_offload_lmcache.py rename to examples/others/lmcache/cpu_offload_lmcache.py index eedb47dfc12e5..98eafb31ed4f1 100644 --- a/examples/lmcache/cpu_offload_lmcache.py +++ b/examples/others/lmcache/cpu_offload_lmcache.py @@ -20,6 +20,7 @@ Requirements: Linux, Python: 3.10 or higher, CUDA: 12.1 Learn more about LMCache environment setup, please refer to: https://docs.lmcache.ai/getting_started/installation.html """ + import argparse import contextlib import os @@ -49,8 +50,7 @@ def setup_environment_variables(vllm_version: str): @contextlib.contextmanager -def build_llm_with_lmcache(lmcache_connector: str, model: str, - vllm_version: str): +def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str): ktc = KVTransferConfig( kv_connector=lmcache_connector, kv_role="kv_both", @@ -97,18 +97,19 @@ def print_output( for output in outputs: generated_text = output.outputs[0].text print(f"Generated text: {generated_text!r}") - print(f"Generation took {time.time() - start:.2f} seconds, " - f"{req_str} request done.") + print(f"Generation took {time.time() - start:.2f} seconds, {req_str} request done.") print("-" * 50) def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument("-v", - 
"--version", - choices=["v0", "v1"], - default="v1", - help="Specify vLLM version (default: v1)") + parser.add_argument( + "-v", + "--version", + choices=["v0", "v1"], + default="v1", + help="Specify vLLM version (default: v1)", + ) return parser.parse_args() @@ -125,7 +126,6 @@ def main(): setup_environment_variables(args.version) with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm: - # This example script runs two requests with a shared prefix. # Define the shared prompt and specific prompts shared_prompt = "Hello, how are you?" * 1000 @@ -136,9 +136,7 @@ def main(): shared_prompt + "Tell me a very long story", ] - sampling_params = SamplingParams(temperature=0, - top_p=0.95, - max_tokens=10) + sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) # Print the first output print_output(llm, first_prompt, sampling_params, "first") diff --git a/examples/lmcache/disagg_prefill_lmcache_v0.py b/examples/others/lmcache/disagg_prefill_lmcache_v0.py similarity index 77% rename from examples/lmcache/disagg_prefill_lmcache_v0.py rename to examples/others/lmcache/disagg_prefill_lmcache_v0.py index 66cc941852307..b2b7b3b2c1f97 100644 --- a/examples/lmcache/disagg_prefill_lmcache_v0.py +++ b/examples/others/lmcache/disagg_prefill_lmcache_v0.py @@ -4,12 +4,13 @@ This file demonstrates the example usage of disaggregated prefilling with LMCache. We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode), and launch an additional LMCache server. -KV cache is transferred in the following manner: +KV cache is transferred in the following manner: vLLM prefill node -> LMCache server -> vLLM decode node. Note that `pip install lmcache` is needed to run this example. Learn more about LMCache in https://github.com/LMCache/LMCache. 
""" + import os import subprocess import time @@ -49,19 +50,23 @@ def run_prefill(prefill_done, prompts): sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) - ktc = KVTransferConfig(kv_connector="LMCacheConnector", - kv_role="kv_producer", - kv_rank=0, - kv_parallel_size=2) + ktc = KVTransferConfig( + kv_connector="LMCacheConnector", + kv_role="kv_producer", + kv_rank=0, + kv_parallel_size=2, + ) # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB # memory. Reduce the value if your GPU has less memory. - llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2", - kv_transfer_config=ktc, - max_model_len=8000, - gpu_memory_utilization=0.8, - enforce_eager=True) + llm = LLM( + model="mistralai/Mistral-7B-Instruct-v0.2", + kv_transfer_config=ktc, + max_model_len=8000, + gpu_memory_utilization=0.8, + enforce_eager=True, + ) - #llm.generate(prompts, sampling_params) + # llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params) for output in outputs: generated_text = output.outputs[0].text @@ -79,17 +84,21 @@ def run_decode(prefill_done, prompts, timeout=1): sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) - ktc = KVTransferConfig(kv_connector="LMCacheConnector", - kv_role="kv_consumer", - kv_rank=1, - kv_parallel_size=2) + ktc = KVTransferConfig( + kv_connector="LMCacheConnector", + kv_role="kv_consumer", + kv_rank=1, + kv_parallel_size=2, + ) # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB # of memory. Reduce the value if your GPU has less memory. 
- llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2", - kv_transfer_config=ktc, - max_model_len=8000, - gpu_memory_utilization=0.8, - enforce_eager=True) + llm = LLM( + model="mistralai/Mistral-7B-Instruct-v0.2", + kv_transfer_config=ktc, + max_model_len=8000, + gpu_memory_utilization=0.8, + enforce_eager=True, + ) print("Waiting for prefill node to finish...") prefill_done.wait() @@ -105,10 +114,9 @@ def run_decode(prefill_done, prompts, timeout=1): def run_lmcache_server(port): - server_proc = subprocess.Popen([ - "python", "-m", "lmcache.experimental.server", "localhost", - str(port) - ]) + server_proc = subprocess.Popen( + ["python", "-m", "lmcache.experimental.server", "localhost", str(port)] + ) return server_proc diff --git a/examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml b/examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml similarity index 100% rename from examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml rename to examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml diff --git a/examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml b/examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml similarity index 100% rename from examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml rename to examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml diff --git a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh similarity index 100% rename from examples/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh rename to examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh diff --git a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py 
b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py similarity index 59% rename from examples/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py rename to examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py index 8db93bc8931b2..20155c2036580 100644 --- a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py +++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py @@ -17,13 +17,17 @@ async def lifespan(app: FastAPI): Lifespan context manager to handle startup and shutdown events. """ # Startup: Initialize clients - prefiller_base_url = f'http://{global_args.prefiller_host}:{global_args.prefiller_port}/v1' - decoder_base_url = f'http://{global_args.decoder_host}:{global_args.decoder_port}/v1' + prefiller_base_url = ( + f"http://{global_args.prefiller_host}:{global_args.prefiller_port}/v1" + ) + decoder_base_url = ( + f"http://{global_args.decoder_host}:{global_args.decoder_port}/v1" + ) - app.state.prefill_client = httpx.AsyncClient(timeout=None, - base_url=prefiller_base_url) - app.state.decode_client = httpx.AsyncClient(timeout=None, - base_url=decoder_base_url) + app.state.prefill_client = httpx.AsyncClient( + timeout=None, base_url=prefiller_base_url + ) + app.state.decode_client = httpx.AsyncClient(timeout=None, base_url=decoder_base_url) yield @@ -37,7 +41,6 @@ app = FastAPI(lifespan=lifespan) class StatsCalculator: - def __init__(self): self._stats = [] self._last_log_time = time.time() @@ -51,13 +54,18 @@ class StatsCalculator: def _log_stats(self): # Print average, median, and 99th percentile np_arr = np.array(self._stats) - output_str = f"\nNum requests: {len(self._stats)}" + \ - "\nPrefill node TTFT stats:" + \ - f"\n - Average (ms): {np.mean(np_arr)}" + \ - f"\n - Median (ms): {np.median(np_arr)}" + \ - f"\n - 99th Percentile (ms): {np.percentile(np_arr, 99)}\n" - print("===============================", output_str, - "===============================") + output_str = ( + 
f"\nNum requests: {len(self._stats)}" + + "\nPrefill node TTFT stats:" + + f"\n - Average (ms): {np.mean(np_arr)}" + + f"\n - Median (ms): {np.median(np_arr)}" + + f"\n - 99th Percentile (ms): {np.percentile(np_arr, 99)}\n" + ) + print( + "===============================", + output_str, + "===============================", + ) stats_calculator = StatsCalculator() @@ -82,15 +90,16 @@ app.state.prefill_client = None app.state.decode_client = None -async def send_request_to_service(client: httpx.AsyncClient, endpoint: str, - req_data: dict): +async def send_request_to_service( + client: httpx.AsyncClient, endpoint: str, req_data: dict +): """ Send a request to a service using a persistent client. """ req_data = req_data.copy() - req_data['max_tokens'] = 1 - if 'max_completion_tokens' in req_data: - req_data['max_completion_tokens'] = 1 + req_data["max_tokens"] = 1 + if "max_completion_tokens" in req_data: + req_data["max_completion_tokens"] = 1 headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} response = await client.post(endpoint, json=req_data, headers=headers) @@ -98,14 +107,16 @@ async def send_request_to_service(client: httpx.AsyncClient, endpoint: str, return response -async def stream_service_response(client: httpx.AsyncClient, endpoint: str, - req_data: dict): +async def stream_service_response( + client: httpx.AsyncClient, endpoint: str, req_data: dict +): """ Asynchronously stream the response from a service using a persistent client. 
""" headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} - async with client.stream("POST", endpoint, json=req_data, - headers=headers) as response: + async with client.stream( + "POST", endpoint, json=req_data, headers=headers + ) as response: response.raise_for_status() async for chunk in response.aiter_bytes(): yield chunk @@ -121,28 +132,28 @@ async def handle_completions(request: Request): req_data = await request.json() # Send request to prefill service, ignore the response - await send_request_to_service(app.state.prefill_client, "/completions", - req_data) + await send_request_to_service( + app.state.prefill_client, "/completions", req_data + ) et = time.time() stats_calculator.add(et - st) # Stream response from decode service async def generate_stream(): - async for chunk in stream_service_response(app.state.decode_client, - "/completions", - req_data): + async for chunk in stream_service_response( + app.state.decode_client, "/completions", req_data + ): yield chunk - return StreamingResponse(generate_stream(), - media_type="application/json") + return StreamingResponse(generate_stream(), media_type="text/event-stream") except Exception as e: import sys import traceback + exc_info = sys.exc_info() - print("Error occurred in disagg prefill proxy server" - " - completions endpoint") + print("Error occurred in disagg prefill proxy server - completions endpoint") print(e) print("".join(traceback.format_exception(*exc_info))) raise @@ -158,36 +169,39 @@ async def handle_chat_completions(request: Request): req_data = await request.json() # Send request to prefill service, ignore the response - await send_request_to_service(app.state.prefill_client, - "/chat/completions", req_data) + await send_request_to_service( + app.state.prefill_client, "/chat/completions", req_data + ) et = time.time() stats_calculator.add(et - st) # Stream response from decode service async def generate_stream(): - async for chunk in 
stream_service_response(app.state.decode_client, - "/chat/completions", - req_data): + async for chunk in stream_service_response( + app.state.decode_client, "/chat/completions", req_data + ): yield chunk - return StreamingResponse(generate_stream(), - media_type="application/json") + return StreamingResponse(generate_stream(), media_type="text/event-stream") except Exception as e: import sys import traceback + exc_info = sys.exc_info() - print("Error occurred in disagg prefill proxy server " - " - chat completions endpoint") + print( + "Error occurred in disagg prefill proxy server - chat completions endpoint" + ) print(e) print("".join(traceback.format_exception(*exc_info))) raise -if __name__ == '__main__': +if __name__ == "__main__": global global_args global_args = parse_args() import uvicorn + uvicorn.run(app, host=global_args.host, port=global_args.port) diff --git a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh similarity index 100% rename from examples/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh rename to examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh diff --git a/examples/lmcache/kv_cache_sharing_lmcache_v1.py b/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py similarity index 81% rename from examples/lmcache/kv_cache_sharing_lmcache_v1.py rename to examples/others/lmcache/kv_cache_sharing_lmcache_v1.py index 7748f8ca6133a..89945d67a6f38 100644 --- a/examples/lmcache/kv_cache_sharing_lmcache_v1.py +++ b/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py @@ -3,13 +3,14 @@ This file demonstrates the example usage of remote KV cache sharing with LMCache. We will launch 2 vllm instances, and launch an additional LMCache server. -KV cache is transferred in the following manner: +KV cache is transferred in the following manner: (1) vLLM instance 1 -> LMCache server (KV cache store). 
(2) LMCache server -> vLLM instance 2 (KV cache reuse/retrieve). Note that lmcache needs to be installed to run this example. Learn more about LMCache in https://github.com/LMCache/LMCache. """ + import os import subprocess import time @@ -49,15 +50,16 @@ def run_store(store_done, prompts): sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) - ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", - kv_role="kv_both") + ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", kv_role="kv_both") # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB # memory. Reduce the value if your GPU has less memory. - llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2", - kv_transfer_config=ktc, - max_model_len=8000, - gpu_memory_utilization=0.8, - enforce_eager=True) + llm = LLM( + model="mistralai/Mistral-7B-Instruct-v0.2", + kv_transfer_config=ktc, + max_model_len=8000, + gpu_memory_utilization=0.8, + enforce_eager=True, + ) outputs = llm.generate(prompts, sampling_params) for output in outputs: @@ -76,15 +78,16 @@ def run_retrieve(store_done, prompts, timeout=1): sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) - ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", - kv_role="kv_both") + ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", kv_role="kv_both") # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB # of memory. Reduce the value if your GPU has less memory. 
- llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2", - kv_transfer_config=ktc, - max_model_len=8000, - gpu_memory_utilization=0.8, - enforce_eager=True) + llm = LLM( + model="mistralai/Mistral-7B-Instruct-v0.2", + kv_transfer_config=ktc, + max_model_len=8000, + gpu_memory_utilization=0.8, + enforce_eager=True, + ) print("Waiting for KV cache store to finish...") store_done.wait() @@ -100,10 +103,9 @@ def run_retrieve(store_done, prompts, timeout=1): def run_lmcache_server(port): - server_proc = subprocess.Popen([ - "python", "-m", "lmcache.experimental.server", "localhost", - str(port) - ]) + server_proc = subprocess.Popen( + ["python", "-m", "lmcache.experimental.server", "localhost", str(port)] + ) return server_proc diff --git a/examples/other/logging_configuration.md b/examples/others/logging_configuration.md similarity index 100% rename from examples/other/logging_configuration.md rename to examples/others/logging_configuration.md diff --git a/examples/other/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py similarity index 72% rename from examples/other/tensorize_vllm_model.py rename to examples/others/tensorize_vllm_model.py index 7d11ba51a0943..1757776308334 100644 --- a/examples/other/tensorize_vllm_model.py +++ b/examples/others/tensorize_vllm_model.py @@ -6,11 +6,15 @@ import json import os import uuid -from vllm import LLM +from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs -from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs, - TensorizerConfig, - tensorize_vllm_model) +from vllm.lora.request import LoRARequest +from vllm.model_executor.model_loader.tensorizer import ( + TensorizerArgs, + TensorizerConfig, + tensorize_lora_adapter, + tensorize_vllm_model, +) from vllm.utils import FlexibleArgumentParser # yapf conflicts with isort for this docstring @@ -27,7 +31,7 @@ https://github.com/coreweave/tensorizer To serialize a model, install vLLM from source, then run something like this 
from the root level of this repository: -python -m examples.other.tensorize_vllm_model \ +python examples/others/tensorize_vllm_model.py \ --model facebook/opt-125m \ serialize \ --serialized-directory s3://my-bucket \ @@ -47,7 +51,7 @@ providing a `--keyfile` argument. To deserialize a model, you can run something like this from the root level of this repository: -python -m examples.other.tensorize_vllm_model \ +python examples/others/tensorize_vllm_model.py \ --model EleutherAI/gpt-j-6B \ --dtype float16 \ deserialize \ @@ -65,11 +69,11 @@ shard's rank. Sharded models serialized with this script will be named as model-rank-%03d.tensors For more information on the available arguments for serializing, run -`python -m examples.other.tensorize_vllm_model serialize --help`. +`python -m examples.others.tensorize_vllm_model serialize --help`. Or for deserializing: -`python -m examples.other.tensorize_vllm_model deserialize --help`. +`python examples/others/tensorize_vllm_model.py deserialize --help`. Once a model is serialized, tensorizer can be invoked with the `LLM` class directly to load models: @@ -90,11 +94,27 @@ TensorizerConfig arguments desired. In order to see all of the available arguments usable to configure loading with tensorizer that are given to `TensorizerConfig`, run: -`python -m examples.other.tensorize_vllm_model deserialize --help` +`python examples/others/tensorize_vllm_model.py deserialize --help` under the `tensorizer options` section. These can also be used for deserialization in this example script, although `--tensorizer-uri` and `--path-to-tensors` are functionally the same in this case. + +Tensorizer can also be used to save and load LoRA adapters. A LoRA adapter +can be serialized directly with the path to the LoRA adapter on HF Hub and +a TensorizerConfig object. In this script, passing a HF id to a LoRA adapter +will serialize the LoRA adapter artifacts to `--serialized-directory`. 
+ +You can then use the LoRA adapter with `vllm serve`, for instance, by ensuring +the LoRA artifacts are in your model artifacts directory and specifying +`--enable-lora`. For instance: + +``` +vllm serve <model_path> \ + --load-format tensorizer \ + --model-loader-extra-config '{"tensorizer_uri": "<model_path>.tensors"}' \ + --enable-lora +``` """ @@ -107,6 +127,19 @@ def parse_args(): "also supported, although libsodium must be installed to " "use it.") parser = EngineArgs.add_cli_args(parser) + + parser.add_argument( + "--lora-path", + type=str, + required=False, + help="Path to a LoRA adapter to " + "serialize along with model tensors. This can then be deserialized " + "along with the model by passing a tensorizer_config kwarg to " + "LoRARequest with type TensorizerConfig. See the docstring for this " + "for a usage example." + + ) + subparsers = parser.add_subparsers(dest='command') serialize_parser = subparsers.add_parser( @@ -169,11 +202,42 @@ def parse_args(): def deserialize(): - llm = LLM(model=args.model, - load_format="tensorizer", - tensor_parallel_size=args.tensor_parallel_size, - model_loader_extra_config=tensorizer_config - ) + if args.lora_path: + tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir + llm = LLM(model=args.model, + load_format="tensorizer", + tensor_parallel_size=args.tensor_parallel_size, + model_loader_extra_config=tensorizer_config, + enable_lora=True, + ) + sampling_params = SamplingParams( + temperature=0, + max_tokens=256, + stop=["[/assistant]"] + ) + + # Truncating this as the extra text isn't necessary + prompts = [ + "[user] Write a SQL query to answer the question based on ..." 
+ ] + + # Test LoRA load + print( + llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest("sql-lora", + 1, + args.lora_path, + tensorizer_config = tensorizer_config) + ) + ) + else: + llm = LLM(model=args.model, + load_format="tensorizer", + tensor_parallel_size=args.tensor_parallel_size, + model_loader_extra_config=tensorizer_config + ) return llm @@ -197,7 +261,10 @@ if __name__ == '__main__': model_name = model_ref.split("/")[1] - keyfile = args.keyfile if args.keyfile else None + if args.command == "serialize" or args.command == "deserialize": + keyfile = args.keyfile + else: + keyfile = None if args.model_loader_extra_config: config = json.loads(args.model_loader_extra_config) @@ -228,6 +295,10 @@ if __name__ == '__main__': encryption_keyfile=keyfile, **credentials) + if args.lora_path: + tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir + tensorize_lora_adapter(args.lora_path, tensorizer_config) + tensorize_vllm_model(engine_args, tensorizer_config) elif args.command == "deserialize": diff --git a/examples/pyproject.toml b/examples/pyproject.toml new file mode 100644 index 0000000000000..f825cb203269c --- /dev/null +++ b/examples/pyproject.toml @@ -0,0 +1,54 @@ +# This local pyproject file is part of the migration from yapf to ruff format. 
+# It uses the same core rules as the main pyproject.toml file, but with the +# following differences: +# - ruff line length is overridden to 88 +# - deprecated typing ignores (UP006, UP035) have been removed + +[tool.ruff] +line-length = 88 +exclude = [ + # External file, leaving license intact + "examples/other/fp8/quantizer/quantize.py", + "vllm/vllm_flash_attn/flash_attn_interface.pyi" +] + +[tool.ruff.lint.per-file-ignores] +"vllm/third_party/**" = ["ALL"] +"vllm/version.py" = ["F401"] +"vllm/_version.py" = ["ALL"] + +[tool.ruff.lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # pyupgrade + "UP", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + "I", + # flake8-logging-format + "G", +] +ignore = [ + # star imports + "F405", "F403", + # lambda expression assignment + "E731", + # Loop control variable not used within loop body + "B007", + # f-string format + "UP032", + # Can remove once 3.10+ is the minimum Python version + "UP007", +] + +[tool.ruff.lint.isort] +known-first-party = ["vllm"] + +[tool.ruff.format] +docstring-code-format = true \ No newline at end of file diff --git a/examples/tool_chat_template_llama4_pythonic.jinja b/examples/tool_chat_template_llama4_pythonic.jinja index bd18a35bdda93..bbed3d8205e07 100644 --- a/examples/tool_chat_template_llama4_pythonic.jinja +++ b/examples/tool_chat_template_llama4_pythonic.jinja @@ -1,16 +1,17 @@ {{- bos_token }} -{%- if custom_tools is defined %} +{%- if custom_tools is defined and custom_tools%} {%- set tools = custom_tools %} {%- endif %} -{%- if not tools_in_user_message is defined %} - {%- set tools_in_user_message = false %} -{%- endif %} -{%- if not tools is defined %} +{%- if tools is defined and tools %} + {%- set tool_definition = tool_definition ~ (tools | tojson(indent=4)) %} +{%- else %} {%- set tools = none %} {%- endif %} + {#- This block extracts the system message, so we can slot it into the right place. 
#} {%- if messages[0]['role'] == 'system' %} + {%- set user_provided_system_message = true %} {%- if messages[0]['content'] is string %} {%- set system_message = messages[0]['content']|trim %} {%- else %} @@ -18,68 +19,33 @@ {%- endif %} {%- set messages = messages[1:] %} {%- else %} - {%- if tools is not none %} - {#- Add default tool system message when tools are provided #} - {%- set system_message = "You are a helpful assistant with tool calling " - "capabilities. Only reply with a tool call if the function exists in the " - "library provided by the user. If it doesn't exist, just reply directly in " - "natural language. When you receive a tool call response, use the output to " - "format an answer to the original user question." %} + {%- if tools is not none %} + {#- Since not system_message was provided by user, if tool is provided, system_message is now default tool system message #} + {#- This system message is from llama website:https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/ #} + {%- set system_message = "You are a helpful assistant and an expert in function composition. You can answer general questions using your internal knowledge OR invoke functions when necessary. Follow these strict guidelines:\n\n1. 
FUNCTION CALLS:\n- ONLY use functions that are EXPLICITLY listed in the function list below\n- If NO functions are listed (empty function list []), respond ONLY with internal knowledge or \"I don't have access to [Unavailable service] information\"\n- If a function is not in the list, respond ONLY with internal knowledge or \"I don't have access to [Unavailable service] information\"\n- If ALL required parameters are present AND the query EXACTLY matches a listed function's purpose: output ONLY the function call(s)\n- Use exact format: [func_name1(param1=value1, param2=value2), func_name2(...)]\nExamples:\nCORRECT: [get_weather(location=\"Vancouver\"), calculate_route(start=\"Boston\", end=\"New York\")] <- Only if get_weather and calculate_route are in function list\nINCORRECT: get_weather(location=\"New York\")\nINCORRECT: Let me check the weather: [get_weather(location=\"New York\")]\nINCORRECT: [get_events(location=\"Singapore\")] <- If function not in list\n\n2. RESPONSE RULES:\n- For pure function requests matching a listed function: ONLY output the function call(s)\n- For knowledge questions: ONLY output text\n- For missing parameters: ONLY request the specific missing parameters\n- For unavailable services (not in function list): output ONLY with internal knowledge or \"I don't have access to [Unavailable service] information\". Do NOT execute a function call.\n- If the query asks for information beyond what a listed function provides: output ONLY with internal knowledge about your limitations\n- NEVER combine text and function calls in the same response\n- NEVER suggest alternative functions when the requested service is unavailable\n- NEVER create or invent new functions not listed below\n\n3. 
STRICT BOUNDARIES:\n- ONLY use functions from the list below - no exceptions\n- NEVER use a function as an alternative to unavailable information\n- NEVER call functions not present in the function list\n- NEVER add explanatory text to function calls\n- NEVER respond with empty brackets\n- Use proper Python/JSON syntax for function calls\n- Check the function list carefully before responding\n\n4. TOOL RESPONSE HANDLING:\n- When receiving tool responses: provide concise, natural language responses\n- Don't repeat tool response verbatim\n- Don't add supplementary information\n\nHere is a list of functions in JSON format that you can invoke:\n" %} {%- else %} {%- set system_message = "" %} {%- endif %} {%- endif %} - -{#- System message if the user supplied one, or if tools are used (default tool system message) #} +{#- Now writing the system message: use the user provided system message if user_provided_system_message, else default tool system message if tools presented #} {%- if system_message %} {#- always use user provided system message to override default tool system message #} {{- "<|header_start|>system<|header_end|>\n\n" }} {{- system_message }} - {%- if tools is not none and not tools_in_user_message %} - {{- "Tools: You have access to the following tools. You might need to use one " - "or more function/tool calls to fulfill the task. \n" - "If none are needed, then proceed to the response.\n\n" - "Tool Call Syntax: You can call tools using the following syntax:\n" - "[func_name1(params_name1=params_value1, params_name2=params_value2, ...), ...]\n" - "Do not include anything else when calling the tools with the syntax above.\n\n" - "Here is a list of functions in JSON format that you can invoke.\n " }} - {%- for t in tools %} - {{- t | tojson(indent=4) }} - {{- "\n\n" }} - {%- endfor %} + {%- if user_provided_system_message and tools %} + {{- "\nHere is a list of functions in JSON format that you can invoke. 
Use exact format: [func_name1(param1=value1, param2=value2), func_name2(...)]\n" }} + {{- tool_definition -}} + {%- elif tool_definition %} + {{- tool_definition -}} {%- endif %} {{- "<|eot|>" }} {%- endif %} -{#- Custom tools are passed in a user message with some extra guidance #} -{%- if tools_in_user_message and tools is not none %} - {#- Extract the first user message so we can plug it in here #} - {%- if messages | length != 0 %} - {%- if messages[0]['content'] is string %} - {%- set first_user_message = messages[0]['content']|trim %} - {%- else %} - {%- set first_user_message = messages[0]['content'] | selectattr('type', 'equalto', 'text') | map(attribute='text') | map('trim') | join('\n') %} - {%- endif %} - {%- set messages = messages[1:] %} - {%- else %} - {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} - {%- endif %} - {{- '<|header_start|>user<|header_end|>\n\n' -}} - {{- first_user_message}} - {{- "\nHere is a list of functions in JSON format that you can invoke:"}} - {%- for t in tools %} - {{- t | tojson(indent=4) }} - {{- "\n\n" }} - {%- endfor %} - {{- "Should you decide to return the function call(s), put them in the format " - "of [func_name1(params_name1=params_value1, params_name2=params_value2, " - "...), ...]\nDo not include anything else when calling the tools with the " - "syntax above." 
}} -{%- endif %} - +{#- Now deal with all other messages #} {%- for message in messages %} - {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} - {{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }} + {#- Base case: messages that are not from tool role and has empty tool_call list #} + {%- if not (message.role == 'ipython' or message.role == 'tool' or ('tool_calls' in message and message.tool_calls|length != 0 )) %} + {{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }} {%- if message['content'] is string %} {{- message['content'] }} {%- else %} @@ -91,10 +57,12 @@ {%- endif %} {%- endfor %} {%- endif %} - {{- "<|eot|>" }} - {%- elif 'tool_calls' in message and message.tool_calls|length > 0 %} - {%- set tool_call = message.tool_calls[0].function %} - {{- '<|header_start|>assistant<|header_end|>\n\n' -}} + {{- "<|eot|>" }} + {#- Tool case: messages has non-empty tool_call list, must from assistant #} + {%- elif 'tool_calls' in message %} + {#- assume tool_calls are always coming from assistant #} + {%- if message.role == 'assistant' %} + {{- '<|header_start|>assistant<|header_end|>\n\n' -}} {%- if message['content'] is string %} {{- message['content'] }} {%- else %} @@ -106,32 +74,36 @@ {%- endif %} {%- endfor %} {%- endif %} + {{- "[" }} {%- for tool_call in message.tool_calls %} {%- if tool_call.function is defined %} {%- set tool_call = tool_call.function %} {%- endif %} - {{- tool_call.name + '(' -}} + {{- tool_call.name + '(' -}} {%- for param in tool_call.arguments %} - {{- param + '=' -}} + {{- param + '="' -}} {{- "%s" | format(tool_call.arguments[param]) -}} + {{- '"' -}} {% if not loop.last %}, {% endif %} {%- endfor %} {{- ')' -}} {% if not loop.last %}, {% endif %} {%- endfor %} - {{- "<|eom|>" }} + {{- "]<|eot|>" }} +{%- endif %} +{#- Tool_response case: messages are from tool_response #} {%- elif message.role == "tool" or message.role == "ipython" %} {{- 
"<|header_start|>ipython<|header_end|>\n\n" }} {%- if message.content is string %} - {{- message.content | tojson }} + {{- message.content | tojson }} {%- else %} {%- for content in message['content'] %} {%- if content['type'] == 'text' %} - {{- content['text'] | tojson }} + {{- content['text'] | tojson }} {%- endif %} {%- endfor %} {%- endif %} - {{- "<|eom|>" }} + {{- "<|eot|>" }} {%- endif %} {%- endfor %} {%- if add_generation_prompt %} diff --git a/mkdocs.yaml b/mkdocs.yaml new file mode 100644 index 0000000000000..52de643f5e2bc --- /dev/null +++ b/mkdocs.yaml @@ -0,0 +1,130 @@ +site_name: vLLM +site_url: https://docs.vllm.ai +repo_url: https://github.com/vllm-project/vllm +exclude_docs: | + *.inc.md + *.template.md +theme: + name: material + logo: assets/logos/vllm-logo-only-light.ico + favicon: assets/logos/vllm-logo-only-light.ico + palette: + # Palette toggle for automatic mode + - media: "(prefers-color-scheme)" + toggle: + icon: material/brightness-auto + name: Switch to light mode + # Palette toggle for light mode + - media: "(prefers-color-scheme: light)" + scheme: default + primary: white + toggle: + icon: material/brightness-7 + name: Switch to dark mode + # Palette toggle for dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + primary: black + toggle: + icon: material/brightness-2 + name: Switch to system preference + features: + - content.code.copy + - content.tabs.link + - navigation.tracking + - navigation.tabs + - navigation.sections + - navigation.prune + - navigation.top + - search.highlight + - search.share + - toc.follow + custom_dir: docs/mkdocs/overrides + +hooks: + - docs/mkdocs/hooks/remove_announcement.py + - docs/mkdocs/hooks/generate_examples.py + - docs/mkdocs/hooks/url_schemes.py + +# Required to stop api-autonav from raising an error +# https://github.com/tlambert03/mkdocs-api-autonav/issues/16 +nav: + - api + +plugins: + - meta + - search + - autorefs + - awesome-nav + # For API reference generation + - 
api-autonav: + modules: ["vllm"] + api_root_uri: "api" + exclude: + - "re:vllm\\._.*" # Internal modules + - "vllm.third_party" + - "vllm.vllm_flash_attn" + - mkdocstrings: + handlers: + python: + options: + show_symbol_type_heading: true + show_symbol_type_toc: true + filters: [] + summary: + modules: true + show_if_no_docstring: true + show_signature_annotations: true + separate_signature: true + show_overloads: true + signature_crossrefs: true + inventories: + - https://docs.python.org/3/objects.inv + - https://typing-extensions.readthedocs.io/en/latest/objects.inv + - https://docs.aiohttp.org/en/stable/objects.inv + - https://pillow.readthedocs.io/en/stable/objects.inv + - https://numpy.org/doc/stable/objects.inv + - https://pytorch.org/docs/stable/objects.inv + - https://psutil.readthedocs.io/en/stable/objects.inv + +markdown_extensions: + - attr_list + - md_in_html + - admonition + - pymdownx.details + # For content tabs + - pymdownx.superfences + - pymdownx.tabbed: + slugify: !!python/object/apply:pymdownx.slugs.slugify + kwds: + case: lower + alternate_style: true + # For code highlighting + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - pymdownx.inlinehilite + - pymdownx.snippets + # For emoji and icons + - pymdownx.emoji: + emoji_index: !!python/name:material.extensions.emoji.twemoji + emoji_generator: !!python/name:material.extensions.emoji.to_svg + # For in page [TOC] (not sidebar) + - toc: + permalink: true + # For math rendering + - mdx_math: + enable_dollar_delimiter: true + +extra_css: + - mkdocs/stylesheets/extra.css + +extra_javascript: + - mkdocs/javascript/run_llm_widget.js + - https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML + +# Makes the url format end in .html rather than act as a dir +# So index.md generates as index.html and is available under URL /index.html +# https://www.mkdocs.org/user-guide/configuration/#use_directory_urls +use_directory_urls: false diff --git 
a/pyproject.toml b/pyproject.toml index 0b803a26b6581..10f5dbeae6851 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,8 +35,8 @@ dynamic = [ "version", "dependencies", "optional-dependencies"] [project.urls] Homepage="https://github.com/vllm-project/vllm" -Documentation="https://vllm.readthedocs.io/en/latest/" -Slack="http://slack.vllm.ai/" +Documentation="https://docs.vllm.ai/en/latest/" +Slack="https://slack.vllm.ai/" [project.scripts] vllm = "vllm.entrypoints.cli.main:main" @@ -56,16 +56,12 @@ ignore_patterns = [ ".buildkite/**", "benchmarks/**", "build/**", + "examples/**", ] [tool.ruff] # Allow lines to be as long as 80. line-length = 80 -exclude = [ - # External file, leaving license intact - "examples/other/fp8/quantizer/quantize.py", - "vllm/vllm_flash_attn/flash_attn_interface.pyi" -] [tool.ruff.lint.per-file-ignores] "vllm/third_party/**" = ["ALL"] @@ -113,6 +109,7 @@ ignore = [ ] [tool.mypy] +plugins = ['pydantic.mypy'] ignore_missing_imports = true check_untyped_defs = true follow_imports = "silent" @@ -148,6 +145,7 @@ skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora skip_glob = [ ".buildkite/*", "benchmarks/*", + "examples/*", ] use_parentheses = true skip_gitignore = true @@ -165,12 +163,16 @@ markers = [ [tool.pymarkdown] plugins.md004.style = "sublist" # ul-style +plugins.md007.indent = 4 # ul-indent +plugins.md007.start_indented = true # ul-indent plugins.md013.enabled = false # line-length plugins.md041.enabled = false # first-line-h1 plugins.md033.enabled = false # inline-html +plugins.md046.enabled = false # code-block-style plugins.md024.allow_different_nesting = true # no-duplicate-headers -[tool.ty] +[tool.ty.src] +root = "./vllm" respect-ignore-files = true [tool.ty.environment] diff --git a/requirements/build.txt b/requirements/build.txt index 5edc593b92700..320e5b8925843 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -7,3 +7,4 @@ setuptools-scm>=8 torch==2.7.0 wheel jinja2>=3.1.6 
+regex diff --git a/requirements/common.txt b/requirements/common.txt index 80f90e60007e0..de4b3b53166c9 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -1,3 +1,4 @@ +regex # Replace re for higher-performance regex matching cachetools psutil sentencepiece # Required for LLaMA tokenizer. @@ -7,13 +8,13 @@ tqdm blake3 py-cpuinfo transformers >= 4.51.1 -huggingface-hub[hf_xet] >= 0.30.0 # Required for Xet downloads. +huggingface-hub[hf_xet] >= 0.32.0 # Required for Xet downloads. tokenizers >= 0.21.1 # Required for fast incremental detokenization. protobuf # Required by LlamaTokenizer. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. aiohttp openai >= 1.52.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support) -pydantic >= 2.9 +pydantic >= 2.10 prometheus_client >= 0.18.0 pillow # Required for image processing prometheus-fastapi-instrumentator >= 7.0.0 @@ -40,7 +41,7 @@ compressed-tensors == 0.9.4 # required for compressed-tensors depyf==0.18.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py watchfiles # required for http server to monitor the updates of TLS files -python-json-logger # Used by logging as per examples/other/logging_configuration.md +python-json-logger # Used by logging as per examples/others/logging_configuration.md scipy # Required for phi-4-multimodal-instruct ninja # Required for xgrammar, rocm, tpu, xpu opentelemetry-sdk>=1.26.0 # vllm.tracing diff --git a/requirements/cpu.txt b/requirements/cpu.txt index d89847fe71fd0..1213301584ce3 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -8,7 +8,6 @@ setuptools>=77.0.3,<80.0.0 torch==2.7.0+cpu; platform_machine == "x86_64" torch==2.7.0; platform_system == "Darwin" torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64" 
-torch==2.7.0.dev20250304; platform_machine == "s390x" # required for the image processor of minicpm-o-2_6, this must be updated alongside torch torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x" @@ -21,3 +20,7 @@ datasets # for benchmark scripts # cpu cannot use triton 3.3.0 triton==3.2.0; platform_machine == "x86_64" + +# Intel Extension for PyTorch, only for x86_64 CPUs +intel-openmp==2024.2.1; platform_machine == "x86_64" +intel_extension_for_pytorch==2.7.0; platform_machine == "x86_64" diff --git a/requirements/docs.txt b/requirements/docs.txt index 9c267edaceaf1..64c70cb65c550 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,19 +1,9 @@ -sphinx==7.4.7 -sphinx-argparse==0.5.2 -sphinx-book-theme==1.1.4 -sphinx-copybutton==0.5.2 -sphinx-design==0.6.1 -sphinx-togglebutton==0.3.2 -myst-parser==3.0.1 # `myst-parser==4.0.1` breaks inline code in titles -msgspec -snowballstemmer<3 # https://github.com/snowballstem/snowball/issues/229 -commonmark # Required by sphinx-argparse when using :markdownhelp: - -# Custom autodoc2 is necessary for faster docstring processing -# see: https://github.com/sphinx-extensions2/sphinx-autodoc2/issues/33#issuecomment-2856386035 -git+https://github.com/hmellor/sphinx-autodoc2.git # sphinx-autodoc2==0.5.0 - -# packages to install to build the documentation -cachetools --f https://download.pytorch.org/whl/cpu -torch \ No newline at end of file +mkdocs +mkdocs-api-autonav +mkdocs-material +mkdocstrings-python +mkdocs-gen-files +mkdocs-awesome-nav +python-markdown-math +regex +ruff diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index 3aebcaa623c03..e9b466d3a82d6 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -38,4 +38,4 @@ matplotlib # required for qwen-vl test # required for Multi-Modal Models Test (Standard) num2words # required for smolvlm test pqdm -timm # required for internvl test +timm # required for 
internvl test \ No newline at end of file diff --git a/requirements/test.in b/requirements/test.in index 87af617690388..e906752ff875b 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -51,3 +51,4 @@ numpy runai-model-streamer==0.11.0 runai-model-streamer-s3==0.11.0 fastsafetensors>=0.1.10 +pydantic>=2.10 # 2.9 leads to error on python 3.10 \ No newline at end of file diff --git a/requirements/test.txt b/requirements/test.txt index 89d477017342e..60dcaca816a2b 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -480,12 +480,13 @@ pycparser==2.22 # via cffi pycryptodomex==3.22.0 # via blobfile -pydantic==2.9.2 +pydantic==2.11.5 # via + # -r requirements/test.in # datamodel-code-generator # mistral-common # mteb -pydantic-core==2.23.4 +pydantic-core==2.33.2 # via pydantic pygments==2.18.0 # via rich @@ -784,6 +785,9 @@ typing-extensions==4.12.2 # pydantic-core # torch # typer + # typing-inspection +typing-inspection==0.4.1 + # via pydantic tzdata==2024.2 # via pandas uri-template==1.3.0 diff --git a/requirements/tpu.txt b/requirements/tpu.txt index 11501bc5d92f3..edc8b2a456670 100644 --- a/requirements/tpu.txt +++ b/requirements/tpu.txt @@ -18,9 +18,9 @@ setuptools==78.1.0 --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch==2.8.0.dev20250430 -torchvision==0.22.0.dev20250430 -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250430-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250430-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch_xla[tpu, pallas] @ 
https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250430-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" +torch==2.8.0.dev20250529 +torchvision==0.22.0.dev20250529 +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250529-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250529-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250529-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" diff --git a/setup.py b/setup.py old mode 100755 new mode 100644 index 7675fbdf3efec..c190864dda94e --- a/setup.py +++ b/setup.py @@ -251,11 +251,8 @@ class cmake_build_ext(build_ext): # CMake appends the extension prefix to the install path, # and outdir already contains that prefix, so we need to remove it. - # We assume only the final component of extension prefix is added by - # CMake, this is currently true for current extensions but may not - # always be the case. prefix = outdir - if '.' 
in ext.name: + for _ in range(ext.name.count('.')): prefix = prefix.parent # prefix here should actually be the same for all components @@ -389,7 +386,6 @@ class repackage_wheel(build_ext): # vllm_flash_attn python code: # Regex from # `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)` - import re compiled_regex = re.compile( r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") file_members += list( diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 86b5e1e0ab7cf..11c8e7a4b9d1c 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -60,7 +60,6 @@ def _fix_prompt_embed_outputs( @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("backend", ["FLASH_ATTN"]) -@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("enforce_eager", [False]) @pytest.mark.parametrize("enable_prompt_embeds", [True, False]) @@ -69,7 +68,6 @@ def test_models( hf_runner, model: str, backend: str, - dtype: str, max_tokens: int, enforce_eager: bool, enable_prompt_embeds: bool, @@ -97,7 +95,7 @@ def test_models( str(i) for i in range(1024)) + " are:" example_prompts = [prompt] - with hf_runner(model, dtype=dtype) as hf_model: + with hf_runner(model) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) if enable_prompt_embeds: with torch.no_grad(): @@ -106,7 +104,6 @@ def test_models( with VllmRunner(model, max_model_len=8192, - dtype=dtype, enforce_eager=enforce_eager, enable_prompt_embeds=enable_prompt_embeds, gpu_memory_utilization=0.7) as vllm_model: diff --git a/tests/compile/backend.py b/tests/compile/backend.py index a21e8eca3a6e1..5a02c4e2b3782 100644 --- a/tests/compile/backend.py +++ b/tests/compile/backend.py @@ -5,6 +5,8 @@ from typing import Callable, Union from torch import fx +from vllm.compilation.fx_utils import 
(find_specified_fn, + find_specified_fn_maybe) from vllm.compilation.inductor_pass import InductorPass from vllm.config import get_current_vllm_config @@ -44,3 +46,19 @@ class TestBackend: self.graph_post_pass = deepcopy(graph) # assign by reference, will reflect the final state of the graph self.final_graph = graph + + def check_before_ops(self, ops, + find_fn=find_specified_fn, \ + find_fn_maybe=find_specified_fn_maybe, \ + ops_fully_replaced=True): + for op in ops: + find_fn(self.graph_pre_pass.nodes, op) + if ops_fully_replaced: + assert find_fn_maybe(self.graph_post_pass.nodes, op) is None + + def check_after_ops(self, ops, + find_fn=find_specified_fn, \ + find_fn_maybe=find_specified_fn_maybe): + for op in ops: + find_fn(self.graph_post_pass.nodes, op) + assert find_fn_maybe(self.graph_pre_pass.nodes, op) is None diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 143cb49697f5b..5ce520a440257 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -74,11 +74,12 @@ class SillyModel(nn.Module): return x -def test_simple_piecewise_compile(): +def _test_simple_piecewise_compile(*, use_inductor): vllm_config = VllmConfig(compilation_config=CompilationConfig( level=CompilationLevel.PIECEWISE, use_cudagraph=True, + use_inductor=use_inductor, splitting_ops=["silly.attention"], cudagraph_copy_inputs=True, cudagraph_capture_sizes=[1, 2], @@ -108,3 +109,11 @@ def test_simple_piecewise_compile(): output = model(input) assert global_counter == 2 assert torch.allclose(output.cpu(), torch.tensor([3., 1.])) + + +def test_simple_piecewise_compile_inductor(): + _test_simple_piecewise_compile(use_inductor=True) + + +def test_simple_piecewise_compile_no_inductor(): + _test_simple_piecewise_compile(use_inductor=False) diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index d4551b1cc3aec..22560befcbd56 100644 --- 
a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -261,12 +261,14 @@ def tractable_computation(input_ids: torch.Tensor, @torch.inference_mode def run_model(llama_config, use_compile: bool, + use_inductor: bool, split_attn: bool = False) -> torch.Tensor: if use_compile: compilation_config = CompilationConfig( level=CompilationLevel.PIECEWISE, use_cudagraph=True, + use_inductor=use_inductor, cudagraph_capture_sizes=[1, 2], ) if split_attn: @@ -304,7 +306,7 @@ def run_model(llama_config, return output.cpu() -def test_toy_llama(): +def _test_toy_llama(*, use_inductor): # compare output with and without piecewise compilation llama_config = LlamaConfig(hidden_size=128, @@ -326,8 +328,14 @@ def test_toy_llama(): num_backend_compilations=0, num_cudagraph_caputured=0, ): - outputs.append(run_model(llama_config, use_compile=False)) - run_model(tractable_config, use_compile=False) + outputs.append( + run_model(llama_config, use_inductor=False, use_compile=False)) + run_model(tractable_config, use_inductor=False, use_compile=False) + + if use_inductor: + kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0} + else: + kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0} with compilation_counter.expect( num_graphs_seen=1, # one graph for the model @@ -336,9 +344,13 @@ def test_toy_llama(): num_backend_compilations=1, # num_piecewise_capturable_graphs_seen num_cudagraph_caputured= 2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + **kwargs, ): - outputs.append(run_model(llama_config, use_compile=True)) - run_model(tractable_config, use_compile=True) + outputs.append( + run_model(llama_config, + use_inductor=use_inductor, + use_compile=True)) + run_model(tractable_config, use_inductor=use_inductor, use_compile=True) with compilation_counter.expect( num_graphs_seen=1, # one graph for the model @@ -353,13 +365,27 @@ def test_toy_llama(): ), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ): 
outputs.append( - run_model(llama_config, use_compile=True, split_attn=True)) - run_model(tractable_config, use_compile=True, split_attn=True) + run_model(llama_config, + use_inductor=use_inductor, + use_compile=True, + split_attn=True)) + run_model(tractable_config, + use_inductor=use_inductor, + use_compile=True, + split_attn=True) for i in range(1, len(outputs)): assert torch.allclose(outputs[0], outputs[i]) +def test_toy_llama_inductor(): + _test_toy_llama(use_inductor=True) + + +def test_toy_no_inductor(): + _test_toy_llama(use_inductor=False) + + @torch.inference_mode def benchmark(): from triton.testing import do_bench diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py new file mode 100644 index 0000000000000..8e4e0ba835793 --- /dev/null +++ b/tests/compile/test_async_tp.py @@ -0,0 +1,248 @@ +# SPDX-License-Identifier: Apache-2.0 + +import json + +import pytest +import torch + +import vllm.envs as envs +from vllm.compilation.collective_fusion import AsyncTPPass +from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig, + PassConfig, VllmConfig) +from vllm.distributed import (tensor_model_parallel_all_gather, + tensor_model_parallel_reduce_scatter) +from vllm.distributed.parallel_state import (init_distributed_environment, + initialize_model_parallel) +from vllm.platforms import current_platform +from vllm.utils import update_environment_variables + +from ..models.registry import HF_EXAMPLE_MODELS +from ..utils import (compare_two_settings, create_new_process_for_each_test, + multi_gpu_test) +from .backend import TestBackend + +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + + +class TestMMRSModel(torch.nn.Module): + + def __init__(self, hidden_size=16): + super().__init__() + self.hidden_size = hidden_size + self.gate_proj = torch.nn.Parameter(torch.empty( + (self.hidden_size * 2, hidden_size)), + requires_grad=False) + # 
Initialize weights + torch.nn.init.normal_(self.gate_proj, std=0.02) + + def forward(self, hidden_states): + """ + Forward pass implementing the mm + reduce scatter in the FX graph + + """ + # Reshape input + view = hidden_states.reshape(-1, self.hidden_size) + + # matrix multiplication + permute = self.gate_proj.permute(1, 0) + mm = torch.mm(view, permute) + reduce_scatter = tensor_model_parallel_reduce_scatter(mm, dim=0) + return reduce_scatter + + def ops_in_model_before(self): + return [torch.ops.vllm.reduce_scatter.default] + + def ops_in_model_after(self): + return [torch.ops.symm_mem.fused_matmul_reduce_scatter.default] + + +class TestAGMMModel(torch.nn.Module): + + def __init__(self, hidden_size=16): + super().__init__() + self.hidden_size = hidden_size + self.weight = torch.nn.Parameter(torch.empty( + (hidden_size, hidden_size)), + requires_grad=False) + # Initialize weights + torch.nn.init.normal_(self.weight, std=0.02) + + def forward(self, hidden_states): + """ + Forward pass implementing the mm + all gather in the FX graph + """ + # Reshape input + view = hidden_states.reshape(-1, self.hidden_size) + all_gather = tensor_model_parallel_all_gather(view, dim=0) + permute = self.weight.permute(1, 0) + mm = torch.mm(all_gather, permute) + return mm + + def ops_in_model_before(self): + return [torch.ops.vllm.all_gather.default] + + def ops_in_model_after(self): + return [torch.ops.symm_mem.fused_all_gather_matmul.default] + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize("test_model", [TestMMRSModel, TestAGMMModel]) +@pytest.mark.parametrize("batch_size", [8]) +@pytest.mark.parametrize("seq_len", [16]) +@pytest.mark.parametrize("hidden_size", [16]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], + reason="Only test on CUDA") +def test_async_tp_pass_replace(test_model: str, batch_size: int, seq_len: int, + hidden_size: int, dtype: torch.dtype): + num_processes = 2 + + 
def run_torch_spawn(fn, nprocs): + # need to use torch.mp.spawn otherwise will have problems with + # torch.distributed and cuda + torch.multiprocessing.spawn(fn, + args=(num_processes, test_model, + batch_size, seq_len, hidden_size, + dtype), + nprocs=nprocs) + + run_torch_spawn(async_tp_pass_on_test_model, num_processes) + + +def async_tp_pass_on_test_model(local_rank: int, world_size: int, + test_model_cls: torch.nn.Module, + batch_size: int, seq_len: int, + hidden_size: int, dtype: torch.dtype): + current_platform.seed_everything(0) + + device = torch.device(f"cuda:{local_rank}") + torch.cuda.set_device(device) + torch.set_default_device(device) + torch.set_default_dtype(dtype) + + update_environment_variables({ + 'RANK': str(local_rank), + 'LOCAL_RANK': str(local_rank), + 'WORLD_SIZE': str(world_size), + 'MASTER_ADDR': 'localhost', + 'MASTER_PORT': '12345', + }) + + # initialize distributed + init_distributed_environment() + initialize_model_parallel(tensor_model_parallel_size=world_size) + + # configure vllm config for SequenceParallelismPass + vllm_config = VllmConfig() + vllm_config.compilation_config = CompilationConfig(pass_config=PassConfig( + enable_async_tp=True, ), ) + vllm_config.device_config = DeviceConfig(device=torch.device("cuda")) + + # this is a fake model name to construct the model config + # in the vllm_config, it's not really used. 
+ model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e" + vllm_config.model_config = ModelConfig(model=model_name, + task="auto", + tokenizer=model_name, + tokenizer_mode="auto", + trust_remote_code=True, + dtype=dtype, + seed=42) + + async_tp_pass = AsyncTPPass(vllm_config) + backend = TestBackend(async_tp_pass) + + model = test_model_cls(hidden_size) + + hidden_states = torch.randn((batch_size * seq_len, hidden_size), + dtype=dtype, + requires_grad=False) + + compiled_model = torch.compile(model, backend=backend) + compiled_model(hidden_states) + + # In pre-nodes, all gather or reduce scatter should exist, + # fused_matmul_reduce_scatter or fused_all_gather_matmul should not + backend.check_before_ops(model.ops_in_model_before(), + ops_fully_replaced=False) + + # In post-nodes, fused_matmul_reduce_scatter or \ + # fused_all_gather_matmul should exist + backend.check_after_ops(model.ops_in_model_after()) + + +@create_new_process_for_each_test() +@pytest.mark.parametrize("model_id", ["meta-llama/Llama-3.2-1B-Instruct"]) +@pytest.mark.parametrize("tp_size", [2]) +@pytest.mark.parametrize("async_tp_enabled", [True]) +@pytest.mark.parametrize("distributed_backend", ["mp"]) +@pytest.mark.parametrize("eager_mode", [False, True]) +def test_async_tp_pass_correctness( + model_id: str, + tp_size: int, + async_tp_enabled: bool, + distributed_backend: str, + eager_mode: bool, + num_gpus_available: int, +): + model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id) + model_info.check_transformers_version(on_fail="skip") + model_info.check_available_online(on_fail="skip") + + pp_size = 1 + if num_gpus_available < tp_size: + pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs") + + common_args = [ + "--dtype", + "bfloat16", + "--max-model-len", + "2048", + "--max-num-seqs", + "8", + ] + if eager_mode: + common_args.append("--enforce-eager") + + compilation_config = { + 'level': 3, + 'compile_sizes': [2, 4, 8], + 'splitting_ops': [], + 'pass_config': { + 'enable_async_tp': 
async_tp_enabled + }, + } + + async_tp_env = tp_env = { + "VLLM_USE_V1": "1", + } + + aysnc_tp_args = [ + *common_args, + "--tensor-parallel-size", + str(tp_size), + "--distributed-executor-backend", + distributed_backend, + "--compilation_config", + json.dumps(compilation_config), + ] + + tp_args = [ + *common_args, + "--tensor-parallel-size", + str(tp_size), + "--distributed-executor-backend", + "mp", + ] + + compare_two_settings(model_id, + aysnc_tp_args, + tp_args, + async_tp_env, + tp_env, + method="generate") diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index 4d56b34bdecfb..509593e7328de 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -29,6 +29,10 @@ class TestModel(torch.nn.Module): self.cutlass_fp8_enabled = cutlass_fp8_enabled self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)] self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)] + self.key = QuantKey(dtype=FP8_DTYPE, + static=static, + per_tensor=static, + symmetric=True) if static: self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(2)] else: @@ -59,6 +63,15 @@ class TestModel(torch.nn.Module): y3, resid = self.norm[2](x3, resid) # use resid here return y3 + def ops_in_model_before(self): + return [QUANT_OPS[self.key]] + + def ops_in_model_after(self): + return [ + FUSED_OPS[FusedRMSQuantKey(self.key, False)], + FUSED_OPS[FusedRMSQuantKey(self.key, True)] + ] + @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize("hidden_size", [64, 3392, 4096]) @@ -107,25 +120,10 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static, torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL) - # Check substitution worked - pre_nodes = backend.graph_pre_pass.nodes - post_nodes = backend.graph_post_pass.nodes - - # static is per-tensor, dynamic is per-token - key = QuantKey(dtype=FP8_DTYPE, - static=static, - per_tensor=static, - symmetric=True) - rms_quant = 
FUSED_OPS[FusedRMSQuantKey(key, False)] - add_rms_quant = FUSED_OPS[FusedRMSQuantKey(key, True)] - fp8_quant = QUANT_OPS[key] - # In pre-nodes, fp8 quant should be there and fused kernels should not - assert find_auto_fn_maybe(pre_nodes, rms_quant) is None - assert find_auto_fn_maybe(pre_nodes, add_rms_quant) is None - find_auto_fn(pre_nodes, fp8_quant) + backend.check_before_ops(model.ops_in_model_before(), find_auto_fn, + find_auto_fn_maybe) # In post-nodes, fused kernels should be there and fp8 quant should not - find_auto_fn(post_nodes, rms_quant) - find_auto_fn(post_nodes, add_rms_quant) - assert find_auto_fn_maybe(post_nodes, fp8_quant) is None + backend.check_after_ops(model.ops_in_model_after(), find_auto_fn, + find_auto_fn_maybe) diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/test_sequence_parallelism.py index 6152f171705b1..2cd7ebaacec00 100644 --- a/tests/compile/test_sequence_parallelism.py +++ b/tests/compile/test_sequence_parallelism.py @@ -5,9 +5,7 @@ import torch import vllm.envs as envs from vllm.compilation.fix_functionalization import FixFunctionalizationPass -from vllm.compilation.fx_utils import (find_auto_fn, find_auto_fn_maybe, - find_specified_fn, - find_specified_fn_maybe, is_func) +from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func from vllm.compilation.sequence_parallelism import SequenceParallelismPass from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig, PassConfig, VllmConfig) @@ -21,17 +19,6 @@ from vllm.utils import update_environment_variables from ..utils import multi_gpu_test from .backend import TestBackend -OPS_IN_MODEL_BEFORE = [ - torch.ops.vllm.all_reduce.default, -] - -OPS_IN_MODEL_AFTER = [ - torch.ops.vllm.reduce_scatter.default, - torch.ops.vllm.all_gather.default, -] - -OPS_IN_MODEL = [torch.ops._C.fused_add_rms_norm.default] - prompts = [ "Hello, my name is", "The president of the United States is", @@ -78,6 +65,18 @@ class 
TestModel(torch.nn.Module): return norm_output, residual_output + def ops_in_model_before(self): + return [torch.ops.vllm.all_reduce.default] + + def ops_in_model_after(self): + return [ + torch.ops.vllm.reduce_scatter.default, + torch.ops.vllm.all_gather.default + ] + + def ops_in_model(self): + return [torch.ops._C.fused_add_rms_norm.default] + @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("batch_size", [8]) @@ -156,26 +155,16 @@ def sequence_parallelism_pass_on_test_model(local_rank: int, world_size: int, compiled_model_func = torch.compile(model, backend=backend_func) compiled_model_func(hidden_states, residual) - # Check substitution worked - pre_nodes = backend_no_func.graph_pre_pass.nodes - post_nodes = backend_no_func.graph_post_pass.nodes - # In pre-nodes, all reduce should be there, # reduce scatter and all gather should not - for op in OPS_IN_MODEL_BEFORE: - find_specified_fn(pre_nodes, op) - for op in OPS_IN_MODEL_AFTER: - assert find_specified_fn_maybe(pre_nodes, op) is None + backend_no_func.check_before_ops(model.ops_in_model_before()) # In post-nodes, reduce scatter and all gather should be there, # all reduce should not - for op in OPS_IN_MODEL_AFTER: - find_specified_fn(post_nodes, op) - for op in OPS_IN_MODEL_BEFORE: - assert find_specified_fn_maybe(post_nodes, op) is None + backend_no_func.check_after_ops(model.ops_in_model_after()) # check if the functionalization pass is applied - for op in OPS_IN_MODEL: + for op in model.ops_in_model(): find_auto_fn(backend_no_func.graph_post_pass.nodes, op) assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None # noqa: E501 @@ -183,7 +172,7 @@ def sequence_parallelism_pass_on_test_model(local_rank: int, world_size: int, # make sure the ops were all de-functionalized found = dict() for node in backend_func.graph_post_pass.nodes: - for op in OPS_IN_MODEL: + for op in model.ops_in_model(): if is_func(node, op): found[op] = True - assert all(found[op] for op in OPS_IN_MODEL) + assert 
all(found[op] for op in model.ops_in_model()) diff --git a/tests/conftest.py b/tests/conftest.py index 19c2c62471295..6336c6c2ce011 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -311,6 +311,7 @@ class HfRunner: dtype: str = "auto", *, model_kwargs: Optional[dict[str, Any]] = None, + trust_remote_code: bool = True, is_sentence_transformer: bool = False, is_cross_encoder: bool = False, skip_tokenizer_init: bool = False, @@ -320,10 +321,15 @@ class HfRunner: self.config = AutoConfig.from_pretrained( model_name, - trust_remote_code=True, + trust_remote_code=trust_remote_code, ) self.device = self.get_default_device() - self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype) + self.dtype = torch_dtype = _get_and_verify_dtype( + self.model_name, + self.config, + dtype=dtype, + is_pooling_model=is_sentence_transformer or is_cross_encoder, + ) model_kwargs = model_kwargs if model_kwargs is not None else {} model_kwargs.setdefault("torch_dtype", torch_dtype) @@ -336,7 +342,7 @@ class HfRunner: model_name, device=self.device, model_kwargs=model_kwargs, - trust_remote_code=True, + trust_remote_code=trust_remote_code, ) elif is_cross_encoder: # Lazy init required for AMD CI @@ -346,12 +352,12 @@ class HfRunner: model_name, device=self.device, automodel_args=model_kwargs, - trust_remote_code=True, + trust_remote_code=trust_remote_code, ) else: model = auto_cls.from_pretrained( model_name, - trust_remote_code=True, + trust_remote_code=trust_remote_code, **model_kwargs, ) @@ -372,7 +378,7 @@ class HfRunner: self.tokenizer = AutoTokenizer.from_pretrained( model_name, torch_dtype=torch_dtype, - trust_remote_code=True, + trust_remote_code=trust_remote_code, ) # don't put this import at the top level @@ -381,7 +387,7 @@ class HfRunner: self.processor = AutoProcessor.from_pretrained( model_name, torch_dtype=torch_dtype, - trust_remote_code=True, + trust_remote_code=trust_remote_code, ) if skip_tokenizer_init: self.tokenizer = self.processor.tokenizer diff 
--git a/tests/distributed/test_events.py b/tests/distributed/test_events.py index 15bcfdb8555f3..8de1aa20eabd0 100644 --- a/tests/distributed/test_events.py +++ b/tests/distributed/test_events.py @@ -119,13 +119,12 @@ def test_topic_filtering(publisher_config): """ publisher_config.replay_endpoint = None - cfg = publisher_config.model_copy() - cfg.topic = "foo" - pub = EventPublisherFactory.create(cfg) + publisher_config.topic = "foo" + pub = EventPublisherFactory.create(publisher_config) from .conftest import MockSubscriber - sub_foo = MockSubscriber(cfg.endpoint, None, "foo") - sub_bar = MockSubscriber(cfg.endpoint, None, "bar") + sub_foo = MockSubscriber(publisher_config.endpoint, None, "foo") + sub_bar = MockSubscriber(publisher_config.endpoint, None, "bar") try: time.sleep(0.1) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 5346d67b10d16..e6410ab068d23 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -227,6 +227,7 @@ MULTIMODAL_MODELS = { "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(), "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(), "allenai/Molmo-7B-D-0924": PPTestSettings.fast(), + "AIDC-AI/Ovis2-1B": PPTestSettings.fast(), "microsoft/Phi-3.5-vision-instruct": PPTestSettings.fast(), "mistralai/Pixtral-12B-2409": PPTestSettings.fast(load_format="dummy"), "Qwen/Qwen-VL-Chat": PPTestSettings.fast(), diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py index 711c2441f34bc..f9eacc11d75f8 100644 --- a/tests/distributed/test_shm_broadcast.py +++ b/tests/distributed/test_shm_broadcast.py @@ -9,7 +9,7 @@ import torch.distributed as dist from vllm.distributed.device_communicators.shm_broadcast import MessageQueue from vllm.distributed.utils import StatelessProcessGroup -from vllm.utils import get_ip, get_open_port, update_environment_variables +from vllm.utils import 
get_open_port, update_environment_variables def get_arrays(n: int, seed: int = 0) -> list[np.ndarray]: @@ -60,12 +60,12 @@ def worker_fn(): rank = dist.get_rank() if rank == 0: port = get_open_port() - ip = get_ip() + ip = '127.0.0.1' dist.broadcast_object_list([ip, port], src=0) else: recv = [None, None] dist.broadcast_object_list(recv, src=0) - ip, port = recv + ip, port = recv # type: ignore stateless_pg = StatelessProcessGroup.create(ip, port, rank, dist.get_world_size()) @@ -107,10 +107,10 @@ def worker_fn(): if pg == dist.group.WORLD: dist.barrier() - print("torch distributed passed the test!") + print(f"torch distributed passed the test! Rank {rank}") else: pg.barrier() - print("StatelessProcessGroup passed the test!") + print(f"StatelessProcessGroup passed the test! Rank {rank}") def test_shm_broadcast(): diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index fdbdccd4654c1..dd5d17885eb91 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -1,12 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 import json -import re import weakref from enum import Enum import jsonschema import pytest +import regex as re from pydantic import BaseModel from vllm.distributed import cleanup_dist_env_and_memory diff --git a/tests/entrypoints/llm/test_init.py b/tests/entrypoints/llm/test_init.py deleted file mode 100644 index 925bf56a93402..0000000000000 --- a/tests/entrypoints/llm/test_init.py +++ /dev/null @@ -1,24 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import pytest - -from vllm import LLM - -from ...utils import error_on_warning - -MODEL_NAME = "facebook/opt-125m" - - -def test_pos_args_deprecated(): - with error_on_warning(DeprecationWarning): - LLM(model=MODEL_NAME, tokenizer=MODEL_NAME) - - with error_on_warning(DeprecationWarning): - LLM(MODEL_NAME, tokenizer=MODEL_NAME) - - with pytest.warns(DeprecationWarning, match="'tokenizer'"): - 
LLM(MODEL_NAME, MODEL_NAME) - - with pytest.warns(DeprecationWarning, - match="'tokenizer', 'tokenizer_mode'"): - LLM(MODEL_NAME, MODEL_NAME, "auto") diff --git a/tests/entrypoints/openai/correctness/test_mteb.py b/tests/entrypoints/openai/correctness/test_mteb.py index b702e0acd38bc..44d7ac193760f 100644 --- a/tests/entrypoints/openai/correctness/test_mteb.py +++ b/tests/entrypoints/openai/correctness/test_mteb.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 -import math import os import pytest from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS, + MTEB_EMBED_TOL, OpenAIClientMtebEncoder, run_mteb_embed_task, run_mteb_embed_task_st) @@ -39,4 +39,4 @@ def test_mteb(server): print("SentenceTransformer main score: ", st_main_score) print("Difference: ", st_main_score - vllm_main_score) - assert math.isclose(st_main_score, vllm_main_score, rel_tol=1e-4) + assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL) diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index a10b42ea3a4b5..2509ef0d280a2 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -2,13 +2,13 @@ # imports for guided decoding tests import json -import re from typing import Optional import jsonschema import openai # use the official client for correctness check import pytest import pytest_asyncio +import regex as re import requests import torch from openai import BadRequestError, OpenAI diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index 1d9aa4972b708..9d12f27a2b879 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 - # imports for guided decoding tests import json -import re import shutil from tempfile import TemporaryDirectory from typing import Optional @@ -11,6 +9,7 @@ import jsonschema import openai # 
use the official client for correctness check import pytest import pytest_asyncio +import regex as re # downloading lora to test lora requests from huggingface_hub import snapshot_download from openai import BadRequestError diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 1019bfd589362..81ca65b6541a8 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -11,7 +11,8 @@ import requests from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.transformers_utils.tokenizer import get_tokenizer -from ...models.utils import run_embedding_correctness_test +from ...models.language.pooling.embed_utils import ( + run_embedding_correctness_test) from ...utils import RemoteOpenAIServer MODEL_NAME = "intfloat/multilingual-e5-small" diff --git a/tests/entrypoints/openai/test_embedding_dimensions.py b/tests/entrypoints/openai/test_embedding_dimensions.py index 332fa332a4a41..341defae0b315 100644 --- a/tests/entrypoints/openai/test_embedding_dimensions.py +++ b/tests/entrypoints/openai/test_embedding_dimensions.py @@ -11,7 +11,9 @@ import pytest from vllm.entrypoints.openai.protocol import EmbeddingResponse from ...conftest import HfRunner -from ...models.utils import EmbedModelInfo, run_embedding_correctness_test +from ...models.language.pooling.embed_utils import ( + run_embedding_correctness_test) +from ...models.utils import EmbedModelInfo from ...utils import RemoteOpenAIServer MODELS = [ diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py index 2fc08b47513e6..cd07ca46ca651 100644 --- a/tests/entrypoints/openai/test_lora_adapters.py +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -313,3 +313,37 @@ async def test_loading_invalid_adapters_does_not_break_others( prompt=["Hello there", "Foo bar bazz buzz"], max_tokens=5, ) + + +@pytest.mark.asyncio +async def 
test_beam_search_with_lora_adapters( + client: openai.AsyncOpenAI, + tmp_path, + zephyr_lora_files, +): + """Validate that async beam search can be used with lora.""" + + async def load_and_run_adapter(adapter_name: str): + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": adapter_name, + "lora_path": str(zephyr_lora_files) + }) + for _ in range(3): + await client.completions.create( + model=adapter_name, + prompt=["Hello there", "Foo bar bazz buzz"], + max_tokens=5, + extra_body=dict(use_beam_search=True), + ) + + lora_tasks = [] + for i in range(3): + lora_tasks.append( + asyncio.create_task(load_and_run_adapter(f"adapter_{i}"))) + + results, _ = await asyncio.wait(lora_tasks) + + for r in results: + assert not isinstance(r, Exception), f"Got exception {r}" diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 42f7b098f917d..b21c0173c7b86 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -171,10 +171,8 @@ async def test_metrics_counts(server: RemoteOpenAIServer, EXPECTED_METRICS = [ "vllm:num_requests_running", - "vllm:num_requests_swapped", # deprecated "vllm:num_requests_waiting", "vllm:gpu_cache_usage_perc", - "vllm:cpu_cache_usage_perc", # deprecated "vllm:time_to_first_token_seconds_sum", "vllm:time_to_first_token_seconds_bucket", "vllm:time_to_first_token_seconds_count", @@ -274,10 +272,7 @@ EXPECTED_METRICS_V1 = [ "vllm:request_decode_time_seconds_count", ] -HIDDEN_DEPRECATED_METRICS = [ - "vllm:num_requests_swapped", - "vllm:cpu_cache_usage_perc", -] +HIDDEN_DEPRECATED_METRICS: list[str] = [] @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py index 5c585d54c429b..cae2a3b59553d 100644 --- a/tests/entrypoints/openai/test_openai_schema.py +++ b/tests/entrypoints/openai/test_openai_schema.py @@ -1,6 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 +from 
typing import Final + import pytest import schemathesis +from hypothesis import settings from schemathesis import GenerationConfig from ...utils import RemoteOpenAIServer @@ -9,6 +12,8 @@ schemathesis.experimental.OPEN_API_3_1.enable() MODEL_NAME = "HuggingFaceTB/SmolVLM-256M-Instruct" MAXIMUM_IMAGES = 2 +DEFAULT_TIMEOUT_SECONDS: Final[int] = 10 +LONG_TIMEOUT_SECONDS: Final[int] = 60 @pytest.fixture(scope="module") @@ -42,8 +47,58 @@ def get_schema(server): schema = schemathesis.from_pytest_fixture("get_schema") +@schemathesis.hook +def before_generate_case(context: schemathesis.hooks.HookContext, strategy): + op = context.operation + assert op is not None + + def no_file_type(case: schemathesis.models.Case): + """ + This filter skips test cases for the `POST /tokenize` endpoint where the + HTTP request body uses `"type": "file"` in any message's content. + We expect these cases to fail because that type isn't implemented here + https://github.com/vllm-project/vllm/blob/0b34593017953051b3225b1483ce0f4670e3eb0e/vllm/entrypoints/chat_utils.py#L1038-L1095 + + Example test cases that are skipped: + curl -X POST -H 'Content-Type: application/json' \ + -d '{"messages": [{"role": "assistant"}, {"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \ + http://localhost:8000/tokenize + + curl -X POST -H 'Content-Type: application/json' \ + -d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \ + http://localhost:8000/tokenize + """ # noqa: E501 + if (op.method.lower() == "post" and op.path == "/tokenize" + and hasattr(case, "body") and isinstance(case.body, dict) + and "messages" in case.body + and isinstance(case.body["messages"], list) + and len(case.body["messages"]) > 0): + for message in case.body["messages"]: + if not isinstance(message, dict): + continue + content = message.get("content", []) + if not isinstance(content, list) or len(content) == 0: + continue + if any(item.get("type") == "file" for item in content): + return False 
+ return True + + return strategy.filter(no_file_type) + + @schema.parametrize() @schema.override(headers={"Content-Type": "application/json"}) +@settings(deadline=LONG_TIMEOUT_SECONDS * 1000) def test_openapi_stateless(case: schemathesis.Case): + key = ( + case.operation.method.upper(), + case.operation.path, + ) + timeout = { + # requires a longer timeout + ("POST", "/v1/chat/completions"): + LONG_TIMEOUT_SECONDS, + }.get(key, DEFAULT_TIMEOUT_SECONDS) + #No need to verify SSL certificate for localhost - case.call_and_validate(verify=False) + case.call_and_validate(verify=False, timeout=timeout) diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index f889189a99681..e384915899d3d 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -1,10 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # imports for guided decoding tests -import re - import openai import pytest +import regex as re from ...utils import RemoteOpenAIServer @@ -32,7 +31,7 @@ async def test_out_of_vocab_token_ids(): client = remote_server.get_async_client() with pytest.raises(openai.BadRequestError, - match=re.compile('.*out of vocabulary.*')): + match=re.compile('.*out of vocabulary.*').pattern): await client.completions.create(model=model_name, prompt=[999999], max_tokens=5, @@ -46,9 +45,10 @@ async def test_reject_multistep_with_guided_decoding(): with RemoteOpenAIServer(model_name, server_args) as remote_server: client = remote_server.get_async_client() - with pytest.raises(openai.BadRequestError, - match=re.compile( - '.*Guided decoding .* multi-step decoding.*')): + with pytest.raises( + openai.BadRequestError, + match=re.compile( + '.*Guided decoding .* multi-step decoding.*').pattern): await client.completions.create( model=model_name, prompt="Hello", diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py index 
643d0d06abcb8..99639ce51aa74 100644 --- a/tests/entrypoints/openai/test_run_batch.py +++ b/tests/entrypoints/openai/test_run_batch.py @@ -2,9 +2,10 @@ import json import subprocess -import sys import tempfile +import pytest + from vllm.entrypoints.openai.protocol import BatchRequestOutput # ruff: noqa: E501 @@ -24,9 +25,13 @@ INPUT_EMBEDDING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": " {"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "Hello world!"}} {"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}""" -INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} +INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} {"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}""" +INPUT_RERANK_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} +{"custom_id": "request-2", 
"method": "POST", "url": "/v2/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}""" + def test_empty_file(): with tempfile.NamedTemporaryFile( @@ -35,9 +40,8 @@ def test_empty_file(): input_file.write("") input_file.flush() proc = subprocess.Popen([ - sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i", - input_file.name, "-o", output_file.name, "--model", - "intfloat/multilingual-e5-small" + "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name, + "--model", "intfloat/multilingual-e5-small" ], ) proc.communicate() proc.wait() @@ -54,9 +58,8 @@ def test_completions(): input_file.write(INPUT_BATCH) input_file.flush() proc = subprocess.Popen([ - sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i", - input_file.name, "-o", output_file.name, "--model", - "NousResearch/Meta-Llama-3-8B-Instruct" + "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name, + "--model", "NousResearch/Meta-Llama-3-8B-Instruct" ], ) proc.communicate() proc.wait() @@ -79,9 +82,8 @@ def test_completions_invalid_input(): input_file.write(INVALID_INPUT_BATCH) input_file.flush() proc = subprocess.Popen([ - sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i", - input_file.name, "-o", output_file.name, "--model", - "NousResearch/Meta-Llama-3-8B-Instruct" + "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name, + "--model", "NousResearch/Meta-Llama-3-8B-Instruct" ], ) proc.communicate() proc.wait() @@ -95,9 +97,8 @@ def test_embeddings(): input_file.write(INPUT_EMBEDDING_BATCH) input_file.flush() proc = subprocess.Popen([ - sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i", - input_file.name, "-o", output_file.name, "--model", - "intfloat/multilingual-e5-small" + "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name, + "--model", "intfloat/multilingual-e5-small" ], ) 
proc.communicate() proc.wait() @@ -110,16 +111,17 @@ def test_embeddings(): BatchRequestOutput.model_validate_json(line) -def test_score(): +@pytest.mark.parametrize("input_batch", + [INPUT_SCORE_BATCH, INPUT_RERANK_BATCH]) +def test_score(input_batch): with tempfile.NamedTemporaryFile( "w") as input_file, tempfile.NamedTemporaryFile( "r") as output_file: - input_file.write(INPUT_SCORE_BATCH) + input_file.write(input_batch) input_file.flush() proc = subprocess.Popen([ - sys.executable, - "-m", - "vllm.entrypoints.openai.run_batch", + "vllm", + "run-batch", "-i", input_file.name, "-o", diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index b756680ea9f27..b373f29127524 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -1,6 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 - -import math from typing import Any import pytest @@ -92,7 +90,7 @@ class TestModel: hf_outputs = run_transformers(runner, model, text_pairs) for i in range(len(vllm_outputs)): - assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01) + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) def test_text_1_list_text_2_list(self, server: RemoteOpenAIServer, model: dict[str, Any], runner): @@ -124,7 +122,7 @@ class TestModel: hf_outputs = run_transformers(runner, model, text_pairs) for i in range(len(vllm_outputs)): - assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01) + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) def test_text_1_str_text_2_str(self, server: RemoteOpenAIServer, model: dict[str, Any], runner): @@ -150,7 +148,7 @@ class TestModel: hf_outputs = run_transformers(runner, model, text_pairs) for i in range(len(vllm_outputs)): - assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01) + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) def test_score_max_model_len(self, server: RemoteOpenAIServer, model: dict[str, Any]): 
diff --git a/tests/entrypoints/openai/test_tensorizer_entrypoint.py b/tests/entrypoints/openai/test_tensorizer_entrypoint.py new file mode 100644 index 0000000000000..f1ab7223048db --- /dev/null +++ b/tests/entrypoints/openai/test_tensorizer_entrypoint.py @@ -0,0 +1,97 @@ +# SPDX-License-Identifier: Apache-2.0 +import gc +import json +import tempfile + +import openai +import pytest +import pytest_asyncio +import torch.cuda + +from vllm.engine.arg_utils import EngineArgs +from vllm.model_executor.model_loader.tensorizer import ( + TensorizerConfig, tensorize_lora_adapter, tensorize_vllm_model) + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "unsloth/llama-3.2-1b-Instruct" +LORA_PATH = "davzoku/finqa_adapter_1b" + + +def _cleanup(): + gc.collect() + torch.cuda.empty_cache() + + +@pytest.fixture(autouse=True) +def cleanup(): + _cleanup() + + +@pytest.fixture(scope='module') +def tmp_dir(): + with tempfile.TemporaryDirectory() as path: + yield path + + +@pytest.fixture(scope='module') +def model_uri(tmp_dir): + yield f"{tmp_dir}/model.tensors" + + +@pytest.fixture(scope="module") +def tensorize_model_and_lora(tmp_dir, model_uri): + tensorizer_config = TensorizerConfig(tensorizer_uri=model_uri, + lora_dir=tmp_dir) + args = EngineArgs(model=MODEL_NAME, device="cuda") + + tensorize_lora_adapter(LORA_PATH, tensorizer_config) + tensorize_vllm_model(args, tensorizer_config) + + # Manually invoke a _cleanup() here, as the cleanup() + # fixture won't be guaranteed to be called after this + # when this fixture is used for a test + _cleanup() + yield + + +@pytest.fixture(scope="module") +def server(model_uri, tensorize_model_and_lora): + model_loader_extra_config = { + "tensorizer_uri": model_uri, + } + + ## Start OpenAI API server + args = [ + "--load-format", "tensorizer", "--device", "cuda", + "--model-loader-extra-config", + json.dumps(model_loader_extra_config), "--enable-lora" + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield 
remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): + _cleanup() + completion = await client.completions.create(model=model_name, + prompt="Hello, my name is", + max_tokens=5, + temperature=0.0) + + assert completion.id is not None + assert completion.choices is not None and len(completion.choices) == 1 + assert completion.model == MODEL_NAME + assert len(completion.choices) == 1 + assert len(completion.choices[0].text) >= 5 + assert completion.choices[0].finish_reason == "length" + assert completion.usage == openai.types.CompletionUsage( + completion_tokens=5, prompt_tokens=6, total_tokens=11) diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py index 9773f3e45b99c..7d823542e3744 100644 --- a/tests/entrypoints/openai/test_tokenization.py +++ b/tests/entrypoints/openai/test_tokenization.py @@ -76,11 +76,11 @@ async def test_tokenize_completions( }) response.raise_for_status() - assert response.json() == { - "tokens": tokens, - "count": len(tokens), - "max_model_len": 8192 - } + result = response.json() + assert result["tokens"] == tokens + assert result["count"] == len(tokens) + assert result["max_model_len"] == 8192 + assert result["token_strs"] is None @pytest.mark.asyncio @@ -138,11 +138,11 @@ async def test_tokenize_chat( }) response.raise_for_status() - assert response.json() == { - "tokens": tokens, - "count": len(tokens), - "max_model_len": 8192 - } + result = response.json() + assert result["tokens"] == tokens + assert result["count"] == len(tokens) + assert result["max_model_len"] == 8192 + assert result["token_strs"] is None @pytest.mark.asyncio @@ -215,11 +215,46 @@ async def test_tokenize_chat_with_tools( ) response.raise_for_status() - assert 
response.json() == { - "tokens": tokens, - "count": len(tokens), - "max_model_len": 8192, - } + result = response.json() + assert result["tokens"] == tokens + assert result["count"] == len(tokens) + assert result["max_model_len"] == 8192 + assert result["token_strs"] is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name, tokenizer_name", + [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], + indirect=["tokenizer_name"], +) +async def test_tokenize_with_return_token_strs( + server: RemoteOpenAIServer, + model_name: str, + tokenizer_name: str, +): + tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, + tokenizer_mode="fast") + + prompt = "This is a token_strs test prompt! vllm1" + response = requests.post( + server.url_for("tokenize"), + json={ + "prompt": prompt, + "model": model_name, + "return_token_strs": True + }, + ) + response.raise_for_status() + + tokens = tokenizer.encode(prompt, add_special_tokens=True) + tokens_str = tokenizer.convert_ids_to_tokens(tokens) + + result = response.json() + assert result["tokens"] == tokens + assert result["count"] == len(tokens) + assert result["max_model_len"] == 8192 + assert result["token_strs"] == tokens_str @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py new file mode 100644 index 0000000000000..f5f327ea068c6 --- /dev/null +++ b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py @@ -0,0 +1,217 @@ +# SPDX-License-Identifier: Apache-2.0 + +from unittest.mock import MagicMock, patch + +import pytest + +from tests.entrypoints.openai.tool_parsers.utils import ( + run_tool_extraction, run_tool_extraction_streaming) +from vllm.entrypoints.openai.protocol import FunctionCall +from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager + +# Test cases similar to pythonic parser but with Llama4 specific format 
+SIMPLE_FUNCTION_OUTPUT = "[get_weather(city='LA', metric='C')]" +SIMPLE_FUNCTION_CALL = FunctionCall( + name="get_weather", + arguments='{"city": "LA", "metric": "C"}', +) +MORE_TYPES_FUNCTION_OUTPUT = ("[register_user(name='Doe', " + "age=9, " + "address={'city': 'LA', 'state': 'CA'}, " + "role=None, " + "passed_test=True, " + "aliases=['John', 'Johnny'])]") +MORE_TYPES_FUNCTION_CALL = FunctionCall( + name="register_user", + arguments='{"name": "Doe", ' + '"age": 9, ' + '"address": {"city": "LA", "state": "CA"}, ' + '"role": null, ' + '"passed_test": true, ' + '"aliases": ["John", "Johnny"]}', +) +PARAMETERLESS_FUNCTION_OUTPUT = "[get_weather()]" +PARAMETERLESS_FUNCTION_CALL = FunctionCall( + name="get_weather", + arguments='{}', +) +EMPTY_DICT_FUNCTION_OUTPUT = "[do_something_cool(additional_data={})]" +EMPTY_DICT_FUNCTION_CALL = FunctionCall( + name="do_something_cool", + arguments='{"additional_data": {}}', +) +EMPTY_LIST_FUNCTION_OUTPUT = "[do_something_cool(steps=[])]" +EMPTY_LIST_FUNCTION_CALL = FunctionCall( + name="do_something_cool", + arguments='{"steps": []}', +) +ESCAPED_STRING_FUNCTION_OUTPUT = ( + r"[get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')]") +ESCAPED_STRING_FUNCTION_CALL = FunctionCall( + name="get_weather", + arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}', +) +PYTHON_TAG_FUNCTION_OUTPUT = ( + "<|python_start|>[get_weather(city='LA', metric='C')]<|python_end|>") + + +@pytest.mark.parametrize("streaming", [True, False]) +def test_no_tool_call(streaming: bool): + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser( + "llama4_pythonic")(mock_tokenizer) + model_output = "How can I help you today?" 
+ + content, tool_calls = run_tool_extraction(tool_parser, + model_output, + streaming=streaming) + + assert content == model_output + assert len(tool_calls) == 0 + + +test_str = "<|python_start|>" +test_str += "[get_weather(city='LA', metric='C')," +test_str += "register_user(name='Doe', age=9)]" +TEST_CASES = [ + pytest.param(True, + ESCAPED_STRING_FUNCTION_OUTPUT, + [ESCAPED_STRING_FUNCTION_CALL], + id="simple_streaming"), + pytest.param(False, + SIMPLE_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL], + id="simple_nonstreaming"), + pytest.param(True, + MORE_TYPES_FUNCTION_OUTPUT, [MORE_TYPES_FUNCTION_CALL], + id="more_types_streaming"), + pytest.param(False, + MORE_TYPES_FUNCTION_OUTPUT, [MORE_TYPES_FUNCTION_CALL], + id="more_types_nonstreaming"), + pytest.param(True, + PARAMETERLESS_FUNCTION_OUTPUT, [PARAMETERLESS_FUNCTION_CALL], + id="parameterless_streaming"), + pytest.param(False, + PARAMETERLESS_FUNCTION_OUTPUT, [PARAMETERLESS_FUNCTION_CALL], + id="parameterless_nonstreaming"), + pytest.param(True, + EMPTY_DICT_FUNCTION_OUTPUT, [EMPTY_DICT_FUNCTION_CALL], + id="empty_dict_streaming"), + pytest.param(False, + EMPTY_DICT_FUNCTION_OUTPUT, [EMPTY_DICT_FUNCTION_CALL], + id="empty_dict_nonstreaming"), + pytest.param(True, + EMPTY_LIST_FUNCTION_OUTPUT, [EMPTY_LIST_FUNCTION_CALL], + id="empty_list_streaming"), + pytest.param(False, + EMPTY_LIST_FUNCTION_OUTPUT, [EMPTY_LIST_FUNCTION_CALL], + id="empty_list_nonstreaming"), + pytest.param(True, + ESCAPED_STRING_FUNCTION_OUTPUT, + [ESCAPED_STRING_FUNCTION_CALL], + id="escaped_string_streaming"), + pytest.param(False, + ESCAPED_STRING_FUNCTION_OUTPUT, + [ESCAPED_STRING_FUNCTION_CALL], + id="escaped_string_nonstreaming"), + pytest.param( + True, + "[get_weather(city='LA',metric='C'),register_user(name='Doe',age=9)]", + [ + SIMPLE_FUNCTION_CALL, + FunctionCall(name="register_user", + arguments='{"name": "Doe", "age": 9}') + ], + id="parallel_calls_streaming"), + pytest.param( + False, + 
"[get_weather(city='LA',metric='C'),register_user(name='Doe',age=9)]", + [ + SIMPLE_FUNCTION_CALL, + FunctionCall(name="register_user", + arguments='{"name": "Doe", "age": 9}') + ], + id="parallel_calls_nonstreaming"), + pytest.param(True, + PYTHON_TAG_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL], + id="python_tag_streaming"), + pytest.param(False, + PYTHON_TAG_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL], + id="python_tag_nonstreaming"), + pytest.param(True, + test_str, [ + SIMPLE_FUNCTION_CALL, + FunctionCall(name="register_user", + arguments='{"name": "Doe", "age": 9}') + ], + id="parallel_calls_streaming"), + pytest.param(False, + "<|python_start|>[get_weather(city='LA', metric='C'), " + + "register_user(name='Doe', age=9)]", [ + SIMPLE_FUNCTION_CALL, + FunctionCall(name="register_user", + arguments='{"name": "Doe", "age": 9}') + ], + id="parallel_calls_nonstreaming"), +] + + +@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", + TEST_CASES) +def test_tool_call(streaming: bool, model_output: str, + expected_tool_calls: list[FunctionCall]): + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser( + "llama4_pythonic")(mock_tokenizer) + + content, tool_calls = run_tool_extraction(tool_parser, + model_output, + streaming=streaming) + + assert len(tool_calls) == len(expected_tool_calls) + for actual, expected in zip(tool_calls, expected_tool_calls): + assert actual.type == "function" + assert actual.function == expected + + +def test_streaming_tool_call_with_large_steps(): + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser( + "llama4_pythonic")(mock_tokenizer) + model_output_deltas = [ + "<|python_start|>[get_weather(city='LA', metric='C'), " + "get_weather(), " + "do_something_cool(steps=[])]<|python_end|>", + ] + + reconstructor = run_tool_extraction_streaming( + tool_parser, model_output_deltas, assert_one_tool_per_delta=False) + + assert reconstructor.other_content == "" + 
assert len(reconstructor.tool_calls) == 3 + assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL + assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL + assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL + + +@pytest.mark.parametrize("streaming", [False]) +def test_regex_timeout_handling(streaming: bool): + """test regex timeout is handled gracefully""" + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser( + "llama4_pythonic")(mock_tokenizer) + + fake_problematic_input = "hello world[A(A=" + "\t)A(A=,\t" * 2 + + # create a mock regex that raises TimeoutError + mock_regex = MagicMock() + mock_regex.match.side_effect = TimeoutError("Regex timeout") + + with patch.object(tool_parser, 'TOOL_CALL_REGEX', mock_regex): + content, tool_calls = run_tool_extraction(tool_parser, + fake_problematic_input, + streaming=streaming) + + # should treat as regular text when regex times out + assert content == fake_problematic_input + assert len(tool_calls) == 0 + mock_regex.match.assert_called_once() diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py index fbbbc1fb2a596..71f41ea7d93b4 100644 --- a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pytest @@ -159,3 +159,27 @@ def test_streaming_tool_call_with_large_steps(): assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL + + +@pytest.mark.parametrize("streaming", [False]) +def test_regex_timeout_handling(streaming: bool): + """test regex timeout is 
handled gracefully""" + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser( + "llama4_pythonic")(mock_tokenizer) + + fake_problematic_input = "hello world[A(A=" + "\t)A(A=,\t" * 2 + + # create a mock regex that raises TimeoutError + mock_regex = MagicMock() + mock_regex.match.side_effect = TimeoutError("Regex timeout") + + with patch.object(tool_parser, 'TOOL_CALL_REGEX', mock_regex): + content, tool_calls = run_tool_extraction(tool_parser, + fake_problematic_input, + streaming=streaming) + + # should treat as regular text when regex times out + assert content == fake_problematic_input + assert len(tool_calls) == 0 + mock_regex.match.assert_called_once() diff --git a/tests/entrypoints/test_api_server_process_manager.py b/tests/entrypoints/test_api_server_process_manager.py new file mode 100644 index 0000000000000..0dd1fdd996948 --- /dev/null +++ b/tests/entrypoints/test_api_server_process_manager.py @@ -0,0 +1,268 @@ +# SPDX-License-Identifier: Apache-2.0 + +import multiprocessing +import socket +import threading +import time +from typing import Optional +from unittest.mock import patch + +import pytest + +from vllm.v1.utils import (APIServerProcessManager, + wait_for_completion_or_failure) + +# Global variables to control worker behavior +WORKER_RUNTIME_SECONDS = 0.5 + + +# Mock implementation of run_api_server_worker +def mock_run_api_server_worker(listen_address, sock, args, client_config=None): + """Mock run_api_server_worker that runs for a specific time.""" + print(f"Mock worker started with client_config: {client_config}") + time.sleep(WORKER_RUNTIME_SECONDS) + print("Mock worker completed successfully") + + +@pytest.fixture +def api_server_args(): + """Fixture to provide arguments for APIServerProcessManager.""" + sock = socket.socket() + return { + "target_server_fn": + mock_run_api_server_worker, + "listen_address": + "localhost:8000", + "sock": + sock, + "args": + "test_args", # Simple string to avoid pickling 
issues + "num_servers": + 3, + "input_addresses": [ + "tcp://127.0.0.1:5001", "tcp://127.0.0.1:5002", + "tcp://127.0.0.1:5003" + ], + "output_addresses": [ + "tcp://127.0.0.1:6001", "tcp://127.0.0.1:6002", + "tcp://127.0.0.1:6003" + ], + "stats_update_address": + "tcp://127.0.0.1:7000", + } + + +@pytest.mark.parametrize("with_stats_update", [True, False]) +def test_api_server_process_manager_init(api_server_args, with_stats_update): + """Test initializing the APIServerProcessManager.""" + # Set the worker runtime to ensure tests complete in reasonable time + global WORKER_RUNTIME_SECONDS + WORKER_RUNTIME_SECONDS = 0.5 + + # Copy the args to avoid mutating the + args = api_server_args.copy() + + if not with_stats_update: + args.pop("stats_update_address") + manager = APIServerProcessManager(**args) + + try: + # Verify the manager was initialized correctly + assert len(manager.processes) == 3 + + # Verify all processes are running + for proc in manager.processes: + assert proc.is_alive() + + print("Waiting for processes to run...") + time.sleep(WORKER_RUNTIME_SECONDS / 2) + + # They should still be alive at this point + for proc in manager.processes: + assert proc.is_alive() + + finally: + # Always clean up the processes + print("Cleaning up processes...") + manager.close() + + # Give processes time to terminate + time.sleep(0.2) + + # Verify all processes were terminated + for proc in manager.processes: + assert not proc.is_alive() + + +@patch("vllm.entrypoints.cli.serve.run_api_server_worker", + mock_run_api_server_worker) +def test_wait_for_completion_or_failure(api_server_args): + """Test that wait_for_completion_or_failure works with failures.""" + global WORKER_RUNTIME_SECONDS + WORKER_RUNTIME_SECONDS = 1.0 + + # Create the manager + manager = APIServerProcessManager(**api_server_args) + + try: + assert len(manager.processes) == 3 + + # Create a result capture for the thread + result: dict[str, Optional[Exception]] = {"exception": None} + + def 
run_with_exception_capture(): + try: + wait_for_completion_or_failure(api_server_manager=manager) + except Exception as e: + result["exception"] = e + + # Start a thread to run wait_for_completion_or_failure + wait_thread = threading.Thread(target=run_with_exception_capture, + daemon=True) + wait_thread.start() + + # Let all processes run for a short time + time.sleep(0.2) + + # All processes should still be running + assert all(proc.is_alive() for proc in manager.processes) + + # Now simulate a process failure + print("Simulating process failure...") + manager.processes[0].terminate() + + # Wait for the wait_for_completion_or_failure + # to detect and handle the failure + # This should trigger it to terminate all other processes + wait_thread.join(timeout=1.0) + + # The wait thread should have exited + assert not wait_thread.is_alive() + + # Verify that an exception was raised with appropriate error message + assert result["exception"] is not None + assert "died with exit code" in str(result["exception"]) + + # All processes should now be terminated + for i, proc in enumerate(manager.processes): + assert not proc.is_alive(), f"Process {i} should not be alive" + + finally: + manager.close() + time.sleep(0.2) + + +@pytest.mark.timeout(30) +def test_normal_completion(api_server_args): + """Test that wait_for_completion_or_failure works in normal completion.""" + global WORKER_RUNTIME_SECONDS + WORKER_RUNTIME_SECONDS = 0.1 + + # Create the manager + manager = APIServerProcessManager(**api_server_args) + + try: + # Give processes time to terminate + # wait for processes to complete + remaining_processes = manager.processes.copy() + while remaining_processes: + for proc in remaining_processes: + if not proc.is_alive(): + remaining_processes.remove(proc) + time.sleep(0.1) + + # Verify all processes have terminated + for i, proc in enumerate(manager.processes): + assert not proc.is_alive( + ), f"Process {i} still alive after terminate()" + + # Now call 
wait_for_completion_or_failure + # since all processes have already + # terminated, it should return immediately + # with no error + wait_for_completion_or_failure(api_server_manager=manager) + + finally: + # Clean up just in case + manager.close() + time.sleep(0.2) + + +@pytest.mark.timeout(30) +def test_external_process_monitoring(api_server_args): + """Test that wait_for_completion_or_failure handles additional processes.""" + global WORKER_RUNTIME_SECONDS + WORKER_RUNTIME_SECONDS = 100 + + # Create and start the external process + # (simulates local_engine_manager or coordinator) + spawn_context = multiprocessing.get_context("spawn") + external_proc = spawn_context.Process(target=mock_run_api_server_worker, + name="MockExternalProcess") + external_proc.start() + + # Create the class to simulate a coordinator + class MockCoordinator: + + def __init__(self, proc): + self.proc = proc + + def close(self): + if self.proc.is_alive(): + self.proc.terminate() + self.proc.join(timeout=0.5) + + # Create a mock coordinator with the external process + mock_coordinator = MockCoordinator(external_proc) + + # Create the API server manager + manager = APIServerProcessManager(**api_server_args) + + try: + # Verify manager initialization + assert len(manager.processes) == 3 + + # Create a result capture for the thread + result: dict[str, Optional[Exception]] = {"exception": None} + + def run_with_exception_capture(): + try: + wait_for_completion_or_failure(api_server_manager=manager, + coordinator=mock_coordinator) + except Exception as e: + result["exception"] = e + + # Start a thread to run wait_for_completion_or_failure + wait_thread = threading.Thread(target=run_with_exception_capture, + daemon=True) + wait_thread.start() + + # Terminate the external process to trigger a failure + time.sleep(0.2) + external_proc.terminate() + + # Wait for the thread to detect the failure + wait_thread.join(timeout=1.0) + + # The wait thread should have completed + assert not 
wait_thread.is_alive( + ), "wait_for_completion_or_failure thread still running" + + # Verify that an exception was raised with appropriate error message + assert result["exception"] is not None, "No exception was raised" + error_message = str(result["exception"]) + assert "died with exit code" in error_message, \ + f"Unexpected error message: {error_message}" + assert "MockExternalProcess" in error_message, \ + f"Error doesn't mention external process: {error_message}" + + # Verify that all API server processes were terminated as a result + for i, proc in enumerate(manager.processes): + assert not proc.is_alive( + ), f"API server process {i} was not terminated" + + finally: + # Clean up + manager.close() + mock_coordinator.close() + time.sleep(0.2) diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index e5650136f2584..d9f956fbc7c00 100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -148,6 +148,11 @@ def test_paged_attention( or (version == "rocm" and head_size not in (64, 128))): pytest.skip() + if (version == "rocm" and current_platform.is_navi() + and (kv_cache_dtype == "fp8" or head_size != 128 + or block_size != 16 or use_alibi)): + pytest.skip() + global PARTITION_SIZE current_platform.seed_everything(seed) @@ -275,6 +280,7 @@ def test_paged_attention( scale, block_tables, seq_lens, + None, block_size, max_seq_len, alibi_slopes, @@ -286,7 +292,7 @@ def test_paged_attention( opcheck(torch.ops._rocm_C.paged_attention, (output, exp_sums, max_logits, tmp_output, query, key_cache, value_cache, num_kv_heads, scale, block_tables, - seq_lens, block_size, max_seq_len, alibi_slopes, + seq_lens, None, block_size, max_seq_len, alibi_slopes, kv_cache_dtype, k_scale, v_scale), cond=(head_size == HEAD_SIZES[0] and block_size == BLOCK_SIZES[0])) diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py 
index 4e15d00255a4f..be3d1879de24b 100644 --- a/tests/kernels/attention/test_triton_unified_attention.py +++ b/tests/kernels/attention/test_triton_unified_attention.py @@ -13,7 +13,9 @@ HEAD_SIZES = [128, 256] BLOCK_SIZES = [16, 32] DTYPES = [torch.float16, torch.bfloat16] -QDTYPES = [None, torch.float8_e4m3fn] +QDTYPES = [None, torch.float8_e4m3fn] if not current_platform.is_rocm() else [ + None, torch.float8_e4m3fnuz +] # one value large enough to test overflow in index calculation. # one value small enough to test the schema op check NUM_BLOCKS = [32768, 2048] diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index f327deb0e549e..8cb56314cf94a 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -70,7 +70,7 @@ def test_rotary_embedding( device: str, use_key: bool, max_position: int = 8192, - base: int = 10000, + base: float = 10000, ) -> None: if rotary_dim is None: rotary_dim = head_size @@ -135,7 +135,7 @@ def test_batched_rotary_embedding( device: str, use_key: bool, max_position: int = 8192, - base: int = 10000, + base: float = 10000, ) -> None: current_platform.seed_everything(seed) torch.set_default_device(device) @@ -203,7 +203,7 @@ def test_batched_rotary_embedding_multi_lora( device: str, use_key: bool, max_position: int = 8192, - base: int = 10000, + base: float = 10000, ) -> None: current_platform.seed_everything(seed) torch.set_default_device(device) diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 43ddc79fcb818..299279390fe0c 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -575,3 +575,21 @@ def test_moe_align_block_size_opcheck(): opcheck(torch.ops._moe_C.moe_align_block_size, (topk_ids, num_experts, block_size, sorted_ids, expert_ids, num_tokens_post_pad)) + + +@pytest.mark.parametrize("m", [1, 33, 64, 222]) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("k", [128, 511, 
1024]) +@pytest.mark.parametrize("dtype", + [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm") +def test_moe_sum(m: int, topk: int, k: int, dtype: torch.dtype): + input = torch.randn((m, topk, k), device="cuda", dtype=dtype) + actual = torch.empty((m, k), device="cuda", dtype=dtype) + + expected = input.sum(dim=1) + torch.ops._moe_C.moe_sum(input, actual) + + torch.testing.assert_close(actual, expected, atol=2e-2, rtol=0) + + opcheck(torch.ops._moe_C.moe_sum, (input, actual)) diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py index dfcd61f775870..10e6ac64df877 100644 --- a/tests/kernels/moe/test_moe_permute_unpermute.py +++ b/tests/kernels/moe/test_moe_permute_unpermute.py @@ -13,7 +13,7 @@ import torch from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe.layer import determine_expert_map from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( - moe_permute, moe_unpermute) + moe_permute, moe_permute_unpermute_supported, moe_unpermute) from vllm.platforms import current_platform NUM_EXPERTS = [16, 64] @@ -167,6 +167,8 @@ def torch_unpermute(permuted_hidden_states: torch.Tensor, def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int, n_expert: int, ep_size: int, dtype: torch.dtype, align_block_size: Optional[int]): + if not moe_permute_unpermute_supported(): + pytest.skip("moe_permute_unpermute is not supported on this platform.") fill_invalid_expert = 0 ep_rank = np.random.randint(0, ep_size) expert_map = None diff --git a/tests/kernels/moe/test_rocm_aiter_topk.py b/tests/kernels/moe/test_rocm_aiter_topk.py index b0d34ddfd4234..922fd66dbef49 100644 --- a/tests/kernels/moe/test_rocm_aiter_topk.py +++ b/tests/kernels/moe/test_rocm_aiter_topk.py @@ -35,6 +35,15 @@ def test_rocm_aiter_biased_grouped_topk_custom_op_registration(): assert 
callable(torch.ops.vllm.rocm_aiter_biased_grouped_topk) +def test_rocm_aiter_grouped_topk_custom_op_registration(): + """Test that the custom op is correctly registered.""" + # Check if the op exists in torch.ops.vllm + assert hasattr(torch.ops.vllm, 'rocm_aiter_grouped_topk') + + # Check if the op is callable + assert callable(torch.ops.vllm.rocm_aiter_grouped_topk) + + def test_rocm_aiter_biased_grouped_topk_torch_compile_compatibility(): """Test that the op can be used with torch.compile.""" # Create test tensors @@ -120,3 +129,87 @@ def test_rocm_aiter_biased_grouped_topk_torch_compile_compatibility(): rtol=1e-2, atol=1e-2) assert torch.allclose(topk_ids_original, topk_ids_compiled) + + +def test_rocm_aiter_grouped_topk_torch_compile_compatibility(): + """Test that the op can be used with torch.compile.""" + # Create test tensors + token = 64 + expert = 256 + num_expert_group = 8 + topk = 8 + topk_group = 4 + renormalize = True + scoring_func = "softmax" + scale_factor = 1.0 + + gating_output = torch.randn((token, expert), + dtype=torch.bfloat16, + device="cuda") + + device = gating_output.device + topk_ids = torch.empty((token, topk), dtype=torch.int32, device=device) + topk_weights = torch.empty((token, topk), + dtype=torch.float32, + device=device) + + # Define a function that uses the op + def grouped_topk_fn(gating_output, topk_weights, topk_ids, scoring_func): + return torch.ops.vllm.rocm_aiter_grouped_topk( + gating_output, topk_weights, topk_ids, num_expert_group, + topk_group, renormalize, scoring_func, scale_factor) + + # Verify the op's fake implementation + torch.library.opcheck(torch.ops.vllm.rocm_aiter_grouped_topk, + (gating_output, topk_weights, topk_ids), + kwargs={ + "num_expert_group": num_expert_group, + "topk_group": topk_group, + "need_renorm": renormalize, + "scoring_func": scoring_func, + "routed_scaling_factor": scale_factor + }, + test_utils=("test_faketensor")) + + # Compile the function with appropriate settings + compiled_fn = 
torch.compile(grouped_topk_fn, + fullgraph=True, + backend="inductor", + mode="reduce-overhead", + dynamic=False) + + topk_weights_original = torch.empty((token, topk), + dtype=torch.float32, + device=device) + topk_ids_original = torch.empty((token, topk), + dtype=torch.int32, + device=device) + + topk_weights_compiled = torch.empty((token, topk), + dtype=torch.float32, + device=device) + topk_ids_compiled = torch.empty((token, topk), + dtype=torch.int32, + device=device) + + # Run both compiled (V1 graph mode) and uncompiled versions (V1 eager mode) + grouped_topk_fn(gating_output, topk_weights_original, topk_ids_original, + scoring_func) + compiled_fn(gating_output, topk_weights_compiled, topk_ids_compiled, + scoring_func) + + # Sort the results for comparison since the order might not be deterministic + topk_ids_original, indices_original = torch.sort(topk_ids_original) + topk_weights_original = torch.gather(topk_weights_original, 1, + indices_original) + + topk_ids_compiled, indices_compiled = torch.sort(topk_ids_compiled) + topk_weights_compiled = torch.gather(topk_weights_compiled, 1, + indices_compiled) + + # Verify results match + assert torch.allclose(topk_weights_original, + topk_weights_compiled, + rtol=1e-2, + atol=1e-2) + assert torch.allclose(topk_ids_original, topk_ids_compiled) diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index 764924f26783d..892309a017e43 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -8,7 +8,7 @@ from vllm.platforms import current_platform # Using the default value (240.0) from pytorch will cause accuracy # issue on dynamic quantization models. Here use 224.0 for rocm. 
-ROCM_FP8_MAX = 224.0 +ROCM_FP8FNUZ_MAX = 224.0 FP8_DTYPE = current_platform.fp8_dtype() @@ -26,9 +26,11 @@ def ref_dynamic_per_token_quant(x: torch.tensor, qtype_traits = torch.iinfo(quant_dtype) if quant_dtype == torch.int8 \ else torch.finfo(quant_dtype) - qtype_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \ + qtype_traits_max = ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \ + and current_platform.is_fp8_fnuz() \ else qtype_traits.max - qtype_traits_min = -ROCM_FP8_MAX if current_platform.is_rocm() \ + qtype_traits_min = -ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \ + and current_platform.is_fp8_fnuz() \ else qtype_traits.min qtype_max = as_float32_tensor(qtype_traits_max) s_1 = as_float32_tensor(1.0) @@ -70,9 +72,11 @@ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \ -> tuple[torch.tensor, torch.tensor]: fp8_traits = torch.finfo(FP8_DTYPE) - fp8_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \ + fp8_traits_max = ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \ + and current_platform.is_fp8_fnuz() \ else fp8_traits.max - fp8_traits_min = -ROCM_FP8_MAX if current_platform.is_rocm() \ + fp8_traits_min = -ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \ + and current_platform.is_fp8_fnuz() \ else fp8_traits.min fp8_max = as_float32_tensor(fp8_traits_max) one = as_float32_tensor(1.0) diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index ef1d7e47ef810..ae05d61173f33 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -36,16 +36,16 @@ vllm_config.scheduler_config.max_model_len = 8192 # Test configurations DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32] -NUM_TOKENS = [7, 83, 2048] +NUM_TOKENS = [7, 2050] D = [512, 4096, 5120, 13824] -GROUP_SIZE = [64, 128, 256, 512] -M = [1, 7, 8, 83, 84, 512, 2048, 4096] -N = [128, 512, 1024, 4096, 7168, 7748, 13824] -K = [256, 4096, 5120, 3884, 13824, 16384] 
+GROUP_SIZE = [64, 128, 512] +M = [1, 7, 8, 83, 84, 4096] +N = [128, 512, 7168, 7748, 13824] +K = [256, 3884, 4096, 13824, 16384] # Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8 # and its hidden size is 7168. -M_moe = [1, 2, 7, 83, 128, 512, 2048] -M_moe_dg = [128, 192, 512, 1335, 2048] +M_moe = [1, 2, 7, 83, 128, 2048] +M_moe_dg = [128, 192, 1335, 2048] N_moe = [128, 256, 1024, 4608] # [13824] K_moe = [256, 512, 7168] # [13824] BLOCK_SIZE = [[128, 128]] diff --git a/tests/kernels/quantization/test_gguf.py b/tests/kernels/quantization/test_gguf.py index 6cf88604ec65e..ad755fe7f7a0b 100644 --- a/tests/kernels/quantization/test_gguf.py +++ b/tests/kernels/quantization/test_gguf.py @@ -8,7 +8,6 @@ from gguf import GGMLQuantizationType, GGUFReader, ReaderTensor, dequantize from huggingface_hub import snapshot_download import vllm._custom_ops as ops -from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_experts from vllm.model_executor.layers.quantization.gguf import _fused_moe_gguf from vllm.platforms import current_platform @@ -35,11 +34,11 @@ def get_gguf_MoE_tensors( return GGUFReader(sample_file).tensors -DTYPES = [torch.half, torch.bfloat16, torch.float32] +DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32] # Hidden_size for testing, must match the sample file in HF repo, # we have `hidden_size = 256, 1024` for test in HF repo currently. 
HIDDEN_SIZES = [256, 1024] -NUM_TOKENS = [7, 83, 128, 2048] # Arbitrary values for testing +NUM_TOKENS = [7, 2050] # Arbitrary values for testing SEEDS = [0] QUANT_TYPES = [ # i-matrix @@ -176,12 +175,11 @@ def test_moe(num_tokens: int, hidden_size: int, dtype: torch.dtype, w2_dequant = torch.tensor(dequantize(w2.data, quant_type), device="cuda").to(dtype) - act = SiluAndMul() output = _fused_moe_gguf(x, torch.tensor(w13.data, device="cuda"), torch.tensor(w2.data, device="cuda"), topk_weights, - topk_ids, quant_type, quant_type, act) + topk_ids, quant_type, quant_type, "silu") ref_output = fused_experts(x, w13_dequant, w2_dequant, topk_weights, topk_ids).reshape(output.shape) diff --git a/tests/kernels/quantization/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py index 45f10b0eb1d53..30e6eeb8d5660 100644 --- a/tests/kernels/quantization/test_triton_scaled_mm.py +++ b/tests/kernels/quantization/test_triton_scaled_mm.py @@ -13,8 +13,13 @@ from vllm.platforms import current_platform device = "cuda" +triton_scaled_mm_module = importlib.import_module( + "vllm.model_executor.layers.quantization.compressed_tensors." + "triton_scaled_mm") +triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm -def scaled_mm_torch(a: torch.Tensor, + +def torch_scaled_mm(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor, scale_b: torch.Tensor, @@ -101,21 +106,8 @@ def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a, if use_bias: bias = torch.rand((N, ), device=device, dtype=out_dtype) - triton_scaled_mm_module = importlib.import_module( - "vllm.model_executor.layers.quantization.compressed_tensors." 
- "triton_scaled_mm") - triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm - c_check = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias) - a_cpu = a.cpu() - b_cpu = b.cpu() - scale_a_cpu = scale_a.cpu() - scale_b_cpu = scale_b.cpu() - bias_cpu = None if bias is None else bias.cpu() + c_actual = torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias) - c_actual = scaled_mm_torch(a_cpu, b_cpu, scale_a_cpu, scale_b_cpu, - out_dtype, bias_cpu) - - c_check_cpu = c_check.cpu() - torch.testing.assert_close(c_check_cpu, c_actual, rtol=1e-1, atol=1e-1) + torch.testing.assert_close(c_check, c_actual, rtol=1e-1, atol=1e-1) diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index e3a054bd62064..580992dea53da 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -1,12 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 +import subprocess +import sys +from typing import Union import pytest import ray import vllm +from vllm import LLM from vllm.lora.request import LoRARequest +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -from ..utils import create_new_process_for_each_test, multi_gpu_test +from ..utils import VLLM_PATH, create_new_process_for_each_test, multi_gpu_test MODEL_PATH = "meta-llama/Llama-2-7b-hf" @@ -36,7 +41,10 @@ def v1(run_with_both_engines_lora): pass -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: +def do_sample(llm: vllm.LLM, + lora_path: str, + lora_id: int, + tensorizer_config_dict: Union[dict, None] = None) -> list[str]: prompts = [ "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When 
Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 @@ -45,15 +53,28 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 ] + sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256, skip_special_tokens=False, stop=["[/assistant]"]) - outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None) + + if tensorizer_config_dict is not None: + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest( + str(lora_id), + lora_id, + lora_path, + tensorizer_config_dict=tensorizer_config_dict) + if lora_id else None) + else: + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) # Print the outputs. 
generated_texts: list[str] = [] for output in outputs: @@ -64,18 +85,32 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: return generated_texts -def generate_and_test(llm, sql_lora_files): +def generate_and_test(llm, + sql_lora_files, + tensorizer_config_dict: Union[dict, None] = None): print("lora adapter created") - assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT + assert do_sample(llm, + sql_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=0) == EXPECTED_NO_LORA_OUTPUT print("lora 1") - assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT + assert do_sample(llm, + sql_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=1) == EXPECTED_LORA_OUTPUT print("no lora") - assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT + assert do_sample(llm, + sql_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=0) == EXPECTED_NO_LORA_OUTPUT print("lora 2") - assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT + assert do_sample(llm, + sql_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=2) == EXPECTED_LORA_OUTPUT print("removing lora") @@ -153,3 +188,64 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): enable_chunked_prefill=True, ) generate_and_test(llm, sql_lora_files) + + +@multi_gpu_test(num_gpus=2) +@create_new_process_for_each_test() +def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files, + sql_lora_huggingface_id): + + # Run the tensorizing of the LoRA adapter and the model in a subprocess + # to guarantee cleanup + + tp_size = 2 + model_name = "model-rank-%03d.tensors" + + model_ref = MODEL_PATH + lora_path = sql_lora_huggingface_id + suffix = "test" + try: + result = subprocess.run([ + sys.executable, + f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", "--model", + MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size", + 
str(tp_size), "serialize", "--serialized-directory", + str(tmp_path), "--suffix", suffix + ], + check=True, + capture_output=True, + text=True) + except subprocess.CalledProcessError as e: + print("Tensorizing failed.") + print("STDOUT:\n", e.stdout) + print("STDERR:\n", e.stderr) + raise + + print("STDOUT:\n", result.stdout) + + model_uri = tmp_path / "vllm" / model_ref / suffix / model_name + tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri)) + tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir + + loaded_vllm_model = LLM(model=model_ref, + load_format="tensorizer", + enable_lora=True, + enforce_eager=True, + model_loader_extra_config=tensorizer_config, + max_num_seqs=13, + tensor_parallel_size=2, + max_loras=2) + + tensorizer_config_dict = tensorizer_config.to_dict() + + print("lora adapter created") + assert do_sample(loaded_vllm_model, + sql_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=0) == EXPECTED_NO_LORA_OUTPUT + + print("lora 1") + assert do_sample(loaded_vllm_model, + sql_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=1) == EXPECTED_LORA_OUTPUT diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py index 204624a0540af..7ae33a848a0aa 100644 --- a/tests/lora/test_lora_functions.py +++ b/tests/lora/test_lora_functions.py @@ -69,7 +69,7 @@ def test_lora_functions_sync(): run_check(llm.add_lora, make_lora_request(12), [12, 9, 10, 11]) run_check(llm.add_lora, make_lora_request(13), [12, 13, 10, 11]) - # Remove all LoRAs + # Remove all LoRAs. 
run_check(llm.remove_lora, 13, [12, 10, 11]) run_check(llm.remove_lora, 12, [10, 11]) run_check(llm.remove_lora, 11, [10]) diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index caf71976a2608..7a76ffb740ef2 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -24,16 +24,16 @@ if current_platform.is_rocm(): MODELS = [ ModelWithQuantization( model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", - quantization="GPTQ"), + quantization="gptq"), ] else: MODELS = [ ModelWithQuantization( model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", - quantization="AWQ"), + quantization="awq"), ModelWithQuantization( model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", - quantization="GPTQ"), + quantization="gptq"), ] @@ -100,7 +100,7 @@ def test_quant_model_lora(tinyllama_lora_files, model): "#ff8050", "#ff8080", ] - elif model.quantization == "AWQ": + elif model.quantization == "awq": expected_no_lora_output = [ "I'm sorry, I don't understand", "I'm sorry, I don't understand", @@ -109,7 +109,7 @@ def test_quant_model_lora(tinyllama_lora_files, model): "#f07700: A v", "#f00000: A v", ] - elif model.quantization == "GPTQ": + elif model.quantization == "gptq": expected_no_lora_output = [ "I'm sorry, I don't have", "I'm sorry, I don't have", @@ -122,7 +122,7 @@ def test_quant_model_lora(tinyllama_lora_files, model): def expect_match(output, expected_output): # HACK: GPTQ lora outputs are just incredibly unstable. # Assert that the outputs changed. 
- if (model.quantization == "GPTQ" + if (model.quantization == "gptq" and expected_output is expected_lora_output): assert output != expected_no_lora_output for i, o in enumerate(output): @@ -172,7 +172,7 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, model): if num_gpus_available < 2: pytest.skip(f"Not enough GPUs for tensor parallelism {2}") - if model.quantization == "GPTQ": + if model.quantization == "gptq": pytest.skip("GPTQ lora outputs are just incredibly unstable") llm_tp1 = vllm.LLM( model=model.model_path, diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index 7bd3e3d0fe27f..162714df2f130 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -10,6 +10,7 @@ import vllm from vllm.assets.image import ImageAsset from vllm.lora.request import LoRARequest from vllm.platforms import current_platform +from vllm.sampling_params import BeamSearchParams @pytest.fixture(autouse=not current_platform.is_cpu()) @@ -69,7 +70,7 @@ class Qwen2VLTester: expected_outputs: list[str], lora_id: Optional[int] = None, temperature: float = 0, - max_tokens: int = 5) -> list[str]: + max_tokens: int = 5): sampling_params = vllm.SamplingParams( temperature=temperature, @@ -97,7 +98,35 @@ class Qwen2VLTester: generated), f"Generated text {generated} doesn't " f"match expected pattern {expected}" - return generated_texts + def run_beam_search_test(self, + images: list[ImageAsset], + expected_outputs: list[list[str]], + lora_id: Optional[int] = None, + temperature: float = 0, + beam_width: int = 2, + max_tokens: int = 5): + + beam_search_params = BeamSearchParams(beam_width=beam_width, + max_tokens=max_tokens, + temperature=temperature) + + inputs = [{ + "prompt": self.PROMPT_TEMPLATE, + "multi_modal_data": { + "image": asset.pil_image + }, + } for asset in images] + + lora_request = LoRARequest(str(lora_id), lora_id, + self.config.lora_path) + outputs = self.llm.beam_search(inputs, + beam_search_params, + 
lora_request=lora_request) + + for output_obj, expected_outs in zip(outputs, expected_outputs): + output_texts = [seq.text for seq in output_obj.sequences] + assert output_texts == expected_outs, \ + f"Generated texts {output_texts} do not match expected {expected_outs}" # noqa: E501 TEST_IMAGES = [ @@ -110,6 +139,14 @@ EXPECTED_OUTPUTS = [ "A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.", # noqa: E501 ] +# NOTE - beam search .text contains the whole text +EXPECTED_BEAM_SEARCH_OUTPUTS = [ + [ + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is in the image?<|im_end|>\n<|im_start|>assistant\nA majestic skyscraper stands", # noqa: E501 + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is in the image?<|im_end|>\n<|im_start|>assistant\nA majestic tower stands tall", # noqa: E501 + ], +] + QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct" QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct" @@ -130,6 +167,27 @@ def test_qwen2vl_lora(qwen2vl_lora_files): lora_id=lora_id) +@pytest.mark.xfail( + current_platform.is_rocm(), + reason="Qwen2-VL dependency xformers incompatible with ROCm") +def test_qwen2vl_lora_beam_search(qwen2vl_lora_files): + """Test Qwen 2.0 VL model with LoRA through beam search.""" + config = TestConfig(model_path=QWEN2VL_MODEL_PATH, + lora_path=qwen2vl_lora_files) + tester = Qwen2VLTester(config) + + # Test with different LoRA IDs + for lora_id in [1, 2]: + # NOTE currently, we only test cherry blossom since stop sign + # output is slightly different for v1; - the root cause is likely + # independent of the intent of this test, which is to ensure beam + # search passes through lora through correctly. 
+ tester.run_beam_search_test( + [ImageAsset("cherry_blossom")], + expected_outputs=EXPECTED_BEAM_SEARCH_OUTPUTS, + lora_id=lora_id) + + @pytest.mark.xfail( current_platform.is_rocm(), reason="Qwen2.5-VL dependency xformers incompatible with ROCm", diff --git a/tests/test_logits_processor.py b/tests/model_executor/test_logits_processor.py similarity index 100% rename from tests/test_logits_processor.py rename to tests/model_executor/test_logits_processor.py diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index f8efa2eff857b..7fda1f0e80d07 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -4,7 +4,7 @@ import os import pytest -from vllm.model_executor.layers.pooler import CLSPool, PoolingType +from vllm.model_executor.layers.pooler import CLSPool, MeanPool, PoolingType from vllm.model_executor.models.bert import BertEmbeddingModel from vllm.model_executor.models.roberta import RobertaEmbeddingModel from vllm.platforms import current_platform @@ -14,7 +14,7 @@ MODEL_NAME = os.environ.get("MODEL_NAME", "BAAI/bge-base-en-v1.5") REVISION = os.environ.get("REVISION", "main") MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME", - "intfloat/multilingual-e5-small") + "intfloat/multilingual-e5-base") REVISION_ROBERTA = os.environ.get("REVISION", "main") @@ -40,17 +40,15 @@ def test_model_loading_with_params(vllm_runner): # asserts on the pooling config files assert model_config.pooler_config.pooling_type == PoolingType.CLS.name - assert model_config.pooler_config.pooling_norm + assert model_config.pooler_config.normalize # asserts on the tokenizer loaded assert model_tokenizer.tokenizer_id == "BAAI/bge-base-en-v1.5" - assert model_tokenizer.tokenizer_config["do_lower_case"] assert model_tokenizer.tokenizer.model_max_length == 512 def check_model(model): assert isinstance(model, BertEmbeddingModel) - assert model._pooler.pooling_type 
== PoolingType.CLS - assert model._pooler.normalize + assert isinstance(model._pooler, CLSPool) vllm_model.apply_model(check_model) @@ -80,16 +78,15 @@ def test_roberta_model_loading_with_params(vllm_runner): # asserts on the pooling config files assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name - assert model_config.pooler_config.pooling_norm + assert model_config.pooler_config.normalize # asserts on the tokenizer loaded - assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-small" - assert not model_tokenizer.tokenizer_config["do_lower_case"] + assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-base" + assert model_tokenizer.tokenizer.model_max_length == 512 def check_model(model): assert isinstance(model, RobertaEmbeddingModel) - assert model._pooler.pooling_type == PoolingType.MEAN - assert model._pooler.normalize + assert isinstance(model._pooler, MeanPool) vllm_model.apply_model(check_model) diff --git a/tests/model_executor/weight_utils.py b/tests/model_executor/test_weight_utils.py similarity index 100% rename from tests/model_executor/weight_utils.py rename to tests/model_executor/test_weight_utils.py diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 9b7a42acece59..604cb854b32ff 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -31,7 +31,7 @@ HYBRID_MODELS = [ # not compatible with pip-compile. 
"pfnet/plamo-2-1b", "Zyphra/Zamba2-1.2B-instruct", - "hmellor/bamba-tiny-random", + "hmellor/tiny-random-BambaForCausalLM", ] # Avoid OOM diff --git a/tests/models/language/pooling/embed_utils.py b/tests/models/language/pooling/embed_utils.py new file mode 100644 index 0000000000000..0c8ac2ab1b9eb --- /dev/null +++ b/tests/models/language/pooling/embed_utils.py @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: Apache-2.0 +from collections.abc import Sequence +from typing import Optional + +import pytest + +from tests.conftest import HfRunner +from tests.models.utils import (EmbedModelInfo, check_embeddings_close, + matryoshka_fy) + + +def run_embedding_correctness_test( + hf_model: "HfRunner", + inputs: list[str], + vllm_outputs: Sequence[list[float]], + dimensions: Optional[int] = None, +): + hf_outputs = hf_model.encode(inputs) + if dimensions: + hf_outputs = matryoshka_fy(hf_outputs, dimensions) + + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + tol=1e-2, + ) + + +def correctness_test_embed_models(hf_runner, + vllm_runner, + model_info: EmbedModelInfo, + example_prompts, + vllm_extra_kwargs=None, + hf_model_callback=None): + if not model_info.enable_test: + # A model family has many models with the same architecture, + # and we don't need to test each one. + pytest.skip("Skipping test.") + + # The example_prompts has ending "\n", for example: + # "Write a short story about a robot that dreams for the first time.\n" + # sentence_transformers will strip the input texts, see: + # https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159 + # This makes the input_ids different between hf_model and vllm_model. + # So we need to strip the input texts to avoid test failing. 
+ example_prompts = [str(s).strip() for s in example_prompts] + + vllm_extra_kwargs = vllm_extra_kwargs or {} + vllm_extra_kwargs["dtype"] = model_info.dtype + + with vllm_runner(model_info.name, + task="embed", + max_model_len=None, + **vllm_extra_kwargs) as vllm_model: + vllm_outputs = vllm_model.encode(example_prompts) + vllm_dtype = vllm_model.model.llm_engine.model_config.dtype + model_dtype = getattr( + vllm_model.model.llm_engine.model_config.hf_config, "torch_dtype", + vllm_dtype) + + with hf_runner( + model_info.name, + dtype=model_dtype, + is_sentence_transformer=True, + ) as hf_model: + + if hf_model_callback is not None: + hf_model_callback(hf_model) + + run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs) diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index 7de2a9af2f2ed..f45168bc0f1d6 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -1,5 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -import math from collections.abc import Sequence import mteb @@ -81,18 +80,19 @@ def run_mteb_embed_task_st(model_name, tasks): def mteb_test_embed_models(hf_runner, vllm_runner, model_info: EmbedModelInfo, - vllm_extra_kwargs=None): + vllm_extra_kwargs=None, + hf_model_callback=None): if not model_info.enable_test: # A model family has many models with the same architecture, # and we don't need to test each one. 
pytest.skip("Skipping test.") vllm_extra_kwargs = vllm_extra_kwargs or {} + vllm_extra_kwargs["dtype"] = model_info.dtype with vllm_runner(model_info.name, task="embed", max_model_len=None, - dtype=model_info.dtype, **vllm_extra_kwargs) as vllm_model: if model_info.architecture: @@ -102,17 +102,18 @@ def mteb_test_embed_models(hf_runner, vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS) vllm_dtype = vllm_model.model.llm_engine.model_config.dtype - model_dtype = getattr( - vllm_model.model.llm_engine.model_config.hf_config, "torch_dtype", - vllm_dtype) - with set_default_torch_dtype(model_dtype) and hf_runner( + with set_default_torch_dtype(vllm_dtype) and hf_runner( model_info.name, is_sentence_transformer=True, - dtype=model_dtype) as hf_model: + dtype=vllm_dtype) as hf_model: + + if hf_model_callback is not None: + hf_model_callback(hf_model) + st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS) - print("VLLM:", vllm_dtype, vllm_main_score) - print("SentenceTransformer:", model_dtype, st_main_score) + print("VLLM:", vllm_main_score) + print("SentenceTransformers:", st_main_score) print("Difference:", st_main_score - vllm_main_score) - assert math.isclose(st_main_score, vllm_main_score, rel_tol=MTEB_EMBED_TOL) + assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL) diff --git a/tests/models/language/pooling/test_baai.py b/tests/models/language/pooling/test_baai.py new file mode 100644 index 0000000000000..fc0e8207954fa --- /dev/null +++ b/tests/models/language/pooling/test_baai.py @@ -0,0 +1,71 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest + +from .embed_utils import EmbedModelInfo, correctness_test_embed_models +from .mteb_utils import mteb_test_embed_models + +MODELS = [ + ########## BertModel + EmbedModelInfo("BAAI/bge-base-en", + architecture="BertModel", + enable_test=True), + EmbedModelInfo("BAAI/bge-base-zh", + architecture="BertModel", + enable_test=False), + 
EmbedModelInfo("BAAI/bge-small-en", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("BAAI/bge-small-zh", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("BAAI/bge-large-en", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("BAAI/bge-large-zh", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("BAAI/bge-large-zh-noinstruct", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("BAAI/bge-base-en-v1.5", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("BAAI/bge-base-zh-v1.5", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("BAAI/bge-small-en-v1.5", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("BAAI/bge-small-zh-v1.5", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("BAAI/bge-large-en-v1.5", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("BAAI/bge-large-zh-v1.5", + architecture="BertModel", + enable_test=False), + ########## XLMRobertaModel + EmbedModelInfo("BAAI/bge-m3", + architecture="XLMRobertaModel", + enable_test=True), + ########## Qwen2Model + EmbedModelInfo("BAAI/bge-code-v1", + architecture="Qwen2Model", + dtype="float32", + enable_test=True), +] + + +@pytest.mark.parametrize("model_info", MODELS) +def test_embed_models_mteb(hf_runner, vllm_runner, + model_info: EmbedModelInfo) -> None: + mteb_test_embed_models(hf_runner, vllm_runner, model_info) + + +@pytest.mark.parametrize("model_info", MODELS) +def test_embed_models_correctness(hf_runner, vllm_runner, + model_info: EmbedModelInfo, + example_prompts) -> None: + correctness_test_embed_models(hf_runner, vllm_runner, model_info, + example_prompts) diff --git a/tests/models/language/pooling/test_classification.py b/tests/models/language/pooling/test_classification.py index 44af3df08a867..57b3cb58d88ba 100644 --- a/tests/models/language/pooling/test_classification.py +++ b/tests/models/language/pooling/test_classification.py @@ 
-43,6 +43,6 @@ def test_models( # the tolerance value of 1e-2 is selected based on the # half datatype tests in - # tests/models/embedding/language/test_embedding.py + # tests/models/language/pooling/test_embedding.py assert torch.allclose(hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2) diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index 9db385e77bdbb..8f82c8091af37 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -10,30 +10,31 @@ from ...utils import check_embeddings_close @pytest.mark.parametrize( "model", [ - # [Encoder-only] - pytest.param("BAAI/bge-base-en-v1.5", - marks=[pytest.mark.core_model, pytest.mark.cpu_model]), - pytest.param("sentence-transformers/all-MiniLM-L12-v2"), - pytest.param("intfloat/multilingual-e5-small"), - pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"), + # Be careful of the order of models, decoder-only models should be + # placed before encoder-only models, otherwise `Qwen2.5-0.5B-Instruct` + # case won't pass because gte-Qwen2-1.5B-instruct will cache custom + # model code with bidirectional attention. 
# [Decoder-only] pytest.param("BAAI/bge-multilingual-gemma2", marks=[pytest.mark.core_model]), pytest.param("intfloat/e5-mistral-7b-instruct", marks=[pytest.mark.core_model, pytest.mark.cpu_model]), - pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), + # [Encoder-only] + pytest.param("BAAI/bge-base-en-v1.5", + marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + pytest.param("sentence-transformers/all-MiniLM-L12-v2"), + pytest.param("intfloat/multilingual-e5-small"), + pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), # [Cross-Encoder] pytest.param("sentence-transformers/stsb-roberta-base-v2"), ], ) -@pytest.mark.parametrize("dtype", ["half"]) def test_models( hf_runner, vllm_runner, example_prompts, model, - dtype: str, monkeypatch, ) -> None: @@ -45,10 +46,7 @@ def test_models( vllm_extra_kwargs = {} if model == "ssmits/Qwen2-7B-Instruct-embed-base": vllm_extra_kwargs["override_pooler_config"] = \ - PoolerConfig(pooling_type="MEAN") - - if model == "Alibaba-NLP/gte-Qwen2-1.5B-instruct": - vllm_extra_kwargs["hf_overrides"] = {"is_causal": True} + PoolerConfig(pooling_type="MEAN", normalize=False) # The example_prompts has ending "\n", for example: # "Write a short story about a robot that dreams for the first time.\n" @@ -58,13 +56,11 @@ def test_models( # So we need to strip the input texts to avoid test failing. 
example_prompts = [str(s).strip() for s in example_prompts] - with hf_runner(model, dtype=dtype, - is_sentence_transformer=True) as hf_model: + with hf_runner(model, is_sentence_transformer=True) as hf_model: hf_outputs = hf_model.encode(example_prompts) with vllm_runner(model, task="embed", - dtype=dtype, max_model_len=None, **vllm_extra_kwargs) as vllm_model: vllm_outputs = vllm_model.encode(example_prompts) diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py index 7dd3c8a4e79e2..f450edd821623 100644 --- a/tests/models/language/pooling/test_gritlm.py +++ b/tests/models/language/pooling/test_gritlm.py @@ -2,7 +2,6 @@ from __future__ import annotations import importlib.util -import math from array import array import openai @@ -104,16 +103,16 @@ def get_test_data(): def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]): cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0]) - assert math.isclose(cosine_sim_q0_d0, 0.609, abs_tol=0.001) + assert cosine_sim_q0_d0 == pytest.approx(0.609, abs=0.001) cosine_sim_q0_d1 = 1 - cosine(q_rep[0], d_rep[1]) - assert math.isclose(cosine_sim_q0_d1, 0.101, abs_tol=0.001) + assert cosine_sim_q0_d1 == pytest.approx(0.101, abs=0.001) cosine_sim_q1_d0 = 1 - cosine(q_rep[1], d_rep[0]) - assert math.isclose(cosine_sim_q1_d0, 0.120, abs_tol=0.001) + assert cosine_sim_q1_d0 == pytest.approx(0.120, abs=0.001) cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1]) - assert math.isclose(cosine_sim_q1_d1, 0.534, abs_tol=0.001) + assert cosine_sim_q1_d1 == pytest.approx(0.534, abs=0.001) def test_gritlm_offline_embedding(vllm_runner): diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index b60d27aaa72bd..725e3d168408b 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -3,7 +3,8 @@ from typing import Any import pytest -from ...utils import EmbedModelInfo, 
run_embedding_correctness_test +from .embed_utils import EmbedModelInfo, correctness_test_embed_models +from .mteb_utils import mteb_test_embed_models MODELS = [ ########## BertModel @@ -44,10 +45,8 @@ MODELS = [ ########### Qwen2ForCausalLM EmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct", architecture="Qwen2ForCausalLM", + dtype="float32", enable_test=True), - EmbedModelInfo("Alibaba-NLP/gte-Qwen2-7B-instruct", - architecture="Qwen2ForCausalLM", - enable_test=False), ########## ModernBertModel EmbedModelInfo("Alibaba-NLP/gte-modernbert-base", architecture="ModernBertModel", @@ -56,14 +55,10 @@ MODELS = [ @pytest.mark.parametrize("model_info", MODELS) -def test_models_mteb(hf_runner, vllm_runner, - model_info: EmbedModelInfo) -> None: - from .mteb_utils import mteb_test_embed_models +def test_embed_models_mteb(hf_runner, vllm_runner, + model_info: EmbedModelInfo) -> None: vllm_extra_kwargs: dict[str, Any] = {} - if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct": - vllm_extra_kwargs["hf_overrides"] = {"is_causal": True} - if model_info.architecture == "GteNewModel": vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]} @@ -72,31 +67,13 @@ def test_models_mteb(hf_runner, vllm_runner, @pytest.mark.parametrize("model_info", MODELS) -def test_models_correctness(hf_runner, vllm_runner, model_info: EmbedModelInfo, - example_prompts) -> None: - if not model_info.enable_test: - pytest.skip("Skipping test.") - - # ST will strip the input texts, see test_embedding.py - example_prompts = [str(s).strip() for s in example_prompts] +def test_embed_models_correctness(hf_runner, vllm_runner, + model_info: EmbedModelInfo, + example_prompts) -> None: vllm_extra_kwargs: dict[str, Any] = {} - if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct": - vllm_extra_kwargs["hf_overrides"] = {"is_causal": True} - if model_info.architecture == "GteNewModel": vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]} - with 
vllm_runner(model_info.name, - task="embed", - dtype=model_info.dtype, - max_model_len=None, - **vllm_extra_kwargs) as vllm_model: - vllm_outputs = vllm_model.encode(example_prompts) - - with hf_runner( - model_info.name, - dtype=model_info.dtype, - is_sentence_transformer=True, - ) as hf_model: - run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs) + correctness_test_embed_models(hf_runner, vllm_runner, model_info, + example_prompts, vllm_extra_kwargs) diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py index 5287ca37c0fb5..0403a20a445af 100644 --- a/tests/models/language/pooling/test_jina.py +++ b/tests/models/language/pooling/test_jina.py @@ -1,11 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 -import math +from functools import partial import pytest from vllm import PoolingParams -from ...utils import check_embeddings_close, matryoshka_fy +from .embed_utils import (EmbedModelInfo, check_embeddings_close, + correctness_test_embed_models, matryoshka_fy) +from .mteb_utils import mteb_test_embed_models SCORING_MODELS = [ "jinaai/jina-reranker-v2-base-multilingual", # Roberta @@ -27,16 +29,10 @@ TEXTS_2 = [ ] EMBEDDING_MODELS = [ - "jinaai/jina-embeddings-v3", -] - -EMBEDDING_PROMPTS = [ - "Follow the white rabbit.", # English - "Sigue al conejo blanco.", # Spanish - "Suis le lapin blanc.", # French - "่ทŸ็€็™ฝๅ…”่ตฐใ€‚", # Chinese - "ุงุชุจุน ุงู„ุฃุฑู†ุจ ุงู„ุฃุจูŠุถ.", # Arabic - "Folge dem weiรŸen Kaninchen.", # German + EmbedModelInfo("jinaai/jina-embeddings-v3", + architecture="XLMRobertaModel", + is_matryoshka=True, + dtype="float32") ] @@ -60,7 +56,7 @@ def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str): assert len(vllm_outputs) == 1 assert len(hf_outputs) == 1 - assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) @pytest.mark.parametrize("dtype", ["half"]) @@ -78,77 +74,70 @@ def 
test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str): assert len(vllm_outputs) == 10 assert len(hf_outputs) == 10 - assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) - assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) + assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01) -@pytest.fixture(scope="module", params=EMBEDDING_MODELS) -def emb_model_name(request): - yield request.param +@pytest.mark.parametrize("model_info", EMBEDDING_MODELS) +def test_embed_models_mteb(hf_runner, vllm_runner, + model_info: EmbedModelInfo) -> None: + + def hf_model_callback(model): + model.encode = partial(model.encode, task="text-matching") + + mteb_test_embed_models(hf_runner, + vllm_runner, + model_info, + hf_model_callback=hf_model_callback) -def test_is_matryoshka(vllm_runner, emb_model_name): - with vllm_runner(emb_model_name, task="embed", - max_model_len=None) as vllm_model: - assert vllm_model.model.llm_engine.model_config.is_matryoshka +@pytest.mark.parametrize("model_info", EMBEDDING_MODELS) +def test_embed_models_correctness(hf_runner, vllm_runner, + model_info: EmbedModelInfo, + example_prompts) -> None: + + def hf_model_callback(model): + model.encode = partial(model.encode, task="text-matching") + + correctness_test_embed_models(hf_runner, + vllm_runner, + model_info, + example_prompts, + hf_model_callback=hf_model_callback) -@pytest.mark.parametrize("model", EMBEDDING_MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -def test_embeddings( - hf_runner, - vllm_runner, - model, - dtype: str, - monkeypatch, -) -> None: - - example_prompts = EMBEDDING_PROMPTS - - with hf_runner( - model, - dtype=dtype, - is_sentence_transformer=True, - ) as hf_model: - hf_outputs = hf_model.encode(example_prompts, task="text-matching") - - with vllm_runner(model, task="embed", dtype=dtype, - max_model_len=None) as vllm_model: - vllm_outputs = 
vllm_model.encode(example_prompts) - - check_embeddings_close( - embeddings_0_lst=hf_outputs, - embeddings_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - tol=1e-2, - ) - - -@pytest.mark.parametrize("model", EMBEDDING_MODELS) +@pytest.mark.parametrize("model_info", EMBEDDING_MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dimensions", [16, 32]) def test_matryoshka( hf_runner, vllm_runner, - model, + model_info, dtype: str, dimensions: int, + example_prompts, monkeypatch, ) -> None: + if not model_info.is_matryoshka: + pytest.skip("Model is not matryoshka") - example_prompts = EMBEDDING_PROMPTS + # ST will strip the input texts, see test_embedding.py + example_prompts = [str(s).strip() for s in example_prompts] with hf_runner( - model, + model_info.name, dtype=dtype, is_sentence_transformer=True, ) as hf_model: hf_outputs = hf_model.encode(example_prompts, task="text-matching") hf_outputs = matryoshka_fy(hf_outputs, dimensions) - with vllm_runner(model, task="embed", dtype=dtype, + with vllm_runner(model_info.name, + task="embed", + dtype=dtype, max_model_len=None) as vllm_model: + assert vllm_model.model.llm_engine.model_config.is_matryoshka + matryoshka_dimensions = ( vllm_model.model.llm_engine.model_config.matryoshka_dimensions) assert matryoshka_dimensions is not None diff --git a/tests/models/language/pooling/test_nomic.py b/tests/models/language/pooling/test_nomic.py index 28df32e0c2301..92cd7cc569d39 100644 --- a/tests/models/language/pooling/test_nomic.py +++ b/tests/models/language/pooling/test_nomic.py @@ -2,7 +2,8 @@ import pytest -from ...utils import EmbedModelInfo, run_embedding_correctness_test +from .embed_utils import EmbedModelInfo, correctness_test_embed_models +from .mteb_utils import mteb_test_embed_models MODELS = [ EmbedModelInfo("nomic-ai/nomic-embed-text-v1", @@ -13,6 +14,9 @@ MODELS = [ architecture="NomicBertModel", dtype="float32", enable_test=False), + EmbedModelInfo("nomic-ai/CodeRankEmbed", + 
architecture="NomicBertModel", + enable_test=False), EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe", architecture="NomicBertModel", dtype="float32", @@ -21,30 +25,14 @@ MODELS = [ @pytest.mark.parametrize("model_info", MODELS) -def test_models_mteb(hf_runner, vllm_runner, - model_info: EmbedModelInfo) -> None: - from .mteb_utils import mteb_test_embed_models +def test_embed_models_mteb(hf_runner, vllm_runner, + model_info: EmbedModelInfo) -> None: mteb_test_embed_models(hf_runner, vllm_runner, model_info) @pytest.mark.parametrize("model_info", MODELS) -def test_models_correctness(hf_runner, vllm_runner, model_info: EmbedModelInfo, - example_prompts) -> None: - if not model_info.enable_test: - pytest.skip("Skipping test.") - - # ST will strip the input texts, see test_embedding.py - example_prompts = [str(s).strip() for s in example_prompts] - - with vllm_runner(model_info.name, - task="embed", - dtype=model_info.dtype, - max_model_len=None) as vllm_model: - vllm_outputs = vllm_model.encode(example_prompts) - - with hf_runner( - model_info.name, - dtype=model_info.dtype, - is_sentence_transformer=True, - ) as hf_model: - run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs) +def test_embed_models_correctness(hf_runner, vllm_runner, + model_info: EmbedModelInfo, + example_prompts) -> None: + correctness_test_embed_models(hf_runner, vllm_runner, model_info, + example_prompts) diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py new file mode 100644 index 0000000000000..68603e62843eb --- /dev/null +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -0,0 +1,130 @@ +# SPDX-License-Identifier: Apache-2.0 +# ruff: noqa: SIM117 +import pytest + +from ...utils import EmbedModelInfo + +MODELS = [ + EmbedModelInfo("nomic-ai/nomic-embed-text-v1"), + #EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"), + #EmbedModelInfo("nomic-ai/CodeRankEmbed"), + 
EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe"), + #EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"), +] + +rope_theta = 1000 +factor = 4.0 +original_max_position_embeddings = 2048 +max_model_len = int(original_max_position_embeddings * factor) + + +@pytest.mark.parametrize("model_info", MODELS) +def test_default(model_info, vllm_runner): + with vllm_runner(model_info.name, task="embed", + max_model_len=None) as vllm_model: + model_config = vllm_model.model.llm_engine.model_config + if model_info.name == "nomic-ai/nomic-embed-text-v2-moe": + # For nomic-embed-text-v2-moe the length is set to 512 + # by sentence_bert_config.json. + assert model_config.max_model_len == 512 + else: + assert ( + model_config.max_model_len == original_max_position_embeddings) + + +@pytest.mark.parametrize("model_info", MODELS) +def test_set_max_model_len_legal(model_info, vllm_runner): + # set max_model_len <= 512 + with vllm_runner(model_info.name, task="embed", + max_model_len=256) as vllm_model: + model_config = vllm_model.model.llm_engine.model_config + assert model_config.max_model_len == 256 + + # set 512 < max_model_len <= 2048 + if model_info.name == "nomic-ai/nomic-embed-text-v2-moe": + # For nomic-embed-text-v2-moe the length is set to 512 + # by sentence_bert_config.json. 
+ with pytest.raises(ValueError): + with vllm_runner(model_info.name, task="embed", + max_model_len=1024): + pass + else: + with vllm_runner(model_info.name, task="embed", + max_model_len=1024) as vllm_model: + model_config = vllm_model.model.llm_engine.model_config + assert model_config.max_model_len == 1024 + + +@pytest.mark.parametrize("model_info", MODELS) +def test_set_max_model_len_illegal(model_info, vllm_runner): + # set max_model_len > 2048 + with pytest.raises(ValueError): + with vllm_runner(model_info.name, task="embed", max_model_len=4096): + pass + + # set max_model_len > 2048 by hf_overrides + hf_overrides = {"max_model_len": 4096} + with pytest.raises(ValueError): + with vllm_runner(model_info.name, + task="embed", + max_model_len=None, + hf_overrides=hf_overrides): + pass + + +@pytest.mark.parametrize("model_info", MODELS) +def test_use_rope_scaling_legal(model_info, vllm_runner): + hf_overrides = { + "rope_theta": rope_theta, + "rope_scaling": { + "rope_type": "yarn", + "factor": factor, + "original_max_position_embeddings": + original_max_position_embeddings + }, + "max_model_len": max_model_len + } + + with vllm_runner(model_info.name, + task="embed", + max_model_len=None, + hf_overrides=hf_overrides): + pass + + +@pytest.mark.parametrize("model_info", MODELS) +def test_use_rope_scaling_illegal(model_info, vllm_runner): + hf_overrides = { + "rope_theta": rope_theta, + "rope_scaling": { + "rope_type": "yarn", + "factor": factor, + "original_max_position_embeddings": + original_max_position_embeddings + } + } + # illegal max_model_len + with pytest.raises(ValueError): + with vllm_runner(model_info.name, + task="embed", + max_model_len=max_model_len + 1, + hf_overrides=hf_overrides): + pass + + hf_overrides = { + "rope_theta": rope_theta, + "rope_scaling": { + "rope_type": "yarn", + "factor": factor, + "original_max_position_embeddings": + original_max_position_embeddings + }, + "max_model_len": max_model_len + 1 + } + # illegal max_model_len by 
hf_overrides + with pytest.raises(ValueError): + with vllm_runner(model_info.name, + task="embed", + max_model_len=None, + hf_overrides=hf_overrides): + pass diff --git a/tests/models/language/pooling/test_scoring.py b/tests/models/language/pooling/test_scoring.py index e9527700c3ca2..6b10aeffc4b72 100644 --- a/tests/models/language/pooling/test_scoring.py +++ b/tests/models/language/pooling/test_scoring.py @@ -1,6 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -import math - import pytest import torch import torch.nn.functional as F @@ -45,7 +43,7 @@ def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name): assert len(vllm_outputs) == 1 assert len(hf_outputs) == 1 - assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name): @@ -64,8 +62,8 @@ def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name): assert len(vllm_outputs) == 2 assert len(hf_outputs) == 2 - assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) - assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) + assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01) def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name): @@ -84,8 +82,8 @@ def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name): assert len(vllm_outputs) == 2 assert len(hf_outputs) == 2 - assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) - assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) + assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01) @pytest.fixture(scope="module", params=EMBEDDING_MODELS) @@ -112,7 +110,7 @@ def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name): assert len(vllm_outputs) == 1 assert len(hf_outputs) == 1 - assert 
math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name): @@ -140,8 +138,8 @@ def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name): assert len(vllm_outputs) == 2 assert len(hf_outputs) == 2 - assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) - assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) + assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01) def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name): @@ -169,5 +167,5 @@ def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name): assert len(vllm_outputs) == 2 assert len(hf_outputs) == 2 - assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) - assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) + assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01) diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py b/tests/models/language/pooling/test_snowflake_arctic_embed.py index 5679e0e1ce00b..c6c2d1e7a679d 100644 --- a/tests/models/language/pooling/test_snowflake_arctic_embed.py +++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py @@ -2,7 +2,8 @@ import pytest -from ...utils import EmbedModelInfo, run_embedding_correctness_test +from .embed_utils import EmbedModelInfo, correctness_test_embed_models +from .mteb_utils import mteb_test_embed_models MODELS = [ EmbedModelInfo("Snowflake/snowflake-arctic-embed-xs", @@ -41,37 +42,14 @@ MODELS = [ @pytest.mark.parametrize("model_info", MODELS) -def test_models_mteb( - hf_runner, - vllm_runner, - model_info: EmbedModelInfo, -) -> None: - from .mteb_utils import mteb_test_embed_models +def test_embed_models_mteb(hf_runner, vllm_runner, + model_info: EmbedModelInfo) -> None: 
mteb_test_embed_models(hf_runner, vllm_runner, model_info) @pytest.mark.parametrize("model_info", MODELS) -def test_models_correctness( - hf_runner, - vllm_runner, - model_info: EmbedModelInfo, - example_prompts, -) -> None: - if not model_info.enable_test: - pytest.skip("Skipping test.") - - # ST will strip the input texts, see test_embedding.py - example_prompts = [str(s).strip() for s in example_prompts] - - with vllm_runner(model_info.name, - task="embed", - dtype=model_info.dtype, - max_model_len=None) as vllm_model: - vllm_outputs = vllm_model.encode(example_prompts) - - with hf_runner( - model_info.name, - dtype=model_info.dtype, - is_sentence_transformer=True, - ) as hf_model: - run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs) +def test_embed_models_correctness(hf_runner, vllm_runner, + model_info: EmbedModelInfo, + example_prompts) -> None: + correctness_test_embed_models(hf_runner, vllm_runner, model_info, + example_prompts) diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index d51a03dfea7e6..e4e48f9951cf2 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -349,6 +349,17 @@ VLM_TEST_SETTINGS = { use_tokenizer_eos=True, patch_hf_runner=model_utils.internvl_patch_hf_runner, ), + "intern_vl-video": VLMTestInfo( + models=[ + "OpenGVLab/InternVL3-1B", + ], + test_type=VLMTestType.VIDEO, + prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 + video_idx_to_prompt=lambda idx: "<video>", + max_model_len=8192, + use_tokenizer_eos=True, + patch_hf_runner=model_utils.internvl_patch_hf_runner, + ), "kimi_vl": VLMTestInfo( models=["moonshotai/Kimi-VL-A3B-Instruct"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), diff --git a/tests/models/multimodal/generation/test_interleaved.py b/tests/models/multimodal/generation/test_interleaved.py 
index eec84751e4504..972db40e8bd61 100644 --- a/tests/models/multimodal/generation/test_interleaved.py +++ b/tests/models/multimodal/generation/test_interleaved.py @@ -4,6 +4,7 @@ import pytest from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset +from vllm.multimodal.image import convert_image_mode models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"] @@ -26,8 +27,9 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None: give the same result. """ - image_cherry = ImageAsset("cherry_blossom").pil_image.convert("RGB") - image_stop = ImageAsset("stop_sign").pil_image.convert("RGB") + image_cherry = convert_image_mode( + ImageAsset("cherry_blossom").pil_image, "RGB") + image_stop = convert_image_mode(ImageAsset("stop_sign").pil_image, "RGB") images = [image_cherry, image_stop] video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays diff --git a/tests/models/multimodal/generation/test_phi4mm.py b/tests/models/multimodal/generation/test_phi4mm.py index 11460a1a8d2b5..e51dbee479c55 100644 --- a/tests/models/multimodal/generation/test_phi4mm.py +++ b/tests/models/multimodal/generation/test_phi4mm.py @@ -1,18 +1,18 @@ # SPDX-License-Identifier: Apache-2.0 import os -import re from collections.abc import Sequence from typing import Optional import librosa import pytest +import regex as re from huggingface_hub import snapshot_download from transformers import AutoTokenizer from vllm.assets.image import ImageAsset from vllm.lora.request import LoRARequest -from vllm.multimodal.image import rescale_image_size +from vllm.multimodal.image import convert_image_mode, rescale_image_size from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs @@ -267,7 +267,7 @@ def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str, # use the example speech question so that the model outputs are reasonable audio = librosa.load(speech_question, sr=None) - image = 
ImageAsset("cherry_blossom").pil_image.convert("RGB") + image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB") inputs_vision_speech = [ ( diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py index 4e48bdbd04289..d0b85842a3d8f 100644 --- a/tests/models/multimodal/generation/test_whisper.py +++ b/tests/models/multimodal/generation/test_whisper.py @@ -100,6 +100,7 @@ def run_test( with vllm_runner( model, + dtype="half", max_model_len=448, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index b71400fc8312d..dc1ea5208240d 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -3,11 +3,13 @@ for manipulating the input / output of HF & vLLM test runners, which are typically specific to a small subset of models. 
""" -import re import types from pathlib import PosixPath from typing import Optional, Union +import numpy as np +import numpy.typing as npt +import regex as re import torch from PIL.Image import Image from transformers import (AutoConfig, AutoTokenizer, BatchFeature, @@ -495,30 +497,74 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: self.max_num = self.config.max_dynamic_patch self.image_size = self.vision_config.image_size - def __call__(self, text: str, images: Union[Image, list[Image]], - **kwargs): + def __call__( + self, + text: str, + images: Union[Image, list[Image]] = None, + videos: Union[npt.NDArray, list[npt.NDArray]] = None, + **kwargs, + ): from vllm.model_executor.models.internvl import ( IMG_CONTEXT, IMG_END, IMG_START, - image_to_pixel_values_internvl) + image_to_pixel_values_internvl, video_to_pixel_values_internvl) images = [images] if isinstance(images, Image) else images - pixel_values = [ - image_to_pixel_values_internvl( - image, - input_size=self.image_size, - min_num=self.min_num, - max_num=self.max_num, - use_thumbnail=self.use_thumbnail, - ) for image in images - ] - num_patches_list = [ - pixel_value.shape[0] for pixel_value in pixel_values - ] + videos = [videos] if isinstance(videos, np.ndarray) else videos + if images is not None: + pixel_values_images = [ + image_to_pixel_values_internvl( + image, + input_size=self.image_size, + min_num=self.min_num, + max_num=self.max_num, + use_thumbnail=self.use_thumbnail, + ) for image in images + ] + num_patches_images = [ + pixel_value.shape[0] for pixel_value in pixel_values_images + ] + else: + pixel_values_images, num_patches_images = [], [] + + if videos is not None: + pixel_values_videos = [ + video_to_pixel_values_internvl( + video, + input_size=self.image_size, + min_num=1, + max_num=1, + use_thumbnail=False, + ) for video in videos + ] + num_patches_videos = [ + pixel_value.shape[0] for pixel_value in pixel_values_videos + ] + else: + pixel_values_videos, 
num_patches_videos = [], [] + + pixel_values = [] + while ("<image>" in text) or ("<video>" in text): + image_index = text.find("<image>") + video_index = text.find("<video>") + if image_index == -1 or (video_index > -1 + and video_index < image_index): + num_patches = num_patches_videos.pop(0) + pixel_values.append(pixel_values_videos.pop(0)) + context_tokens = IMG_START + \ + IMG_CONTEXT * self.num_image_token + IMG_END + video_tokens = ''.join([ + f'Frame{i+1}: {context_tokens}' + for i in range(num_patches) + ]) + text = text.replace('<video>', video_tokens, 1) + else: + num_patches = num_patches_images.pop(0) + pixel_values.append(pixel_values_images.pop(0)) + context_tokens = IMG_CONTEXT * self.num_image_token \ + * num_patches + image_tokens = IMG_START + context_tokens + IMG_END + text = text.replace('<image>', image_tokens, 1) pixel_values = torch.cat(pixel_values, dim=0) - for num_patches in num_patches_list: - context_tokens = IMG_CONTEXT * self.num_image_token \ - * num_patches - image_tokens = IMG_START + context_tokens + IMG_END - text = text.replace('<image>', image_tokens, 1) + prompt = self.tokenizer(text, return_tensors="pt") prompt.update({"pixel_values": pixel_values}) return prompt diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index e6b70a4438e9e..d7f950c23d954 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -9,15 +9,15 @@ from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk, UserMessage) from mistral_common.protocol.instruct.request import ChatCompletionRequest from PIL import Image -from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast from vllm.config import ModelConfig from vllm.inputs import InputProcessingContext from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict from vllm.multimodal.inputs import MultiModalInputs from 
vllm.multimodal.processing import BaseMultiModalProcessor, ProcessingCache -from vllm.transformers_utils.tokenizer import (MistralTokenizer, - cached_tokenizer_from_config) +from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer, + cached_tokenizer_from_config, + encode_tokens) from ....multimodal.utils import random_audio, random_image, random_video from ...registry import HF_EXAMPLE_MODELS @@ -28,7 +28,6 @@ def _test_processing_correctness( hit_rate: float, num_batches: int, simplify_rate: float, - ignore_mm_keys: Optional[set[str]] = None, ): model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id) model_info.check_available_online(on_fail="skip") @@ -41,7 +40,7 @@ def _test_processing_correctness( tokenizer_mode=model_info.tokenizer_mode, trust_remote_code=model_info.trust_remote_code, seed=0, - dtype="float16", + dtype="auto", revision=None, hf_overrides=model_info.hf_overrides, ) @@ -99,10 +98,23 @@ def _test_processing_correctness( } mm_counts = {k: len(vs) for k, vs in mm_data.items()} - prompt = dummy_inputs.get_dummy_processor_inputs( - model_config.max_model_len, - mm_counts, - ).prompt_text + + # Mistral chat outputs tokens directly, rather than text prompts + if isinstance(tokenizer, MistralTokenizer): + images = mm_data.get("image", []) + request = ChatCompletionRequest(messages=[ + UserMessage(content=[ + TextChunk(text=""), + *(ImageChunk(image=image) for image in images), + ]), + ]) + res = tokenizer.mistral.encode_chat_completion(request) + prompt = res.tokens + else: + prompt = dummy_inputs.get_dummy_processor_inputs( + model_config.max_model_len, + mm_counts, + ).prompt # Drop unnecessary keys and test single -> multi conversion if rng.rand() < simplify_rate: @@ -112,67 +124,59 @@ def _test_processing_correctness( elif len(mm_data[k]) == 1: mm_data[k] = mm_data[k][0] - if isinstance(tokenizer, MistralTokenizer): - _test_processing_correctness_mistral( - model_config, - tokenizer, - prompt, - mm_data, - baseline_processor, - 
cached_processor, - batch_idx, - ignore_mm_keys=ignore_mm_keys, - ) - else: - _test_processing_correctness_hf( - model_config, - tokenizer, - prompt, - mm_data, - baseline_processor, - cached_processor, - batch_idx, - ignore_mm_keys=ignore_mm_keys, - ) + _test_processing_correctness_one( + model_config, + tokenizer, + prompt, + mm_data, + baseline_processor, + cached_processor, + batch_idx, + ) -def _test_processing_correctness_hf( +# For some multimodal models, tokenizer will always add bos_token +# at the beginning of prompt by default, causing hf_processor outputs +# incorrect token ids. So we need use `add_special_tokens=False` here +# to leave bos_token to be added by the processor. +_ADD_SPECIAL_TOKENS_OVERRIDES = { + "mllama": False, + "ovis": False, + "ultravox": False, + "whisper": False, +} + +_IGNORE_MM_KEYS = { + # In Ultravox, the audio_features can be different depending on padding + # The slight difference should not be a problem though, since + # attention_mask lets us ignore the difference. + "ultravox": {"audio_features"}, +} + + +def _test_processing_correctness_one( model_config: ModelConfig, - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - prompt: str, + tokenizer: AnyTokenizer, + prompt: Union[str, list[int]], mm_data: MultiModalDataDict, baseline_processor: BaseMultiModalProcessor, cached_processor: BaseMultiModalProcessor, batch_idx: int, - ignore_mm_keys: Optional[set[str]] = None, ): - if model_config.hf_config.model_type in ("mllama", "ovis", "ultravox", - "whisper"): - # For some multimodal models, tokenizer will always add bos_token - # at the beginning of prompt by default, causing hf_processor outputs - # incorrect token ids. So we need use `add_special_tokens=False` here - # to leave bos_token to be added by the processor. 
- token_prompt = tokenizer.encode(prompt, add_special_tokens=False) + model_type = model_config.hf_config.model_type + ignore_mm_keys = _IGNORE_MM_KEYS.get(model_type, set[str]()) + + if isinstance(prompt, str): + text_prompt = prompt + token_prompt = encode_tokens( + tokenizer, + prompt, + add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type), + ) else: - token_prompt = tokenizer.encode(prompt) - - baseline_result = baseline_processor.apply( - prompt, - mm_data=mm_data, - hf_processor_mm_kwargs={}, - ) - cached_result = cached_processor.apply( - prompt, - mm_data=mm_data, - hf_processor_mm_kwargs={}, - ) - - _assert_inputs_equal( - baseline_result, - cached_result, - ignore_mm_keys=ignore_mm_keys, - msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})", - ) + # Mistral does not support decode_tokens with skip_special_tokens=False + text_prompt = None + token_prompt = prompt baseline_tokenized_result = baseline_processor.apply( token_prompt, @@ -180,56 +184,6 @@ def _test_processing_correctness_hf( hf_processor_mm_kwargs={}, ) - _assert_inputs_equal( - baseline_result, - baseline_tokenized_result, - ignore_mm_keys=ignore_mm_keys, - msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})", - ) - - cached_tokenized_result = cached_processor.apply( - token_prompt, - mm_data=mm_data, - hf_processor_mm_kwargs={}, - ) - - _assert_inputs_equal( - cached_result, - cached_tokenized_result, - ignore_mm_keys=ignore_mm_keys, - msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})", - ) - - -def _test_processing_correctness_mistral( - model_config: ModelConfig, - tokenizer: MistralTokenizer, - prompt: str, - mm_data: MultiModalDataDict, - baseline_processor: BaseMultiModalProcessor, - cached_processor: BaseMultiModalProcessor, - batch_idx: int, - ignore_mm_keys: Optional[set[str]] = None, -): - images = mm_data.get("image", []) - if not isinstance(images, list): - images = [images] - - request = ChatCompletionRequest(messages=[ - UserMessage(content=[ - TextChunk(text=prompt), 
- *(ImageChunk(image=image) for image in images), - ]), - ]) - res = tokenizer.mistral.encode_chat_completion(request) - token_prompt = res.tokens - - # Mistral chat outputs tokens directly, rather than text prompts - baseline_tokenized_result = baseline_processor.apply( - token_prompt, - mm_data=mm_data, - hf_processor_mm_kwargs={}, - ) cached_tokenized_result = cached_processor.apply( token_prompt, mm_data=mm_data, @@ -240,9 +194,44 @@ def _test_processing_correctness_mistral( baseline_tokenized_result, cached_tokenized_result, ignore_mm_keys=ignore_mm_keys, - msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})", + msg=f"Failed ({batch_idx=}, {token_prompt=}, {mm_data=})", ) + if text_prompt is not None: + baseline_text_result = baseline_processor.apply( + text_prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + cached_text_result = cached_processor.apply( + text_prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + _assert_inputs_equal( + baseline_text_result, + cached_text_result, + ignore_mm_keys=ignore_mm_keys, + msg=f"Failed ({batch_idx=}, {text_prompt=}, {mm_data=})", + ) + + _assert_inputs_equal( + baseline_text_result, + baseline_tokenized_result, + ignore_mm_keys=ignore_mm_keys, + msg=f"Failed ({batch_idx=}, {text_prompt=}, " + f"{token_prompt=}, {mm_data=})", + ) + + _assert_inputs_equal( + cached_text_result, + cached_tokenized_result, + ignore_mm_keys=ignore_mm_keys, + msg=f"Failed ({batch_idx=}, {text_prompt=}, " + f"{token_prompt=}, {mm_data=})", + ) + # yapf: disable @pytest.mark.parametrize("model_id", [ @@ -258,6 +247,7 @@ def _test_processing_correctness_mistral( "ibm-granite/granite-speech-3.3-8b", "h2oai/h2ovl-mississippi-800m", "OpenGVLab/InternVL2-1B", + "OpenGVLab/InternVL3-1B", "HuggingFaceM4/Idefics3-8B-Llama3", "HuggingFaceTB/SmolVLM2-2.2B-Instruct", "moonshotai/Kimi-VL-A3B-Instruct", @@ -280,6 +270,7 @@ def _test_processing_correctness_mistral( "AIDC-AI/Ovis2-1B", "google/paligemma-3b-mix-224", 
"google/paligemma2-3b-ft-docci-448", + "microsoft/Phi-3.5-vision-instruct", "microsoft/Phi-4-multimodal-instruct", "mistralai/Pixtral-12B-2409", "mistral-community/pixtral-12b", @@ -302,41 +293,6 @@ def test_processing_correctness( num_batches: int, simplify_rate: float, ): - ignore_mm_keys = None - if 'ultravox' in model_id: - # In Ultravox, the audio_features can be different depending on padding - # The slight difference should not be a problem though, since - # attention_mask lets us ignore the difference. - ignore_mm_keys = {"audio_features"} - - _test_processing_correctness( - model_id, - hit_rate=hit_rate, - num_batches=num_batches, - simplify_rate=simplify_rate, - ignore_mm_keys=ignore_mm_keys, - ) - - -# yapf: disable -@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"]) -@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) -@pytest.mark.parametrize("num_batches", [32]) -@pytest.mark.parametrize("simplify_rate", [1.0]) -# yapf: enable -def test_processing_correctness_phi3v( - model_id: str, - hit_rate: float, - num_batches: int, - simplify_rate: float, -): - # HACK - this is an attempted workaround for the following bug - # https://github.com/huggingface/transformers/issues/34307 - from transformers import AutoImageProcessor # noqa: F401 - from transformers import AutoProcessor # noqa: F401 - - AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True) - _test_processing_correctness( model_id, hit_rate=hit_rate, @@ -355,16 +311,10 @@ def _assert_inputs_equal( if ignore_mm_keys is None: ignore_mm_keys = set() - if msg is None: - assert "mm_kwargs" in a and "mm_kwargs" in b - else: - assert "mm_kwargs" in a and "mm_kwargs" in b, msg + assert "mm_kwargs" in a and "mm_kwargs" in b, msg for key in ignore_mm_keys: a["mm_kwargs"].pop(key, None) b["mm_kwargs"].pop(key, None) - if msg is None: - assert a == b - else: - assert a == b, msg + assert a == b, msg diff --git a/tests/models/multimodal/processing/test_mllama.py 
b/tests/models/multimodal/processing/test_mllama.py index b89376cf17229..d4794396f6d20 100644 --- a/tests/models/multimodal/processing/test_mllama.py +++ b/tests/models/multimodal/processing/test_mllama.py @@ -49,7 +49,7 @@ def test_profiling( ] * max_num_seqs mm_kwargs = processor.apply( - prompt=dummy_mm_data.prompt_text, + prompt=dummy_mm_data.prompt, mm_data=dummy_mm_data.mm_data, hf_processor_mm_kwargs=dict(), )["mm_kwargs"] diff --git a/tests/models/quantization/test_gguf.py b/tests/models/quantization/test_gguf.py index 3ff36502df57a..5f17d12284a04 100644 --- a/tests/models/quantization/test_gguf.py +++ b/tests/models/quantization/test_gguf.py @@ -78,8 +78,12 @@ DOLPHIN_CONFIG = GGUFTestConfig( ) MODELS = [ - LLAMA_CONFIG, QWEN2_CONFIG, PHI3_CONFIG, GPT2_CONFIG, STABLELM_CONFIG, - DOLPHIN_CONFIG + LLAMA_CONFIG, + QWEN2_CONFIG, + PHI3_CONFIG, + GPT2_CONFIG, + # STABLELM_CONFIG, # enable this when v1 support head_size=80 + DOLPHIN_CONFIG, # STARCODER_CONFIG, # broken ] diff --git a/tests/models/registry.py b/tests/models/registry.py index 84abd42e92313..fe49d2427c744 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -8,6 +8,8 @@ import pytest from packaging.version import Version from transformers import __version__ as TRANSFORMERS_VERSION +from vllm.config import TokenizerMode + @dataclass(frozen=True) class _HfExamplesInfo: @@ -20,7 +22,7 @@ class _HfExamplesInfo: tokenizer: Optional[str] = None """Set the tokenizer to load for this architecture.""" - tokenizer_mode: str = "auto" + tokenizer_mode: TokenizerMode = "auto" """Set the tokenizer type for this architecture.""" speculative_model: Optional[str] = None @@ -55,9 +57,18 @@ class _HfExamplesInfo: trust_remote_code: bool = False """The ``trust_remote_code`` level required to load the model.""" + v0_only: bool = False + """The model is only available with the vLLM V0 engine.""" + hf_overrides: dict[str, Any] = field(default_factory=dict) """The ``hf_overrides`` required to load the 
model.""" + max_model_len: Optional[int] = None + """ + The maximum model length to use for this model. Some models default to a + length that is too large to fit into memory in CI. + """ + def check_transformers_version( self, *, @@ -124,7 +135,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "BaichuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan2-7B-chat", trust_remote_code=True), "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B", - extras={"tiny": "hmellor/bamba-tiny-random"}), # noqa: E501 + extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}), # noqa: E501 "BloomForCausalLM": _HfExamplesInfo("bigscience/bloom-560m", {"1b": "bigscience/bloomz-1b1"}), "ChatGLMModel": _HfExamplesInfo("THUDM/chatglm3-6b", @@ -147,6 +158,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501 "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501 "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), + "FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-1.5B-Instruct", + is_available_online=False, + min_transformers_version="4.52.2"), "GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"), "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), "Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"), @@ -212,10 +226,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat", trust_remote_code=True), "PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"), - "PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"), + "PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2", v0_only=True), "Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"), "Phi3SmallForCausalLM": _HfExamplesInfo("microsoft/Phi-3-small-8k-instruct", - trust_remote_code=True), + trust_remote_code=True, + v0_only=True), "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct", 
trust_remote_code=True), "Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b", @@ -231,7 +246,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { is_available_online=False), "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b", # noqa: E501 is_available_online=False), - "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"), + "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t", + v0_only=True), "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"), "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"), "TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B", @@ -267,7 +283,7 @@ _EMBEDDING_EXAMPLE_MODELS = { "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), "ModernBertModel": _HfExamplesInfo("Alibaba-NLP/gte-modernbert-base", trust_remote_code=True), - "NomicBertModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-long", # noqa: E501 + "NomicBertModel": _HfExamplesInfo("nomic-ai/nomic-embed-text-v2-moe", trust_remote_code=True), "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"), "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"), @@ -300,7 +316,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"), "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"), # noqa: E501 "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b", # noqa: E501 - extras={"6b": "Salesforce/blip2-opt-6.7b"}), # noqa: E501 + extras={"6b": "Salesforce/blip2-opt-6.7b"}, # noqa: E501 + v0_only=True), "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501 "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501 extras={"fork": "Isotr0py/deepseek-vl2-tiny"}, # noqa: E501 @@ -319,15 +336,18 @@ _MULTIMODAL_EXAMPLE_MODELS = { max_transformers_version="4.48", # noqa: E501 
transformers_version_reason="HF model is not compatible."), # noqa: E501 "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", - extras={"2B": "OpenGVLab/InternVL2-2B"}, # noqa: E501 + extras={"2B": "OpenGVLab/InternVL2-2B", + "3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501 trust_remote_code=True), "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501 {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501 "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501 extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"}, # noqa: E501 - trust_remote_code=True), + trust_remote_code=True, + v0_only=True), "Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501 - min_transformers_version="4.51"), + min_transformers_version="4.51", + max_model_len=10240), "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf", extras={"mistral": "mistral-community/pixtral-12b", # noqa: E501 "mistral-fp8": "nm-testing/pixtral-12b-FP8-dynamic"}), # noqa: E501 @@ -346,7 +366,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { extras={"2.6": "openbmb/MiniCPM-V-2_6"}, # noqa: E501 trust_remote_code=True), "MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo("MiniMaxAI/MiniMax-VL-01", # noqa: E501 - trust_remote_code=True), + trust_remote_code=True, + v0_only=True), "Mistral3ForConditionalGeneration": _HfExamplesInfo("mistralai/Mistral-Small-3.1-24B-Instruct-2503", # noqa: E501 extras={"fp8": "nm-testing/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"}), # noqa: E501 "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924", @@ -379,6 +400,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct"), # noqa: E501 "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B", min_transformers_version="4.52"), + "Qwen2_5OmniForConditionalGeneration": 
_HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ", # noqa: E501 + min_transformers_version="4.52"), "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"), "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"), # noqa: E501 "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501 @@ -411,6 +434,11 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { trust_remote_code=True, speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", tokenizer="meta-llama/Llama-3.1-8B-Instruct"), + "EagleMiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-1B-sft-bf16", + trust_remote_code=True, + is_available_online=False, + speculative_model="openbmb/MiniCPM-2B-sft-bf16", + tokenizer="openbmb/MiniCPM-2B-sft-bf16"), "MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True, speculative_model="XiaomiMiMo/MiMo-7B-RL") diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 446c4efbf6af0..d403cb392fe06 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -15,12 +15,12 @@ from .registry import HF_EXAMPLE_MODELS @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs()) -def test_can_initialize(model_arch): +def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch): model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") - # Avoid OOM + # Avoid OOM and reduce initialization time by only using 1 layer def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: hf_config.update(model_info.hf_overrides) @@ -34,6 +34,12 @@ def test_can_initialize(model_arch): "num_local_experts": 2, }) + if hasattr(hf_config, "vision_config"): + hf_config.vision_config.update({ + "num_layers": 1, + "num_hidden_layers": 1, + }) + return hf_config # Avoid calling model.forward() @@ -46,7 +52,7 @@ def 
test_can_initialize(model_arch): scheduler_kv_cache_config = get_kv_cache_config( vllm_config, kv_cache_specs[0], - 20 * GiB_bytes, + 10 * GiB_bytes, ) # gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config @@ -55,7 +61,9 @@ def test_can_initialize(model_arch): with (patch.object(V0LLMEngine, "_initialize_kv_caches", _initialize_kv_caches_v0), patch.object(V1EngineCore, "_initialize_kv_caches", - _initialize_kv_caches_v1)): + _initialize_kv_caches_v1), monkeypatch.context() as m): + if model_info.v0_only: + m.setenv("VLLM_USE_V1", "0") LLM( model_info.default, tokenizer=model_info.tokenizer, @@ -65,6 +73,7 @@ def test_can_initialize(model_arch): "num_speculative_tokens": 1, } if model_info.speculative_model else None, trust_remote_code=model_info.trust_remote_code, + max_model_len=model_info.max_model_len, load_format="dummy", hf_overrides=hf_overrides, ) diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index b45a87d94b868..b62720caa9cb5 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -4,6 +4,7 @@ import pytest from vllm import LLM, SamplingParams from vllm.assets.image import ImageAsset +from vllm.multimodal.image import convert_image_mode from ..utils import create_new_process_for_each_test @@ -58,7 +59,7 @@ def test_oot_registration_embedding( assert all(v == 0 for v in output.outputs.embedding) -image = ImageAsset("cherry_blossom").pil_image.convert("RGB") +image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB") @create_new_process_for_each_test() diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index 6e38c4c7cadb3..1a51b4aeab04d 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -1,37 +1,50 @@ # SPDX-License-Identifier: Apache-2.0 """Test the functionality of the Transformers backend.""" +from typing import Any, Optional, Union + import pytest from vllm.platforms import 
current_platform from ..conftest import HfRunner, VllmRunner +from ..core.block.e2e.test_correctness_sliding_window import prep_prompts from ..utils import multi_gpu_test from .utils import check_logprobs_close def check_implementation( - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], + runner_ref: type[Union[HfRunner, VllmRunner]], + runner_test: type[VllmRunner], example_prompts: list[str], model: str, + kwargs_ref: Optional[dict[str, Any]] = None, + kwargs_test: Optional[dict[str, Any]] = None, **kwargs, ): + if kwargs_ref is None: + kwargs_ref = {} + if kwargs_test is None: + kwargs_test = {} + max_tokens = 32 num_logprobs = 5 - with vllm_runner(model, **kwargs) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) + args = (example_prompts, max_tokens, num_logprobs) - with hf_runner(model) as hf_model: - hf_outputs = hf_model.generate_greedy_logprobs_limit( - example_prompts, max_tokens, num_logprobs) + with runner_test(model, **kwargs_test, **kwargs) as model_test: + outputs_test = model_test.generate_greedy_logprobs(*args) + + with runner_ref(model, **kwargs_ref) as model_ref: + if isinstance(model_ref, VllmRunner): + outputs_ref = model_ref.generate_greedy_logprobs(*args) + else: + outputs_ref = model_ref.generate_greedy_logprobs_limit(*args) check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", + outputs_0_lst=outputs_ref, + outputs_1_lst=outputs_test, + name_0="ref", + name_1="test", ) @@ -58,6 +71,18 @@ def test_models( model_impl=model_impl) +def test_hybrid_attention(vllm_runner: type[VllmRunner]) -> None: + prompts, _, _ = prep_prompts(4, (800, 801)) + kwargs_ref = {"max_model_len": 8192, "enforce_eager": True} + kwargs_test = {"model_impl": "transformers", **kwargs_ref} + check_implementation(vllm_runner, + vllm_runner, + prompts, + model="hmellor/tiny-random-Gemma2ForCausalLM", + kwargs_ref=kwargs_ref, + 
kwargs_test=kwargs_test) + + @multi_gpu_test(num_gpus=2) def test_distributed( hf_runner: type[HfRunner], @@ -65,8 +90,11 @@ def test_distributed( example_prompts, ): kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2} - check_implementation(hf_runner, vllm_runner, example_prompts, - "meta-llama/Llama-3.2-1B-Instruct", **kwargs) + check_implementation(hf_runner, + vllm_runner, + example_prompts, + "meta-llama/Llama-3.2-1B-Instruct", + kwargs_test=kwargs) @pytest.mark.skipif( diff --git a/tests/models/utils.py b/tests/models/utils.py index a43fd77c6d794..ffc904bd10f46 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -2,7 +2,7 @@ import warnings from collections.abc import Sequence -from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union +from typing import Any, NamedTuple, Optional, Union import torch import torch.nn.functional as F @@ -13,9 +13,6 @@ from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs from .registry import HF_EXAMPLE_MODELS -if TYPE_CHECKING: - from ..conftest import HfRunner - TokensText = tuple[list[int], str] @@ -317,6 +314,7 @@ def check_embeddings_close( dim=0) fail_msg = (f"Test{prompt_idx}:" + f"\nCosine similarity: \t{sim:.4f}" f"\n{name_0}:\t{embeddings_0[:16]!r}" f"\n{name_1}:\t{embeddings_1[:16]!r}") @@ -337,22 +335,3 @@ class EmbedModelInfo(NamedTuple): architecture: str = "" dtype: str = "auto" enable_test: bool = True - - -def run_embedding_correctness_test( - hf_model: "HfRunner", - inputs: list[str], - vllm_outputs: Sequence[list[float]], - dimensions: Optional[int] = None, -): - hf_outputs = hf_model.encode(inputs) - if dimensions: - hf_outputs = matryoshka_fy(hf_outputs, dimensions) - - check_embeddings_close( - embeddings_0_lst=hf_outputs, - embeddings_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - tol=1e-2, - ) diff --git a/tests/multimodal/assets/rgba.png b/tests/multimodal/assets/rgba.png new file mode 100644 index 0000000000000..11eb81857a65b Binary files 
/dev/null and b/tests/multimodal/assets/rgba.png differ diff --git a/tests/multimodal/test_image.py b/tests/multimodal/test_image.py new file mode 100644 index 0000000000000..56b5475c9ca04 --- /dev/null +++ b/tests/multimodal/test_image.py @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: Apache-2.0 +from pathlib import Path + +import numpy as np +from PIL import Image, ImageChops + +from vllm.multimodal.image import convert_image_mode + +ASSETS_DIR = Path(__file__).parent / "assets" +assert ASSETS_DIR.exists() + + +def test_rgb_to_rgb(): + # Start with an RGB image. + original_image = Image.open(ASSETS_DIR / "image1.png").convert("RGB") + converted_image = convert_image_mode(original_image, "RGB") + + # RGB to RGB should be a no-op. + diff = ImageChops.difference(original_image, converted_image) + assert diff.getbbox() is None + + +def test_rgba_to_rgb(): + original_image = Image.open(ASSETS_DIR / "rgba.png") + original_image_numpy = np.array(original_image) + + converted_image = convert_image_mode(original_image, "RGB") + converted_image_numpy = np.array(converted_image) + + for i in range(original_image_numpy.shape[0]): + for j in range(original_image_numpy.shape[1]): + # Verify that all transparent pixels are converted to white. 
+ if original_image_numpy[i][j][3] == 0: + assert converted_image_numpy[i][j][0] == 255 + assert converted_image_numpy[i][j][1] == 255 + assert converted_image_numpy[i][j][2] == 255 diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 478184c34b915..f1e45da30eda4 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -10,6 +10,7 @@ import numpy as np import pytest from PIL import Image, ImageChops +from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import PlaceholderRange from vllm.multimodal.utils import (MediaConnector, merge_and_sort_multimodal_metadata) @@ -53,7 +54,7 @@ def get_supported_suffixes() -> tuple[str, ...]: def _image_equals(a: Image.Image, b: Image.Image) -> bool: - return (np.asarray(a) == np.asarray(b.convert(a.mode))).all() + return (np.asarray(a) == np.asarray(convert_image_mode(b, a.mode))).all() @pytest.mark.asyncio diff --git a/tests/neuron/1_core/test_neuron_quant.py b/tests/neuron/1_core/test_neuron_quant.py new file mode 100644 index 0000000000000..68f0cb8054b4f --- /dev/null +++ b/tests/neuron/1_core/test_neuron_quant.py @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 +from vllm.model_executor.layers.quantization.neuron_quant import ( + NeuronQuantConfig) + + +def test_get_supported_act_dtypes(): + neuron_quant_config = NeuronQuantConfig() + supported_act_dtypes = neuron_quant_config.get_supported_act_dtypes() + target_list = ["any_dtype1", "any_dtype2"] + for dtype in target_list: + assert dtype in supported_act_dtypes diff --git a/tests/neuron/2_core/test_eagle.py b/tests/neuron/2_core/test_eagle.py new file mode 100644 index 0000000000000..d71c88689a994 --- /dev/null +++ b/tests/neuron/2_core/test_eagle.py @@ -0,0 +1,82 @@ +# SPDX-License-Identifier: Apache-2.0 + +import json +import os +import shutil +import tempfile + +import torch +from huggingface_hub import snapshot_download +from safetensors import safe_open + +from vllm import LLM, 
SamplingParams + + +def patch_eagle_draft_with_lm_head(target_model_id: str, + draft_model_id: str) -> str: + # In NxDI, draft model checkpoint must include lm_head weights from target + # model. For more details see https://awsdocs-neuron.readthedocs-hosted.com + # /en/latest/libraries/nxd-inference/developer_guides/feature-guide.html + # #eagle-checkpoint-compatibility + final_draft_dir = "/tmp/patched_eagle_draft" + + with tempfile.TemporaryDirectory() as tmp_dir: + target_dir = snapshot_download(repo_id=target_model_id, + local_dir=os.path.join( + tmp_dir, "target")) + draft_dir = snapshot_download(repo_id=draft_model_id, + local_dir=os.path.join(tmp_dir, "draft")) + + lm_head_key = "lm_head.weight" + index_path = os.path.join(target_dir, "model.safetensors.index.json") + with open(index_path) as f: + index = json.load(f) + shard_name = index["weight_map"][lm_head_key] + target_safetensor_path = os.path.join(target_dir, shard_name) + + with safe_open(target_safetensor_path, framework="pt") as f: + target_lm_head = f.get_tensor(lm_head_key) + + draft_path = os.path.join(draft_dir, "pytorch_model.bin") + draft_state_dict = torch.load(draft_path, map_location="cpu") + draft_state_dict[lm_head_key] = target_lm_head.to(torch.float16) + torch.save(draft_state_dict, draft_path) + + shutil.copytree(draft_dir, final_draft_dir, dirs_exist_ok=True) + + return final_draft_dir + + +def test_eagle(): + patched_draft_path = patch_eagle_draft_with_lm_head( + target_model_id="meta-llama/Llama-2-7b-hf", + draft_model_id="yuhuili/EAGLE-llama2-chat-7B") + llm = LLM( + model="meta-llama/Llama-2-7b-hf", + speculative_config={ + "model": patched_draft_path, + "num_speculative_tokens": 5, + "max_model_len": 128 + }, + max_num_seqs=1, + max_model_len=128, + tensor_parallel_size=2, + override_neuron_config={ + "enable_eagle_speculation": True, + "enable_fused_speculation": True, + "fused_qkv": True + }, + ) + prompts = [ + "The president of the United States is", + ] + outputs = 
llm.generate(prompts, SamplingParams(top_k=1)) + expected_output = " the head of state and head of government of " \ + "the United States. The president direct" + + for output in outputs: + generated_text = output.outputs[0].text + print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}") + assert (expected_output == generated_text) + + print("Neuron Eagle speculation test passed.") diff --git a/tests/neuron/2_core/test_mistral.py b/tests/neuron/2_core/test_mistral.py index 8acd082f2ded7..3e651502d1e2a 100644 --- a/tests/neuron/2_core/test_mistral.py +++ b/tests/neuron/2_core/test_mistral.py @@ -7,26 +7,58 @@ def test_mistral(): llm = LLM(model="mistralai/Mistral-7B-v0.1", tensor_parallel_size=2, max_num_seqs=4, - max_model_len=512, + max_model_len=128, use_v2_block_manager=True, override_neuron_config={ "sequence_parallel_enabled": False, "skip_warmup": True - }, - device="neuron") + }) + # Send more prompts than the compiled batch size (4) and request + # varying generation lengths to test accuracy related to Neuron + # specific sequence id sorting. prompts = [ "The president of the United States is", "The capital of France is", + "What is Annapurna labs?", + "I believe the meaning of life is", + "Tell me a story about a brave knight", + "Hello, my name is Llama", ] - outputs = llm.generate(prompts, SamplingParams(top_k=1)) + + sampling_params = [ + SamplingParams(top_k=1, max_tokens=10), + SamplingParams(top_k=1, max_tokens=20), + SamplingParams(top_k=1, max_tokens=30), + SamplingParams(top_k=1, max_tokens=40), + SamplingParams(top_k=1, max_tokens=50), + SamplingParams(top_k=1, max_tokens=60) + ] + + outputs = llm.generate(prompts, sampling_params) expected_outputs = [ - " the most powerful person in the world. He is the head of state " - "and head", - " a city of many faces. It is a city of history, culture, art" + " the most powerful person in the world. He is", + " a city of many faces. 
It is a city of history, culture, art, " + "fashion, and", + "\n\nAnnapurna Labs is a semiconductor company that was founded " + "in 2013 by Amazon. The company is", + " to be happy.\n\nI believe that happiness is a choice.\n\nI " + "believe that happiness is a state of mind.\n\nI believe that " + "happiness is a journey.\n\nI believe", + " who rescued a princess from a dragon.\n\nTell me a story about" + " a princess who rescued herself from a dragon.\n\nTell me a " + "story about a princess who rescued herself from a dragon and " + "then rescued a knight from", + " and I am a 10 year old male. I am a very friendly and " + "affectionate boy who loves to be around people. I am a very " + "active boy who loves to play and run around. I am a very smart " + "boy who loves to learn new things. I am a very loyal boy" ] for expected_output, output in zip(expected_outputs, outputs): generated_text = output.outputs[0].text + print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}") assert (expected_output == generated_text) + + print("Neuron Mistral test passed.") diff --git a/tests/neuron/2_core/test_multi_lora.py b/tests/neuron/2_core/test_multi_lora.py new file mode 100644 index 0000000000000..6fa8f9128def7 --- /dev/null +++ b/tests/neuron/2_core/test_multi_lora.py @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: Apache-2.0 + +from huggingface_hub import snapshot_download + +from vllm import LLM, SamplingParams +from vllm.lora.request import LoRARequest + + +def test_llama_single_lora(): + sql_lora_files = snapshot_download( + repo_id="yard1/llama-2-7b-sql-lora-test") + llm = LLM(model="meta-llama/Llama-2-7b-hf", + tensor_parallel_size=2, + max_num_seqs=4, + max_model_len=512, + use_v2_block_manager=True, + override_neuron_config={ + "sequence_parallel_enabled": False, + "skip_warmup": True, + "lora_modules": [{ + "name": "lora_id_1", + "path": sql_lora_files + }] + }, + enable_lora=True, + max_loras=1, + max_lora_rank=256, + device="neuron") + """For 
multi-lora requests using NxDI as the backend, only the lora_name + needs to be specified. The lora_id and lora_path are supplied at the LLM + class/server initialization, after which the paths are handled by NxDI""" + lora_req_1 = LoRARequest("lora_id_1", 0, " ") + prompts = [ + "The president of the United States is", + "The capital of France is", + ] + outputs = llm.generate(prompts, + SamplingParams(top_k=1), + lora_request=[lora_req_1, lora_req_1]) + + expected_outputs = [ + " the head of state and head of government of the United States. " + "The president direct", + " a city of contrasts. The city is home to the Eiffel Tower" + ] + + for expected_output, output in zip(expected_outputs, outputs): + generated_text = output.outputs[0].text + assert (expected_output == generated_text) + + +def test_llama_multiple_lora(): + sql_lora_files = snapshot_download( + repo_id="yard1/llama-2-7b-sql-lora-test") + llm = LLM(model="meta-llama/Llama-2-7b-hf", + tensor_parallel_size=2, + max_num_seqs=4, + max_model_len=512, + use_v2_block_manager=True, + override_neuron_config={ + "sequence_parallel_enabled": + False, + "skip_warmup": + True, + "lora_modules": [{ + "name": "lora_id_1", + "path": sql_lora_files + }, { + "name": "lora_id_2", + "path": sql_lora_files + }] + }, + enable_lora=True, + max_loras=2, + max_lora_rank=256, + device="neuron") + """For multi-lora requests using NxDI as the backend, only the lora_name + needs to be specified. The lora_id and lora_path are supplied at the LLM + class/server initialization, after which the paths are handled by NxDI""" + lora_req_1 = LoRARequest("lora_id_1", 0, " ") + lora_req_2 = LoRARequest("lora_id_2", 1, " ") + prompts = [ + "The president of the United States is", + "The capital of France is", + ] + outputs = llm.generate(prompts, + SamplingParams(top_k=1), + lora_request=[lora_req_1, lora_req_2]) + + expected_outputs = [ + " the head of state and head of government of the United States. 
" + "The president direct", + " a city of contrasts. The city is home to the Eiffel Tower" + ] + + for expected_output, output in zip(expected_outputs, outputs): + generated_text = output.outputs[0].text + assert (expected_output == generated_text) diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index 9d6872e0e0772..207de53abd8d1 100644 --- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -29,5 +29,5 @@ def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch): # ignore the backend env variable if it is set with monkeypatch.context() as m: m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) - backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) + backend = get_attn_backend(16, torch.float16, "auto", 16, False) assert backend.get_name() == "Dummy_Backend" diff --git a/tests/runai_model_streamer_test/test_weight_utils.py b/tests/runai_model_streamer_test/test_weight_utils.py index 4afa76c51693f..06e506c35761e 100644 --- a/tests/runai_model_streamer_test/test_weight_utils.py +++ b/tests/runai_model_streamer_test/test_weight_utils.py @@ -23,10 +23,11 @@ def test_runai_model_loader(): runai_model_streamer_tensors = {} hf_safetensors_tensors = {} - for name, tensor in runai_safetensors_weights_iterator(safetensors): + for name, tensor in runai_safetensors_weights_iterator( + safetensors, True): runai_model_streamer_tensors[name] = tensor - for name, tensor in safetensors_weights_iterator(safetensors): + for name, tensor in safetensors_weights_iterator(safetensors, True): hf_safetensors_tensors[name] = tensor assert len(runai_model_streamer_tensors) == len(hf_safetensors_tensors) diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py index 355e3adcf5f30..f9688b4b9b272 100644 --- a/tests/samplers/test_no_bad_words.py +++ b/tests/samplers/test_no_bad_words.py @@ -103,7 +103,7 @@ class TestTwoTokenBadWord: 
add_special_tokens=False)[0] def test_two_token_bad_word(self, vllm_runner): - with vllm_runner(self.MODEL) as llm: + with vllm_runner(self.MODEL, dtype="half") as llm: output_token_ids = self._generate(llm) assert output_token_ids[:2] == [ self.target_token_id1, self.target_token_id2 diff --git a/tests/tensorizer_loader/conftest.py b/tests/tensorizer_loader/conftest.py index 7efef163d2b92..ce8689f5b89c1 100644 --- a/tests/tensorizer_loader/conftest.py +++ b/tests/tensorizer_loader/conftest.py @@ -5,14 +5,6 @@ from vllm.distributed import cleanup_dist_env_and_memory from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Tensorizer only tested on V0 so far. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - @pytest.fixture(autouse=True) def cleanup(): cleanup_dist_env_and_memory(shutdown_ray=True) diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 7136dd44de03d..747ec56ad6298 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -1,17 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 import gc -import json import os import pathlib import subprocess -from functools import partial -from unittest.mock import MagicMock, patch -import openai import pytest import torch -from huggingface_hub import snapshot_download from vllm import SamplingParams from vllm.engine.arg_utils import EngineArgs @@ -20,14 +15,12 @@ from vllm.engine.arg_utils import EngineArgs from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig, TensorSerializer, is_vllm_tensorized, - load_with_tensorizer, open_stream, - serialize_vllm_model, tensorize_vllm_model) # yapf: enable -from vllm.utils import PlaceholderModule, import_from_path +from vllm.utils import PlaceholderModule -from ..utils import VLLM_PATH, RemoteOpenAIServer +from ..utils import VLLM_PATH try: from tensorizer 
import EncryptionParams @@ -66,21 +59,6 @@ def write_keyfile(keyfile_path: str): f.write(encryption_params.key) -@patch('vllm.model_executor.model_loader.tensorizer.TensorizerAgent') -def test_load_with_tensorizer(mock_agent, tensorizer_config): - mock_linear_method = MagicMock() - mock_agent_instance = mock_agent.return_value - mock_agent_instance.deserialize.return_value = MagicMock() - - result = load_with_tensorizer(tensorizer_config, - quant_method=mock_linear_method) - - mock_agent.assert_called_once_with(tensorizer_config, - quant_method=mock_linear_method) - mock_agent_instance.deserialize.assert_called_once() - assert result == mock_agent_instance.deserialize.return_value - - @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_can_deserialize_s3(vllm_runner): model_ref = "EleutherAI/pythia-1.4b" @@ -103,6 +81,7 @@ def test_can_deserialize_s3(vllm_runner): @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_deserialized_encrypted_vllm_model_has_same_outputs( vllm_runner, tmp_path): + args = EngineArgs(model=model_ref) with vllm_runner(model_ref) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") key_path = tmp_path / (model_ref + ".key") @@ -110,15 +89,13 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs( outputs = vllm_model.generate(prompts, sampling_params) - config_for_serializing = TensorizerConfig(tensorizer_uri=model_path, - encryption_keyfile=key_path) + config_for_serializing = TensorizerConfig(tensorizer_uri=str(model_path), + encryption_keyfile=str(key_path)) - vllm_model.apply_model( - partial(serialize_vllm_model, - tensorizer_config=config_for_serializing)) + tensorize_vllm_model(args, config_for_serializing) - config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path, - encryption_keyfile=key_path) + config_for_deserializing = TensorizerConfig( + tensorizer_uri=str(model_path), encryption_keyfile=str(key_path)) with vllm_runner(model_ref, 
load_format="tensorizer", @@ -154,113 +131,46 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, assert outputs == deserialized_outputs -def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): - multilora_inference = import_from_path( - "examples.offline_inference.multilora_inference", - EXAMPLES_PATH / "offline_inference/multilora_inference.py", - ) - - model_ref = "meta-llama/Llama-2-7b-hf" - lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") - test_prompts = multilora_inference.create_test_prompts(lora_path) - - # Serialize model before deserializing and binding LoRA adapters - with vllm_runner(model_ref) as vllm_model: - model_path = tmp_path / (model_ref + ".tensors") - - vllm_model.apply_model( - partial( - serialize_vllm_model, - tensorizer_config=TensorizerConfig(tensorizer_uri=model_path))) - - with vllm_runner( - model_ref, - load_format="tensorizer", - model_loader_extra_config=TensorizerConfig( - tensorizer_uri=model_path, - num_readers=1, - ), - enable_lora=True, - max_loras=1, - max_lora_rank=8, - max_cpu_loras=2, - max_num_seqs=50, - max_model_len=1000, - ) as loaded_vllm_model: - multilora_inference.process_requests( - loaded_vllm_model.model.llm_engine, test_prompts) - - assert loaded_vllm_model - - -def test_load_without_tensorizer_load_format(vllm_runner): +def test_load_without_tensorizer_load_format(vllm_runner, capfd): model = None - with pytest.raises(ValueError): + try: model = vllm_runner( model_ref, model_loader_extra_config=TensorizerConfig(tensorizer_uri="test")) - del model - gc.collect() - torch.cuda.empty_cache() + except RuntimeError: + out, err = capfd.readouterr() + combined_output = out + err + assert ("ValueError: Model loader extra config " + "is not supported for load " + "format LoadFormat.AUTO") in combined_output + finally: + del model + gc.collect() + torch.cuda.empty_cache() -@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") -def 
test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): - ## Serialize model - with vllm_runner(model_ref) as vllm_model: - model_path = tmp_path / (model_ref + ".tensors") - - vllm_model.apply_model( - partial( - serialize_vllm_model, - tensorizer_config=TensorizerConfig(tensorizer_uri=model_path))) - - model_loader_extra_config = { - "tensorizer_uri": str(model_path), - } - - ## Start OpenAI API server - openai_args = [ - "--dtype", - "float16", - "--load-format", - "tensorizer", - "--model-loader-extra-config", - json.dumps(model_loader_extra_config), - ] - - with RemoteOpenAIServer(model_ref, openai_args) as server: - print("Server ready.") - - client = server.get_client() - completion = client.completions.create(model=model_ref, - prompt="Hello, my name is", - max_tokens=5, - temperature=0.0) - - assert completion.id is not None - assert len(completion.choices) == 1 - assert len(completion.choices[0].text) >= 5 - assert completion.choices[0].finish_reason == "length" - assert completion.usage == openai.types.CompletionUsage( - completion_tokens=5, prompt_tokens=6, total_tokens=11) - - -def test_raise_value_error_on_invalid_load_format(vllm_runner): +def test_raise_value_error_on_invalid_load_format(vllm_runner, capfd): model = None - with pytest.raises(ValueError): + try: model = vllm_runner( model_ref, load_format="safetensors", model_loader_extra_config=TensorizerConfig(tensorizer_uri="test")) - del model - gc.collect() - torch.cuda.empty_cache() + except RuntimeError: + out, err = capfd.readouterr() + + combined_output = out + err + assert ("ValueError: Model loader extra config is not supported " + "for load format LoadFormat.SAFETENSORS") in combined_output + finally: + del model + gc.collect() + torch.cuda.empty_cache() @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs") -def test_tensorizer_with_tp_path_without_template(vllm_runner): - with pytest.raises(ValueError): +def 
test_tensorizer_with_tp_path_without_template(vllm_runner, capfd): + try: model_ref = "EleutherAI/pythia-1.4b" tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors" @@ -275,6 +185,13 @@ def test_tensorizer_with_tp_path_without_template(vllm_runner): tensor_parallel_size=2, disable_custom_all_reduce=True, ) + except RuntimeError: + out, err = capfd.readouterr() + combined_output = out + err + assert ("ValueError: For a sharded model, tensorizer_uri " + "should include a string format template like '%04d' " + "to be formatted with the rank " + "of the shard") in combined_output @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs") @@ -288,7 +205,6 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs( enforce_eager=True, ) as base_model: outputs = base_model.generate(prompts, sampling_params) - base_model.model.llm_engine.model_executor.shutdown() # load model with two shards and serialize with encryption model_path = str(tmp_path / (model_ref + "-%02d.tensors")) @@ -296,7 +212,7 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs( tensorizer_config = TensorizerConfig( tensorizer_uri=model_path, - encryption_keyfile=key_path, + encryption_keyfile=str(key_path), ) tensorize_vllm_model( @@ -331,14 +247,13 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): model_ref = "facebook/opt-125m" model_path = tmp_path / (model_ref + ".tensors") config = TensorizerConfig(tensorizer_uri=str(model_path)) + args = EngineArgs(model=model_ref, device="cuda") with vllm_runner(model_ref) as vllm_model: outputs = vllm_model.generate(prompts, sampling_params) - vllm_model.apply_model( - partial(serialize_vllm_model, tensorizer_config=config)) - - assert is_vllm_tensorized(config) + tensorize_vllm_model(args, config) + assert is_vllm_tensorized(config) with vllm_runner(model_ref, load_format="tensorizer", diff --git a/tests/test_logger.py b/tests/test_logger.py index 
11deae309ac8b..046f70504c899 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -1,10 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 - +import enum import json import logging import os import sys import tempfile +from dataclasses import dataclass from json.decoder import JSONDecodeError from tempfile import NamedTemporaryFile from typing import Any @@ -16,6 +17,7 @@ import pytest from vllm.logger import (_DATE_FORMAT, _FORMAT, _configure_vllm_root_logger, enable_trace_function_call, init_logger) from vllm.logging_utils import NewLineFormatter +from vllm.logging_utils.dump_input import prepare_object_to_dump def f1(x): @@ -216,3 +218,37 @@ def test_custom_logging_config_causes_an_error_if_configure_logging_is_off(): assert other_logger.handlers != root_logger.handlers assert other_logger.level != root_logger.level assert other_logger.propagate + + +def test_prepare_object_to_dump(): + str_obj = 'str' + assert prepare_object_to_dump(str_obj) == "'str'" + + list_obj = [1, 2, 3] + assert prepare_object_to_dump(list_obj) == '[1, 2, 3]' + + dict_obj = {'a': 1, 'b': 'b'} + assert prepare_object_to_dump(dict_obj) in [ + "{a: 1, b: 'b'}", "{b: 'b', a: 1}" + ] + + set_obj = {1, 2, 3} + assert prepare_object_to_dump(set_obj) == '[1, 2, 3]' + + tuple_obj = ('a', 'b', 'c') + assert prepare_object_to_dump(tuple_obj) == "['a', 'b', 'c']" + + class CustomEnum(enum.Enum): + A = enum.auto() + B = enum.auto() + C = enum.auto() + + assert prepare_object_to_dump(CustomEnum.A) == repr(CustomEnum.A) + + @dataclass + class CustomClass: + a: int + b: str + + assert (prepare_object_to_dump(CustomClass( + 1, 'b')) == "CustomClass(a=1, b='b')") diff --git a/tests/test_outputs.py b/tests/test_outputs.py new file mode 100644 index 0000000000000..c41bd6723ba11 --- /dev/null +++ b/tests/test_outputs.py @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm.outputs import RequestOutput + + +def test_request_output_forward_compatible(): + output = 
RequestOutput(request_id="test_request_id", + prompt="test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[], + finished=False, + example_arg_added_in_new_version="some_value") + assert output is not None diff --git a/tests/test_regression.py b/tests/test_regression.py index 8c9d4a91c73be..e092945422edb 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -60,6 +60,9 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch): # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary with monkeypatch.context() as m: m.setenv("VLLM_USE_MODELSCOPE", "True") + # Don't use HF_TOKEN for ModelScope repos, otherwise it will fail + # with 400 Client Error: Bad Request. + m.setenv("HF_TOKEN", "") llm = LLM(model="qwen/Qwen1.5-0.5B-Chat") prompts = [ diff --git a/tests/test_utils.py b/tests/test_utils.py index 0b88d05efeaad..dd8777f068887 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -17,7 +17,8 @@ from vllm_test_utils.monitor import monitor from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config from vllm.utils import (CacheInfo, FlexibleArgumentParser, LRUCache, MemorySnapshot, PlaceholderModule, StoreBoolean, - bind_kv_cache, deprecate_kwargs, get_open_port, + bind_kv_cache, common_broadcastable_dtype, + deprecate_kwargs, get_open_port, is_lossless_cast, make_zmq_path, make_zmq_socket, memory_profiling, merge_async_iterators, sha256, split_zmq_path, supports_kw, swap_dict_values) @@ -567,12 +568,65 @@ def test_lru_cache(): assert 6 in cache +# yapf: disable +@pytest.mark.parametrize( + ("src_dtype", "tgt_dtype", "expected_result"), + [ + # Different precision_levels + (torch.bool, torch.int8, True), + (torch.bool, torch.float16, True), + (torch.bool, torch.complex32, True), + (torch.int64, torch.bool, False), + (torch.int64, torch.float16, True), + (torch.int64, torch.complex32, True), + (torch.float64, torch.bool, False), + (torch.float64, torch.int8, False), + 
(torch.float64, torch.complex32, True), + (torch.complex128, torch.bool, False), + (torch.complex128, torch.int8, False), + (torch.complex128, torch.float16, False), + # precision_level=0 + (torch.bool, torch.bool, True), + # precision_level=1 + (torch.int8, torch.int16, True), + (torch.int16, torch.int8, False), + (torch.uint8, torch.int8, False), + (torch.int8, torch.uint8, False), + # precision_level=2 + (torch.float16, torch.float32, True), + (torch.float32, torch.float16, False), + (torch.bfloat16, torch.float32, True), + (torch.float32, torch.bfloat16, False), + # precision_level=3 + (torch.complex32, torch.complex64, True), + (torch.complex64, torch.complex32, False), + ], +) +# yapf: enable +def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result): + assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result + + +# yapf: disable +@pytest.mark.parametrize( + ("dtypes", "expected_result"), + [ + ([torch.bool], torch.bool), + ([torch.bool, torch.int8], torch.int8), + ([torch.bool, torch.int8, torch.float16], torch.float16), + ([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32), # noqa: E501 + ], +) +# yapf: enable +def test_common_broadcastable_dtype(dtypes, expected_result): + assert common_broadcastable_dtype(dtypes) == expected_result + + def test_placeholder_module_error_handling(): placeholder = PlaceholderModule("placeholder_1234") def build_ctx(): - return pytest.raises(ModuleNotFoundError, - match="No module named") + return pytest.raises(ModuleNotFoundError, match="No module named") with build_ctx(): int(placeholder) @@ -608,6 +662,7 @@ def test_placeholder_module_error_handling(): _ = placeholder_attr.module +# yapf: disable @pytest.mark.parametrize( "obj,key1,key2", [ @@ -618,6 +673,7 @@ def test_placeholder_module_error_handling(): # Tests for both keys do not exist ({1: "a", 2: "b"}, 3, 4), ]) +# yapf: enable def test_swap_dict_values(obj, key1, key2): original_obj = obj.copy() swap_dict_values(obj, key1, 
key2) @@ -631,19 +687,19 @@ def test_swap_dict_values(obj, key1, key2): assert key1 not in obj -def test_model_specification(parser_with_config, - cli_config_file, +def test_model_specification(parser_with_config, cli_config_file, cli_config_file_with_model): # Test model in CLI takes precedence over config - args = parser_with_config.parse_args([ - 'serve', 'cli-model', '--config', cli_config_file_with_model - ]) + args = parser_with_config.parse_args( + ['serve', 'cli-model', '--config', cli_config_file_with_model]) assert args.model_tag == 'cli-model' assert args.served_model_name == 'mymodel' # Test model from config file works args = parser_with_config.parse_args([ - 'serve', '--config', cli_config_file_with_model, + 'serve', + '--config', + cli_config_file_with_model, ]) assert args.model == 'config-model' assert args.served_model_name == 'mymodel' @@ -654,17 +710,19 @@ def test_model_specification(parser_with_config, # Test using --model option raises error with pytest.raises( - ValueError, - match=( - "With `vllm serve`, you should provide the model as a positional " - "argument or in a config file instead of via the `--model` option." 
- ), + ValueError, + match= + ("With `vllm serve`, you should provide the model as a positional " + "argument or in a config file instead of via the `--model` option."), ): parser_with_config.parse_args(['serve', '--model', 'my-model']) # Test other config values are preserved args = parser_with_config.parse_args([ - 'serve', 'cli-model', '--config', cli_config_file_with_model, + 'serve', + 'cli-model', + '--config', + cli_config_file_with_model, ]) assert args.tensor_parallel_size == 2 assert args.trust_remote_code is True @@ -673,7 +731,7 @@ def test_model_specification(parser_with_config, @pytest.mark.parametrize("input", [(), ("abc", ), (None, ), - (None, bool, [1, 2, 3])]) + (None, bool, [1, 2, 3])]) @pytest.mark.parametrize("output", [0, 1, 2]) def test_sha256(input: tuple, output: int): hash = sha256(input) @@ -682,7 +740,8 @@ def test_sha256(input: tuple, output: int): assert hash != 0 bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL) - assert hash == int.from_bytes(hashlib.sha256(bytes).digest(), byteorder="big") + assert hash == int.from_bytes(hashlib.sha256(bytes).digest(), + byteorder="big") # hashing again, returns the same value assert hash == sha256(input) @@ -698,8 +757,7 @@ def test_sha256(input: tuple, output: int): ("tcp://127.0.0.1:5555", ("tcp", "127.0.0.1", "5555")), ("tcp://[::1]:5555", ("tcp", "::1", "5555")), # IPv6 address ("inproc://some_identifier", ("inproc", "some_identifier", "")), - ] -) + ]) def test_split_zmq_path(path, expected): assert split_zmq_path(path) == expected @@ -711,8 +769,7 @@ def test_split_zmq_path(path, expected): "tcp://127.0.0.1", # Missing port "tcp://[::1]", # Missing port for IPv6 "tcp://:5555", # Missing host - ] -) + ]) def test_split_zmq_path_invalid(invalid_path): with pytest.raises(ValueError): split_zmq_path(invalid_path) @@ -734,7 +791,8 @@ def test_make_zmq_socket_ipv6(): zsock: zmq.Socket = make_zmq_socket(ctx, ipv6_path, socket_type) # Verify that the IPV6 option is set - assert 
zsock.getsockopt(zmq.IPV6) == 1, "IPV6 option should be enabled for IPv6 addresses" + assert zsock.getsockopt( + zmq.IPV6) == 1, "IPV6 option should be enabled for IPv6 addresses" # Clean up zsock.close() diff --git a/tests/tokenization/test_mistral_tokenizer.py b/tests/tokenization/test_mistral_tokenizer.py index f1c880286951a..b16d9af35be98 100644 --- a/tests/tokenization/test_mistral_tokenizer.py +++ b/tests/tokenization/test_mistral_tokenizer.py @@ -1,15 +1,18 @@ # SPDX-License-Identifier: Apache-2.0 import pytest -from mistral_common.protocol.instruct.messages import UserMessage +from mistral_common.protocol.instruct.messages import (AssistantMessage, + ToolMessage, + UserMessage) from mistral_common.protocol.instruct.request import ChatCompletionRequest -from mistral_common.protocol.instruct.tool_calls import Function, Tool +from mistral_common.protocol.instruct.tool_calls import (Function, + FunctionCall, Tool, + ToolCall) from vllm.transformers_utils.tokenizers.mistral import ( make_mistral_chat_completion_request) -# yapf: enable @pytest.mark.parametrize( "openai_request,expected_mistral_request", [( @@ -78,6 +81,107 @@ from vllm.transformers_utils.tokenizers.mistral import ( ) def test_make_mistral_chat_completion_request(openai_request, expected_mistral_request): - assert (make_mistral_chat_completion_request( - openai_request["messages"], - openai_request["tools"]) == expected_mistral_request) + actual_request = make_mistral_chat_completion_request( + openai_request["messages"], openai_request["tools"]) + assert actual_request == expected_mistral_request + + +# Tool use with list content and reasoning_content +@pytest.mark.parametrize("openai_request,expected_mistral_request", [( + { + "messages": [ + { + "role": "user", + "content": "What's the weather in Paris?", + }, + { + "role": + "assistant", + "reasoning_content": + None, + "content": + None, + "tool_calls": [{ + "id": "call123", + "type": "function", + "function": { + "name": "get_weather", + 
"arguments": '{"city": "Paris"}', + }, + }], + }, + { + "role": "tool", + "content": [{ + "type": "text", + "text": "Rainy" + }], + "name": "get_weather", + "tool_call_id": "call123", + }, + ], + "tools": [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Gets the current weather in a city.", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The city name" + } + }, + "required": ["city"], + }, + }, + }], + }, + ChatCompletionRequest( + messages=[ + UserMessage(content="What's the weather in Paris?"), + AssistantMessage( + content=None, + tool_calls=[ + ToolCall( + id="call123", + function=FunctionCall( + name="get_weather", + arguments='{"city": "Paris"}', + ), + ) + ], + ), + ToolMessage( + content="Rainy", + tool_call_id="call123", + name="get_weather", + ), + ], + tools=[ + Tool( + type="function", + function=Function( + name="get_weather", + description="Gets the current weather in a city.", + parameters={ + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The city name" + } + }, + "required": ["city"], + }, + ), + ) + ], + ), +)]) +def test_make_mistral_chat_completion_request_list_content( + openai_request, expected_mistral_request): + actual_request = make_mistral_chat_completion_request( + openai_request["messages"], openai_request["tools"]) + assert actual_request == expected_mistral_request diff --git a/tests/tool_use/test_tool_choice_required.py b/tests/tool_use/test_tool_choice_required.py index 2ab87a0ef41ff..2917698481453 100644 --- a/tests/tool_use/test_tool_choice_required.py +++ b/tests/tool_use/test_tool_choice_required.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 import json -import re from copy import deepcopy from unittest.mock import MagicMock import pytest +import regex as re from pydantic import TypeAdapter from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, @@ -333,4 +333,4 @@ def 
test_streaming_output_valid(output, empty_params, delta_len): combined_messages += message.tool_calls[0].function.arguments combined_messages += "}]" assert json.loads(combined_messages) == output - assert json.dumps(json.loads(combined_messages)) == output_json + assert json.dumps(json.loads(combined_messages)) == output_json \ No newline at end of file diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index c14eaf71e978f..efa6455c41df7 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -88,7 +88,7 @@ CONFIGS: dict[str, ServerConfig] = { "meta-llama/Llama-4-Scout-17B-16E-Instruct", "arguments": [ "--enforce-eager", "--no-enable-prefix-caching", - "--tool-call-parser", "pythonic", "--chat-template", + "--tool-call-parser", "llama4_pythonic", "--chat-template", str(VLLM_PATH / "examples/tool_chat_template_llama4_pythonic.jinja"), "-tp", "4" diff --git a/tests/tpu/lora/test_pallas_kernels.py b/tests/tpu/lora/test_pallas_kernels.py deleted file mode 100644 index 8bd47de50c340..0000000000000 --- a/tests/tpu/lora/test_pallas_kernels.py +++ /dev/null @@ -1,73 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -import pytest -import torch - -# Required to register the custom ops -import vllm.lora.ops.xla_ops.pallas # noqa # pylint: disable=unused-import - -N_TOKENS = [16, 1024, 4096] -HIDDEN_SIZES = [1024, 2048, 4096] - -DTYPES = [torch.bfloat16] -NUM_LORA = [1, 4, 16] -RANKS = [32, 256, 512] - - -def generate_test_data(T, D, L, N, seed, dtype=torch.float32): - """ - Inputs: (All integers) - T: Total number of tokens - D: Input dim - L: LoRA Dim - N: N LoRAs - - Outputs: - inputs: torch.Tensor - shape (T, D) - loras: torch.Tensor - shape (N, 1, L, D) - idxs: torch.Tensor - shape (T, ) - all values must be in [0, N) - - ref_output: torch.Tensor - shape (T, L) - inputs @ loras[idxs].T - """ - torch.manual_seed(seed) - - inputs = torch.randn((T, D), device="xla", dtype=dtype) - loras = torch.randn((N, 1, L, D), device="xla", dtype=dtype) - idxs = 
torch.randint(0, N, (T, ), dtype=torch.int32, device="xla") - - ref_output = ref_bgmv(inputs, loras, idxs) - return inputs, loras, idxs, ref_output - - -def ref_bgmv(inputs: torch.Tensor, loras: torch.Tensor, idxs: torch.Tensor): - selected_loras = loras[idxs] - if len(selected_loras.shape) == 4: - selected_loras = selected_loras.squeeze(axis=1) - - batch_size, output_size, input_size = selected_loras.shape - return (selected_loras @ inputs.reshape( - (batch_size, input_size, 1))).reshape((batch_size, output_size)) - - -# Parameterize tests with various shapes and dtypes -@pytest.mark.parametrize("T", N_TOKENS) -@pytest.mark.parametrize("D", HIDDEN_SIZES) -@pytest.mark.parametrize("L", RANKS) -@pytest.mark.parametrize("N", NUM_LORA) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", ["shrink", "expand"]) -@pytest.mark.parametrize("seed", [0]) -def test_bgmv_correctness(T, D, L, N, dtype, op_type, seed): - if op_type == "expand": - D, L = L, D - - inputs, loras, idxs, ref_output = generate_test_data( - T, D, L, N, seed, dtype) - - # Run bgmv - output = torch.ops.xla.bgmv(inputs, loras, idxs) - - # Make sure we have no NaNs - assert not torch.any(torch.isnan(output)) - - # Compare with reference output - assert torch.allclose(output, ref_output, rtol=1e-2, atol=1e-2) diff --git a/tests/tpu/test_moe_pallas.py b/tests/tpu/test_moe_pallas.py index 13fc8bc8fa2ed..19df22f780396 100644 --- a/tests/tpu/test_moe_pallas.py +++ b/tests/tpu/test_moe_pallas.py @@ -26,7 +26,7 @@ TOP_KS = [2, 6] # The Pallas GMM kernel requires num_tokens * topk to be a multiple of 16 @pytest.mark.parametrize("m", [8, 16, 64, 2048]) @pytest.mark.parametrize("n", [128, 1024, 2048]) -@pytest.mark.parametrize("k", [128, 511, 1024]) +@pytest.mark.parametrize("k", [128, 512, 1024]) @pytest.mark.parametrize("e", NUM_EXPERTS) @pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("ep_size", EP_SIZE) diff --git a/tests/tracing/test_tracing.py 
b/tests/tracing/test_tracing.py index a781b8b563be1..caa233ec3ff9d 100644 --- a/tests/tracing/test_tracing.py +++ b/tests/tracing/test_tracing.py @@ -173,7 +173,7 @@ def test_traces_with_detailed_steps( llm = LLM( model=model, otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, - collect_detailed_traces="all", + collect_detailed_traces=["all"], ) prompts = ["This is a short prompt"] outputs = llm.generate(prompts, sampling_params=sampling_params) diff --git a/tests/utils.py b/tests/utils.py index bf38d7843853d..d21b18470b1bb 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -28,7 +28,7 @@ from tests.models.utils import TextTextLogprobs from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.entrypoints.openai.cli_args import make_arg_parser +from vllm.entrypoints.cli.serve import ServeSubcommand from vllm.model_executor.model_loader import get_model_loader from vllm.platforms import current_platform from vllm.transformers_utils.tokenizer import get_tokenizer @@ -99,7 +99,8 @@ class RemoteOpenAIServer: parser = FlexibleArgumentParser( description="vLLM's remote OpenAI server.") - parser = make_arg_parser(parser) + subparsers = parser.add_subparsers(required=False, dest="subparser") + parser = ServeSubcommand().subparser_init(subparsers) args = parser.parse_args(["--model", model, *vllm_serve_args]) self.host = str(args.host or 'localhost') self.port = int(args.port) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 43a27da2dbe43..d3d62cf09232d 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -45,7 +45,6 @@ def make_request(request_id, multi_modal_placeholders=mm_positions, sampling_params=SamplingParams(max_tokens=17), eos_token_id=100, - arrival_time=0, lora_request=None, cache_salt=cache_salt, ) diff --git a/tests/v1/core/test_prefix_caching.py 
b/tests/v1/core/test_prefix_caching.py index 3da27786b1f2f..ba3c0b3cf3169 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -38,7 +38,6 @@ def make_request(request_id, sampling_params=SamplingParams(max_tokens=17, prompt_logprobs=prompt_logprobs), eos_token_id=100, - arrival_time=0, lora_request=None, cache_salt=cache_salt, ) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index f40d477a00363..f38454b1b2889 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -138,7 +138,6 @@ def create_requests(num_requests: int, multi_modal_placeholders=mm_position, multi_modal_hashes=None, eos_token_id=EOS_TOKEN_ID, - arrival_time=0, ) requests.append(request) return requests @@ -744,7 +743,8 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): assert running_req.num_tokens_with_spec == 2 + len(spec_tokens[i]) # No draft or accepted tokens counted yet - assert engine_core_outputs.scheduler_stats.spec_decoding_stats is None + assert not engine_core_outputs or ( + engine_core_outputs[0].scheduler_stats.spec_decoding_stats is None) # Schedule the speculated tokens for validation output = scheduler.schedule() @@ -772,7 +772,8 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): engine_core_outputs = scheduler.update_from_output(output, model_runner_output) - scheduler_stats = engine_core_outputs.scheduler_stats + scheduler_stats = engine_core_outputs[0].scheduler_stats \ + if engine_core_outputs else None if expected[0] == 0: assert scheduler_stats.spec_decoding_stats is None else: @@ -843,7 +844,7 @@ def _step_until_done( # We should be in the decode phase now. 
assert num_scheduled_tokens == 1 assert len(output.kv_connector_metadata.requests) == 0 - ecos = scheduler.update_from_output(output, model_runner_output) + ecos = scheduler.update_from_output(output, model_runner_output)[0] all_done = True for eco in ecos.outputs: if eco.finish_reason is None: diff --git a/tests/v1/core/test_scheduler_e2e.py b/tests/v1/core/test_scheduler_e2e.py index 0a79424a30b74..511d57d405ba2 100644 --- a/tests/v1/core/test_scheduler_e2e.py +++ b/tests/v1/core/test_scheduler_e2e.py @@ -19,7 +19,8 @@ def model() -> LLM: enable_prefix_caching=True, long_prefill_token_threshold=2, max_num_batched_tokens=6, - max_num_seqs=3) + max_num_seqs=3, + block_size=16) def test_concurrent_partial_prefill(model): @@ -27,3 +28,11 @@ def test_concurrent_partial_prefill(model): assert len(outputs) == 3 for output in outputs: assert len(output.outputs) == 1 + + +def test_prefix_cache_stats_is_recorded(model): + # 17 tokens will make sure first 16 tokens are cached in a block + input_tokens = {"prompt_token_ids": [101] * 17} + _ = model.generate([input_tokens]) + outputs = model.generate([input_tokens]) + assert outputs[0].num_cached_tokens == 16 diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index dcf494825b0d4..e78c7480a837a 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -88,7 +88,7 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch): assert len(engine_core.scheduler.running) == 4 # Loop through until they are all done. 
- while len(engine_core.step().outputs) > 0: + while (outs := engine_core.step()[0].get(0)) and outs.outputs: pass assert len(engine_core.scheduler.waiting) == 0 @@ -163,11 +163,11 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch): req0.request_id = req1.request_id = "test" engine_core.add_request(req0) - while len(engine_core.step().outputs) > 0: + while (outs := engine_core.step()[0].get(0)) and outs.outputs: pass engine_core.add_request(req1) - while len(engine_core.step().outputs) > 0: + while (outs := engine_core.step()[0].get(0)) and outs.outputs: pass assert len(engine_core.scheduler.waiting) == 0 @@ -207,7 +207,7 @@ def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch): assert len(engine_core.scheduler.waiting) == 1 assert len(engine_core.scheduler.running) == 0 # Loop through until they are all done. - while len(engine_core.step().outputs) > 0: + while (outs := engine_core.step()[0].get(0)) and outs.outputs: pass assert len(engine_core.scheduler.waiting) == 0 assert len(engine_core.scheduler.running) == 0 @@ -296,7 +296,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): engine_core.add_request(req1) # Schedule Batch 1: (10, req0) - assert engine_core.step_with_batch_queue() is None + assert engine_core.step_with_batch_queue()[0] is None assert engine_core.batch_queue.qsize() == 1 scheduler_output = engine_core.batch_queue.queue[-1][1] assert scheduler_output.num_scheduled_tokens[0] == 10 @@ -305,7 +305,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): req0.request_id].num_computed_tokens == 10 # Schedule Batch 2: (2, req0), (8, req1) - assert engine_core.step_with_batch_queue() is None + assert engine_core.step_with_batch_queue()[0] is None assert engine_core.batch_queue.qsize() == 2 scheduler_output = engine_core.batch_queue.queue[-1][1] assert scheduler_output.num_scheduled_tokens[0] == 2 @@ -327,7 +327,7 @@ def test_engine_core_concurrent_batches(monkeypatch: 
pytest.MonkeyPatch): assert scheduler_output.num_scheduled_tokens[1] == 4 # Batch queue is full. Finish Batch 2. Get first token of req0. - output = engine_core.step_with_batch_queue() + output = engine_core.step_with_batch_queue()[0].get(0) assert output is not None assert len(output.outputs) == 1 assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13 @@ -339,7 +339,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): assert scheduler_output.num_scheduled_tokens[0] == 1 # Batch queue is full. Finish Batch 3. Get first token of req1. - output = engine_core.step_with_batch_queue() + output = engine_core.step_with_batch_queue()[0].get(0) assert output is not None assert len(output.outputs) == 1 assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13 @@ -358,11 +358,11 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): engine_core.scheduler.requests[1].num_tokens + 1, ] while engine_core.scheduler.get_num_unfinished_requests() == 2: - output = engine_core.step_with_batch_queue() + output = engine_core.step_with_batch_queue()[0] if step % 2 == 0: # Even steps consumes an output. 
assert output is not None - assert len(output.outputs) == 1 + assert len(output[0].outputs) == 1 if req_id in engine_core.scheduler.requests: assert engine_core.scheduler.requests[ req_id].num_tokens == expected_num_tokens[req_id] diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py index cefb89eb652b2..e77916f958233 100644 --- a/tests/v1/engine/test_llm_engine.py +++ b/tests/v1/engine/test_llm_engine.py @@ -6,6 +6,7 @@ from typing import Optional import pytest from vllm import LLM, SamplingParams +from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Metric, Vector MODEL = "facebook/opt-125m" DTYPE = "half" @@ -97,3 +98,67 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None: raise AssertionError( f"{len(completion_counts)} unique completions; expected" f" {n}. Repeats: {repeats}") + + +def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): + max_tokens = 100 + # Use spec decoding to test num_accepted_tokens_per_pos + speculative_config = { + "method": "ngram", + "prompt_lookup_max": 5, + "prompt_lookup_min": 3, + "num_speculative_tokens": 5, + } + monkeypatch.setenv("VLLM_USE_V1", "1") + with vllm_runner( + MODEL, + speculative_config=speculative_config, + disable_log_stats=False, + ) as vllm_model: + model: LLM = vllm_model.model + sampling_params = SamplingParams(temperature=0.0, + max_tokens=max_tokens) + outputs = model.generate(example_prompts, sampling_params) + + n_prompts = len(example_prompts) + assert len(outputs) == n_prompts + + total_tokens = 0 + for out in outputs: + assert len(out.outputs) == 1 + total_tokens += len(out.outputs[0].token_ids) + assert total_tokens == max_tokens * n_prompts + + metrics = model.get_metrics() + + def find_metric(name) -> list[Metric]: + found = [] + for metric in metrics: + if metric.name == name: + found.append(metric) + return found + + num_requests_running = find_metric("vllm:num_requests_running") + assert len(num_requests_running) == 1 + assert 
isinstance(num_requests_running[0], Gauge) + assert num_requests_running[0].value == .0 + + generation_tokens = find_metric("vllm:generation_tokens") + assert len(generation_tokens) == 1 + assert isinstance(generation_tokens[0], Counter) + assert generation_tokens[0].value == total_tokens + + request_generation_tokens = find_metric( + "vllm:request_generation_tokens") + assert len(request_generation_tokens) == 1 + assert isinstance(request_generation_tokens[0], Histogram) + assert "+Inf" in request_generation_tokens[0].buckets + assert request_generation_tokens[0].buckets["+Inf"] == n_prompts + assert request_generation_tokens[0].count == n_prompts + assert request_generation_tokens[0].sum == total_tokens + + num_accepted_tokens_per_pos = find_metric( + "vllm:spec_decode_num_accepted_tokens_per_pos") + assert len(num_accepted_tokens_per_pos) == 1 + assert isinstance(num_accepted_tokens_per_pos[0], Vector) + assert len(num_accepted_tokens_per_pos[0].values) == 5 diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 25bbcd901d6a9..5f1fff200de31 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -4,12 +4,12 @@ from __future__ import annotations import json -import re from enum import Enum from typing import TYPE_CHECKING, Any import jsonschema import pytest +import regex as re from pydantic import BaseModel from tests.reasoning.utils import run_reasoning_extraction diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py index 3ffc54f520b44..333ad23795f34 100644 --- a/tests/v1/entrypoints/openai/test_completion.py +++ b/tests/v1/entrypoints/openai/test_completion.py @@ -1,11 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 -import re from typing import Optional import openai # use the official client for correctness check import pytest import pytest_asyncio 
+import regex as re from openai import BadRequestError from tests.utils import RemoteOpenAIServer diff --git a/tests/v1/entrypoints/openai/test_multi_api_servers.py b/tests/v1/entrypoints/openai/test_multi_api_servers.py new file mode 100644 index 0000000000000..7b4583bc3bf37 --- /dev/null +++ b/tests/v1/entrypoints/openai/test_multi_api_servers.py @@ -0,0 +1,171 @@ +# SPDX-License-Identifier: Apache-2.0 +import asyncio +import os + +import openai # use the official client for correctness check +import pytest +import pytest_asyncio + +from tests.utils import RemoteOpenAIServer + +MODEL_NAME = "ibm-research/PowerMoE-3b" + +DP_SIZE = os.getenv("DP_SIZE", "1") + + +@pytest.fixture(scope="module") +def default_server_args(): + return [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "2048", + "--max-num-seqs", + "128", + "--enforce-eager", + "--api-server-count", + "4", + "--data_parallel_size", + DP_SIZE, + ] + + +@pytest.fixture(scope="module") +def server(default_server_args): + with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_single_completion(client: openai.AsyncOpenAI, + model_name: str) -> None: + + async def make_request(): + completion = await client.completions.create( + model=model_name, + prompt="Hello, my name is", + max_tokens=10, + temperature=1.0) + + assert completion.id is not None + assert completion.choices is not None and len(completion.choices) == 1 + + choice = completion.choices[0] + # The exact number of tokens can vary slightly with temperature=1.0, + # so we check for a reasonable minimum length. 
+ assert len(choice.text) >= 1 + # Finish reason might not always be 'length' if the model finishes early + # or due to other reasons, especially with high temperature. + # So, we'll accept 'length' or 'stop'. + assert choice.finish_reason in ("length", "stop") + + # Token counts can also vary, so we check they are positive. + assert completion.usage.completion_tokens > 0 + assert completion.usage.prompt_tokens > 0 + assert completion.usage.total_tokens > 0 + return completion + + # Test single request + result = await make_request() + assert result is not None + + await asyncio.sleep(0.5) + + # Send two bursts of requests + num_requests = 100 + tasks = [make_request() for _ in range(num_requests)] + results = await asyncio.gather(*tasks) + assert len(results) == num_requests + assert all(completion is not None for completion in results) + + await asyncio.sleep(0.5) + + tasks = [make_request() for _ in range(num_requests)] + results = await asyncio.gather(*tasks) + assert len(results) == num_requests + assert all(completion is not None for completion in results) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_completion_streaming(client: openai.AsyncOpenAI, + model_name: str) -> None: + prompt = "What is an LLM?" 
+ + async def make_streaming_request(): + # Perform a non-streaming request to get the expected full output + single_completion = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + ) + single_output = single_completion.choices[0].text + + # Perform the streaming request + stream = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True) + chunks: list[str] = [] + finish_reason_count = 0 + last_chunk = None + async for chunk in stream: + chunks.append(chunk.choices[0].text) + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + last_chunk = chunk # Keep track of the last chunk + + # finish reason should only return in the last block for OpenAI API + assert finish_reason_count == 1, ( + "Finish reason should appear exactly once.") + assert last_chunk is not None, ( + "Stream should have yielded at least one chunk.") + assert last_chunk.choices[ + 0].finish_reason == "length", "Finish reason should be 'length'." + # Check that the combined text matches the non-streamed version. + assert "".join( + chunks + ) == single_output, "Streamed output should match non-streamed output." + return True # Indicate success for this request + + # Test single request + result = await make_streaming_request() + assert result is not None + + await asyncio.sleep(0.5) + + # Send two bursts of requests + num_requests = 100 + tasks = [make_streaming_request() for _ in range(num_requests)] + results = await asyncio.gather(*tasks) + + assert len( + results + ) == num_requests, f"Expected {num_requests} results, got {len(results)}" + assert all(results), "Not all streaming requests completed successfully." 
+ + await asyncio.sleep(0.5) + + tasks = [make_streaming_request() for _ in range(num_requests)] + results = await asyncio.gather(*tasks) + + assert len( + results + ) == num_requests, f"Expected {num_requests} results, got {len(results)}" + assert all(results), "Not all streaming requests completed successfully." diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index e90b72a7cf249..c17784e0a263e 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -13,6 +13,8 @@ NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-2} # Default to 2 # Find the git repository root directory GIT_ROOT=$(git rev-parse --show-toplevel) +SMI_BIN=$(which nvidia-smi || which rocm-smi) + # Trap the SIGINT signal (triggered by Ctrl+C) trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT @@ -44,6 +46,13 @@ get_model_args() { echo "$extra_args" } +get_num_gpus() { + if [[ "$SMI_BIN" == *"nvidia"* ]]; then + echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)" + else + echo "$($SMI_BIN -l | grep GPU | wc -l)" + fi +} # Function to run tests for a specific model run_tests_for_model() { @@ -64,7 +73,7 @@ run_tests_for_model() { # Start prefill instances for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do # Calculate GPU ID - we'll distribute across available GPUs - GPU_ID=$((i % $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l))) + GPU_ID=$((i % $(get_num_gpus))) # Calculate port number (base port + instance number) PORT=$((8100 + i)) # Calculate side channel port @@ -96,7 +105,7 @@ run_tests_for_model() { # Start decode instances for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do # Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs - GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l))) + GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % 
$(get_num_gpus))) # Calculate port number (base port + instance number) PORT=$((8200 + i)) # Calculate side channel port diff --git a/tests/v1/kv_connector/unit/test_multi_connector.py b/tests/v1/kv_connector/unit/test_multi_connector.py index 64da0d79bf334..a21d92c52244d 100644 --- a/tests/v1/kv_connector/unit/test_multi_connector.py +++ b/tests/v1/kv_connector/unit/test_multi_connector.py @@ -239,3 +239,11 @@ def get_connector_events() -> dict[str, list[str]]: print(f"[ERROR] Could not read connector events for {name}: {e}") return connector_events + + +def test_engine_id_conflict(): + configs = [KVTransferConfig() for _ in range(2)] + ids = [config.engine_id for config in configs] + assert ids[0] != ids[1], ( + "Engine IDs should be different for different configs. " + f"Got {ids}") diff --git a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py index 77098140343a0..dc963251c962b 100644 --- a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py +++ b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py @@ -43,7 +43,7 @@ def test_basic_lifecycle(): # Ensure the request is finished after 1 tokens. assert request.is_finished() assert request.status == RequestStatus.FINISHED_LENGTH_CAPPED - output = engine_core_outputs.outputs[0] + output = engine_core_outputs[0].outputs[0] assert output.finish_reason == FinishReason.LENGTH assert output.kv_transfer_params is not None @@ -165,7 +165,7 @@ def test_prefix_cache_lifecycle(): scheduler_output = scheduler.schedule() model_runner_output = create_model_runner_output(reqs=[request_remote]) eco = scheduler.update_from_output(scheduler_output, model_runner_output) - kv_transfer_params = eco.outputs[0].kv_transfer_params + kv_transfer_params = eco[0].outputs[0].kv_transfer_params # Ensure we send all block ids, even if there is a cache hit. 
assert (len( diff --git a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py index fc4928f9ebd19..86eacb693869d 100644 --- a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py +++ b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py @@ -61,7 +61,7 @@ def test_basic_lifecycle(): # (1c): update_from_output() engine_core_outputs = scheduler.update_from_output(scheduler_output, model_runner_output) - assert len(engine_core_outputs.outputs) == 0 + assert not engine_core_outputs or not engine_core_outputs[0].outputs # STEP (2): # (2a): schedule(): nothing happens! @@ -112,7 +112,7 @@ def test_basic_lifecycle(): model_runner_output) scheduler.schedule() - outputs = engine_core_outputs.outputs + outputs = engine_core_outputs[0].outputs assert len(outputs) == 1 output = outputs[0] assert output.finish_reason == FinishReason.STOP @@ -335,8 +335,89 @@ def test_full_block_prompt(): model_runner_output) scheduler.schedule() - outputs = engine_core_outputs.outputs + outputs = engine_core_outputs[0].outputs assert len(outputs) == 1 output = outputs[0] assert output.finish_reason == FinishReason.STOP assert_scheduler_empty(scheduler) + + +def test_cannot_schedule_after_recv(): + """ + Test that we can handle no schedule after recv due to not + enough remaining KV blocks. + """ + + # NOTE: the KVCacheManager will use 1 null block. + # So there are 5 total working blocks. + TOTAL_NUM_BLOCKS = 6 + vllm_config = create_vllm_config() + scheduler = create_scheduler(vllm_config, num_blocks=TOTAL_NUM_BLOCKS) + + # Prime the KVCache. + NUM_PROMPT_BLOCKS = 2 + BLOCK_SIZE = vllm_config.cache_config.block_size + # Prompt will use 2 blocks + 1 block after we schedule. 
+ NUM_TOKENS_LOCAL = int(BLOCK_SIZE * NUM_PROMPT_BLOCKS) + NUM_TOKENS_REMOTE = int(BLOCK_SIZE * (NUM_PROMPT_BLOCKS + 0.5)) + + request_normal = create_request(request_id=1, num_tokens=NUM_TOKENS_LOCAL) + request_remote = create_request(request_id=2, + num_tokens=NUM_TOKENS_REMOTE, + do_remote_prefill=True) + + # STEP 1: 3 blocks are in use (2 for prompt, 1 for decode). + scheduler.add_request(request_normal) + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_normal]) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 0 + + # Step 2: 5 blocks are in use (2 new for remote blocks). + scheduler.add_request(request_remote) + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_normal]) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 1 + + # Step 3: finish recving (5 blocks in use) + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output( + reqs=[request_normal], finished_recving=[request_remote.request_id]) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 1 + + # Step 4: try to schedule, not enough blocks. + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_normal]) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 1 + + # Step 5: finish the request, free it. 
+ scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_normal], + use_eos=True) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 0 + assert len(scheduler.waiting) == 1 + + # Step 6: now we can schedule (with 2 blocks computed). + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_remote]) + assert (scheduler_output.scheduled_new_reqs[0].num_computed_tokens == + NUM_PROMPT_BLOCKS * BLOCK_SIZE) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 0 + + # Step 7: free everything. + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_remote], + use_eos=True) + scheduler.update_from_output(scheduler_output, model_runner_output) + _ = scheduler.schedule() + assert_scheduler_empty(scheduler) diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 53e2d6fda1aea..3c3190b325636 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -153,7 +153,6 @@ def create_request( multi_modal_placeholders=None, multi_modal_hashes=None, eos_token_id=EOS_TOKEN_ID, - arrival_time=0, ) req.kv_transfer_params = kv_transfer_params return req diff --git a/tests/v1/sample/test_topk_topp_sampler.py b/tests/v1/sample/test_topk_topp_sampler.py index a8a713d446b79..220f05c7ff1c3 100644 --- a/tests/v1/sample/test_topk_topp_sampler.py +++ b/tests/v1/sample/test_topk_topp_sampler.py @@ -16,31 +16,40 @@ VOCAB_SIZE = 128 * 1024 FLASHINFER_ENABLED = current_platform.is_cuda() and is_flashinfer_available +@pytest.fixture(autouse=True) +def reset_default_device(): + """ + Explicitly set the default device, which can affect subsequent tests. + Adding this fixture helps avoid this problem. 
+ """ + original_device = torch.get_default_device() + yield + torch.set_default_device(original_device) + + def test_topk_impl_equivalance(): - with torch.device(DEVICE): - generator = Generator(device=DEVICE).manual_seed(33) + torch.set_default_device(DEVICE) + generator = Generator(device=DEVICE).manual_seed(33) - logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator) + logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator) - # Random top-k values between 1 and 9. - k = torch.randint(1, 10, (BATCH_SIZE, ), generator=generator) + # Random top-k values between 1 and 9. + k = torch.randint(1, 10, (BATCH_SIZE, ), generator=generator) - # Set k=vocab_size for ~50% of requests in the batch (top-k disabled). - k.masked_fill_( - torch.randint(0, - 2, (BATCH_SIZE, ), - generator=generator, - dtype=bool), VOCAB_SIZE) + # Set k=vocab_size for ~50% of requests in the batch (top-k disabled). + k.masked_fill_( + torch.randint(0, 2, (BATCH_SIZE, ), generator=generator, dtype=bool), + VOCAB_SIZE) - # Top-k only implementation - result1 = apply_top_k_top_p(logits=logits.clone(), k=k, p=None) + # Top-k only implementation + result1 = apply_top_k_top_p(logits=logits.clone(), k=k, p=None) - # Top-p + top-k - no_op_top_p = torch.tensor([1.0]) - result2 = apply_top_k_top_p(logits=logits.clone(), k=k, p=no_op_top_p) + # Top-p + top-k + no_op_top_p = torch.tensor([1.0]) + result2 = apply_top_k_top_p(logits=logits.clone(), k=k, p=no_op_top_p) - assert torch.allclose(result1, result2) + assert torch.allclose(result1, result2) def test_flashinfer_sampler(): @@ -58,50 +67,49 @@ def test_flashinfer_sampler(): pytest.skip( "FlashInfer not installed or not available on this platform.") - with torch.device(DEVICE): - generator = Generator(device=DEVICE).manual_seed(42) + torch.set_default_device(DEVICE) + generator = Generator(device=DEVICE).manual_seed(42) - # Generate random logits - logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator) + # Generate 
random logits + logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator) - # Generate various top-k and top-p values - k_values = torch.randint(1, 1000, (BATCH_SIZE, ), generator=generator) - p_values = torch.rand( - (BATCH_SIZE, ), - generator=generator) * 0.5 + 0.5 # range in [0.5, 1.0] + # Generate various top-k and top-p values + k_values = torch.randint(1, 1000, (BATCH_SIZE, ), generator=generator) + p_values = torch.rand( + (BATCH_SIZE, ), generator=generator) * 0.5 + 0.5 # range in [0.5, 1.0] - # Sometimes disable top-k (k=vocab_size) - k_values.masked_fill_( - torch.randint(0, - 2, (BATCH_SIZE, ), - generator=generator, - dtype=torch.bool), VOCAB_SIZE) + # Sometimes disable top-k (k=vocab_size) + k_values.masked_fill_( + torch.randint(0, + 2, (BATCH_SIZE, ), + generator=generator, + dtype=torch.bool), VOCAB_SIZE) - # Sometimes disable top-p (p=1.0) - p_values.masked_fill_( - torch.randint(0, - 2, (BATCH_SIZE, ), - generator=generator, - dtype=torch.bool), 1.0) + # Sometimes disable top-p (p=1.0) + p_values.masked_fill_( + torch.randint(0, + 2, (BATCH_SIZE, ), + generator=generator, + dtype=torch.bool), 1.0) - python_logits = apply_top_k_top_p( - logits=logits.clone(), - k=k_values, - p=p_values, - ) - python_probs = torch.softmax(python_logits, dim=-1) + python_logits = apply_top_k_top_p( + logits=logits.clone(), + k=k_values, + p=p_values, + ) + python_probs = torch.softmax(python_logits, dim=-1) - # FlashInfer only exposed renorm interfaces for probs so convert first - flashinfer_probs = torch.softmax(logits.clone(), dim=-1) - flashinfer_probs = top_k_renorm_probs( - probs=flashinfer_probs, - top_k=k_values, - ) - flashinfer_probs = top_p_renorm_probs( - probs=flashinfer_probs, - top_p=p_values, - ) + # FlashInfer only exposed renorm interfaces for probs so convert first + flashinfer_probs = torch.softmax(logits.clone(), dim=-1) + flashinfer_probs = top_k_renorm_probs( + probs=flashinfer_probs, + top_k=k_values, + ) + flashinfer_probs = 
top_p_renorm_probs( + probs=flashinfer_probs, + top_p=p_values, + ) - # Compare the results - assert torch.allclose(python_probs, flashinfer_probs, atol=2e-2), \ - "FlashInfer and Python sampling implementations do not match!" + # Compare the results + assert torch.allclose(python_probs, flashinfer_probs, atol=2e-2), \ + "FlashInfer and Python sampling implementations do not match!" diff --git a/tests/v1/sample/utils.py b/tests/v1/sample/utils.py index f540895bbf147..932b652aea32b 100644 --- a/tests/v1/sample/utils.py +++ b/tests/v1/sample/utils.py @@ -1,9 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 -import re from enum import Enum from typing import Optional +import regex as re + from vllm import CompletionOutput diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 7d93a44c50595..b49ac45f3129b 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -100,8 +100,12 @@ def test_prepare_inputs(): dtype=torch.int32, device=device) + # n1 + n2 + n3 - a - b -c + num_tokens = cu_target_query_lens[-1].item() - num_rejected_tokens.sum( + ).item() + cu_num_tokens, token_indices = EagleProposer.prepare_inputs( - cu_target_query_lens, num_rejected_tokens) + cu_target_query_lens, num_rejected_tokens, num_tokens) assert torch.equal(cu_num_tokens, expected_cu_num_tokens) assert token_indices.shape[0] == expected_cu_num_tokens[-1].item() @@ -117,34 +121,13 @@ def test_prepare_inputs(): ]) @mock.patch('vllm.v1.spec_decode.eagle.get_pp_group') @mock.patch('vllm.v1.spec_decode.eagle.get_layers_from_vllm_config') -@mock.patch('vllm.v1.spec_decode.eagle.ModelRegistry') -@mock.patch('vllm.v1.spec_decode.eagle.get_model_loader') -@mock.patch('vllm.v1.spec_decode.eagle.set_default_torch_dtype') -@mock.patch('vllm.v1.spec_decode.eagle.set_current_vllm_config') -def test_load_model(mock_set_config, mock_set_dtype, mock_get_loader, - mock_registry, mock_get_layers, mock_get_pp_group, method, 
+@mock.patch('vllm.v1.spec_decode.eagle.get_model') +def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method, proposer_helper, draft_model_dir, target_attribute_path): - # Setup mock for model class - mock_model_cls = mock.MagicMock() - mock_registry.resolve_model_cls.return_value = (mock_model_cls, - "test_arch") - - # Create a real context manager for mocks - class MockContextManager: - - def __init__(self): - pass - - def __enter__(self): - return None - - def __exit__(self, exc_type, exc_val, exc_tb): - return False - - # Make the mocks return actual context manager objects - mock_set_dtype.return_value = MockContextManager() - mock_set_config.return_value = MockContextManager() + # Setup model mock + mock_model = mock.MagicMock() + mock_get_model.return_value = mock_model # Setup mocks for attention layers target_attn_layers = { @@ -164,25 +147,6 @@ def test_load_model(mock_set_config, mock_set_dtype, mock_get_loader, mock_pp_group.world_size = 2 if method == "eagle" else 1 mock_get_pp_group.return_value = mock_pp_group - # Setup model loader mock - mock_loader = mock.MagicMock() - mock_get_loader.return_value = mock_loader - - # Setup model mock - mock_model = mock.MagicMock() - mock_model_cls.return_value = mock_model - mock_model.to.return_value = mock_model - - # Configure mock to test the attribute sharing path - if method == "eagle": - # For eagle, test the lm_head path - mock_model.load_weights.return_value = { - "model.embed_tokens.weight": torch.zeros(1) - } - else: - # For eagle3, test the embed_tokens path - mock_model.load_weights.return_value = {} - # Setup target model with the appropriate attributes target_model = mock.MagicMock() @@ -204,13 +168,7 @@ def test_load_model(mock_set_config, mock_set_dtype, mock_get_loader, proposer.load_model(target_model) # Verify common interactions - mock_get_loader.assert_called_once() - mock_model_cls.assert_called_once() - mock_model.to.assert_called_once() - 
mock_model.load_weights.assert_called_once() - - # Verify the loader was called with the right config - mock_get_loader.assert_called_once_with(proposer.vllm_config.load_config) + mock_get_model.assert_called_once() # Verify the specific attribute sharing based on the method if method == "eagle": @@ -288,6 +246,9 @@ def test_propose(num_speculative_tokens): # Assign the mock to the proposer proposer.model = model_mock + # Assign draft attn_layer_names since load_model is not invoked + proposer.attn_layer_names = ["layer.0"] + # Create input tensors cu_num_tokens = torch.tensor([0, seq_len_1, total_tokens], dtype=torch.int32, diff --git a/tests/v1/test_metrics_reader.py b/tests/v1/test_metrics_reader.py new file mode 100644 index 0000000000000..68539c80b59cc --- /dev/null +++ b/tests/v1/test_metrics_reader.py @@ -0,0 +1,112 @@ +# SPDX-License-Identifier: Apache-2.0 + +import prometheus_client +import pytest + +from vllm.v1.metrics.reader import (Counter, Gauge, Histogram, Vector, + get_metrics_snapshot) + + +@pytest.fixture(autouse=True) +def test_registry(monkeypatch): + # Use a custom registry for tests + test_registry = prometheus_client.CollectorRegistry(auto_describe=True) + monkeypatch.setattr("vllm.v1.metrics.reader.REGISTRY", test_registry) + return test_registry + + +@pytest.mark.parametrize("num_engines", [1, 4]) +def test_gauge_metric(test_registry, num_engines): + g = prometheus_client.Gauge("vllm:test_gauge", + "Test gauge metric", + labelnames=["model", "engine_index"], + registry=test_registry) + for i in range(num_engines): + g.labels(model="foo", engine_index=str(i)).set(98.5) + + metrics = get_metrics_snapshot() + assert len(metrics) == num_engines + engine_labels = [str(i) for i in range(num_engines)] + for m in metrics: + assert isinstance(m, Gauge) + assert m.name == "vllm:test_gauge" + assert m.value == 98.5 + assert m.labels["model"] == "foo" + assert m.labels["engine_index"] in engine_labels + engine_labels.remove(m.labels["engine_index"]) + 
+ +@pytest.mark.parametrize("num_engines", [1, 4]) +def test_counter_metric(test_registry, num_engines): + c = prometheus_client.Counter("vllm:test_counter", + "Test counter metric", + labelnames=["model", "engine_index"], + registry=test_registry) + for i in range(num_engines): + c.labels(model="bar", engine_index=str(i)).inc(19) + + metrics = get_metrics_snapshot() + assert len(metrics) == num_engines + engine_labels = [str(i) for i in range(num_engines)] + for m in metrics: + assert isinstance(m, Counter) + assert m.name == "vllm:test_counter" + assert m.value == 19 + assert m.labels["model"] == "bar" + assert m.labels["engine_index"] in engine_labels + engine_labels.remove(m.labels["engine_index"]) + + +@pytest.mark.parametrize("num_engines", [1, 4]) +def test_histogram_metric(test_registry, num_engines): + h = prometheus_client.Histogram("vllm:test_histogram", + "Test histogram metric", + labelnames=["model", "engine_index"], + buckets=[10, 20, 30, 40, 50], + registry=test_registry) + for i in range(num_engines): + hist = h.labels(model="blaa", engine_index=str(i)) + hist.observe(42) + hist.observe(21) + hist.observe(7) + + metrics = get_metrics_snapshot() + assert len(metrics) == num_engines + engine_labels = [str(i) for i in range(num_engines)] + for m in metrics: + assert isinstance(m, Histogram) + assert m.name == "vllm:test_histogram" + assert m.count == 3 + assert m.sum == 70 + assert m.buckets["10.0"] == 1 + assert m.buckets["20.0"] == 1 + assert m.buckets["30.0"] == 2 + assert m.buckets["40.0"] == 2 + assert m.buckets["50.0"] == 3 + assert m.labels["model"] == "blaa" + assert m.labels["engine_index"] in engine_labels + engine_labels.remove(m.labels["engine_index"]) + + +@pytest.mark.parametrize("num_engines", [1, 4]) +def test_vector_metric(test_registry, num_engines): + c = prometheus_client.Counter( + "vllm:spec_decode_num_accepted_tokens_per_pos", + "Vector-like counter metric", + labelnames=["position", "model", "engine_index"], + 
registry=test_registry) + for i in range(num_engines): + c.labels(position="0", model="llama", engine_index=str(i)).inc(10) + c.labels(position="1", model="llama", engine_index=str(i)).inc(5) + c.labels(position="2", model="llama", engine_index=str(i)).inc(1) + + metrics = get_metrics_snapshot() + assert len(metrics) == num_engines + engine_labels = [str(i) for i in range(num_engines)] + for m in metrics: + assert isinstance(m, Vector) + assert m.name == "vllm:spec_decode_num_accepted_tokens_per_pos" + assert m.values == [10, 5, 1] + assert m.labels["model"] == "llama" + assert m.labels["engine_index"] in engine_labels + engine_labels.remove(m.labels["engine_index"]) diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index c34c673e985e3..1b77417a1bd35 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -12,7 +12,7 @@ UNSUPPORTED_MODELS_V1 = [ "openai/whisper-large-v3", # transcription "facebook/bart-large-cnn", # encoder decoder "mistralai/Mamba-Codestral-7B-v0.1", # mamba - "hmellor/bamba-tiny-random", # hybrid + "hmellor/tiny-random-BambaForCausalLM", # hybrid "BAAI/bge-m3", # embedding ] diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 319b38b4ca09d..348f12887a446 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -81,7 +81,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput: mm_hashes=[], mm_positions=[], sampling_params=SamplingParams(), - block_ids=[0], + block_ids=[[0]], # block_ids should be list[list[int]] num_computed_tokens=0, lora_request=None, )) @@ -112,14 +112,35 @@ def _is_req_added(model_runner, req_id: str) -> bool: def _is_req_state_block_table_match(model_runner, req_id: str) -> bool: + """Check if the request state block IDs match the block table. + + This function handles both legacy BlockTable and new MultiGroupBlockTable + structures for backward compatibility. 
+ """ + req_index = model_runner.input_batch.req_id_to_index[req_id] - block_table = model_runner.input_batch.block_table + multi_group_block_table = model_runner.input_batch.block_table req_state = model_runner.requests[req_id] - if block_table.num_blocks_per_row[req_index] != len(req_state.block_ids): + + # Access the first block table from MultiGroupBlockTable + # This is safe since we currently only use single KV cache groups + block_table = multi_group_block_table[0] + + # req_state.block_ids is now list[list[int]] for MultiGroupBlockTable + # Extract the first group's block IDs + if isinstance(req_state.block_ids[0], list): + # New format: list[list[int]] - extract first group + req_block_ids = req_state.block_ids[0] + else: + # Legacy format: list[int] - use directly + req_block_ids = req_state.block_ids + + if block_table.num_blocks_per_row[req_index] != len(req_block_ids): return False + num_blocks = block_table.num_blocks_per_row[req_index] - return (block_table.block_table_np[req_index, :num_blocks] == - req_state.block_ids).all() + block_table_values = block_table.block_table_np[req_index, :num_blocks] + return (block_table_values == req_block_ids).all() def test_update_states_new_request(model_runner): diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 638f5bedcfcac..27741bd156be1 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -251,7 +251,7 @@ def test_sampling_metadata_in_input_batch(device: str, batch_size: int): device=torch.device(device), pin_memory=is_pin_memory_available(), vocab_size=1024, - kv_cache_config=get_kv_cache_config(), + block_size=1, ) reqs: list[CachedRequestState] = [] req_id_reqs = {} @@ -341,7 +341,7 @@ def test_swap_states_in_input_batch(device: str, batch_size: int, device=torch.device(device), pin_memory=is_pin_memory_available(), vocab_size=1024, - kv_cache_config=get_kv_cache_config(), + block_size=1, ) ref_input_batch: 
InputBatch = InputBatch( max_num_reqs=batch_size, @@ -350,7 +350,7 @@ def test_swap_states_in_input_batch(device: str, batch_size: int, device=torch.device(device), pin_memory=is_pin_memory_available(), vocab_size=1024, - kv_cache_config=get_kv_cache_config(), + block_size=1, ) reqs: list[CachedRequestState] = [] diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index e44660525763c..6ba6d1f6f131d 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -1,7 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 +import random + import pytest +from vllm.attention import Attention from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, SchedulerConfig, VllmConfig) from vllm.sampling_params import SamplingParams @@ -13,27 +16,30 @@ from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.worker.gpu_input_batch import InputBatch from vllm.v1.worker.gpu_model_runner import GPUModelRunner +BLOCK_SIZE = 16 +NUM_BLOCKS = 10 + def initialize_kv_cache(runner: GPUModelRunner): """ Only perform necessary steps in GPUModelRunner.initialize_kv_cache() """ + attn_spec = FullAttentionSpec( + block_size=BLOCK_SIZE, + num_kv_heads=runner.model_config.get_num_kv_heads( + runner.parallel_config), + head_size=runner.model_config.get_head_size(), + dtype=runner.kv_cache_dtype, + use_mla=False, + ) + tensor_size = attn_spec.page_size_bytes * NUM_BLOCKS kv_cache_config = KVCacheConfig( - num_blocks=10, + num_blocks=NUM_BLOCKS, tensors={ - "layer.0": KVCacheTensor(size=1024), + "layer.0": KVCacheTensor(size=tensor_size), }, kv_cache_groups=[ - KVCacheGroupSpec( - layer_names=["layer.0"], - kv_cache_spec=FullAttentionSpec( - block_size=16, - num_kv_heads=runner.model_config.get_num_kv_heads( - runner.parallel_config), - head_size=runner.model_config.get_head_size(), - dtype=runner.kv_cache_dtype, - use_mla=False, - )) + KVCacheGroupSpec(layer_names=["layer.0"], kv_cache_spec=attn_spec) 
]) runner.kv_cache_config = kv_cache_config runner.input_batch = InputBatch( @@ -43,7 +49,7 @@ def initialize_kv_cache(runner: GPUModelRunner): device=runner.device, pin_memory=runner.pin_memory, vocab_size=runner.model_config.get_vocab_size(), - kv_cache_config=kv_cache_config, + block_size=kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size, ) runner.initialize_attn_backend(kv_cache_config) @@ -65,7 +71,7 @@ def model_runner(): seed=42, ) cache_config = CacheConfig( - block_size=16, + block_size=BLOCK_SIZE, gpu_memory_utilization=0.9, swap_space=0, cache_dtype="auto", @@ -77,6 +83,10 @@ def model_runner(): scheduler_config=scheduler_config, parallel_config=parallel_config, ) + num_heads = model_config.get_num_kv_heads(parallel_config) + head_size = model_config.get_head_size() + vllm_config.compilation_config.static_forward_context[ + "layer.0"] = Attention(num_heads, head_size, 0.1) device = "cuda" runner = GPUModelRunner(vllm_config, device) @@ -84,6 +94,9 @@ def model_runner(): return runner +model_runner_2 = model_runner + + def _schedule_new_request(*req_ids: str) -> SchedulerOutput: new_reqs = [] num_scheduled_tokens = {} @@ -321,3 +334,53 @@ def test_update_states_request_unscheduled(model_runner): assert _is_req_added(model_runner, req_ids[1]) assert not _is_req_scheduled(model_runner, req_ids[1]) + + +def test_kv_cache_stride_order(monkeypatch, model_runner): + # This test checks if GPUModelRunner initializes correctly when an attention + # backend enforces a non-default KV cache stride order. 
+ n_heads = model_runner.model_config.get_num_kv_heads( + model_runner.parallel_config) + expected_kv_cache_shape = [ + 2, NUM_BLOCKS, BLOCK_SIZE, n_heads, + model_runner.model_config.get_head_size() + ] + # TODO mla test + default_stride = list(range(5)) + # Permutation that gets you back to expected kv shape + rnd_stride = tuple(random.sample(default_stride, len(default_stride))) + + def rnd_stride_order(): + return rnd_stride + + # Patch the attention backend class and re-trigger the KV cache creation. + for attn_backend in model_runner.attn_backends: + monkeypatch.setattr(attn_backend, "get_kv_cache_stride_order", + rnd_stride_order) + + model_runner.attn_backends = [] + model_runner.attn_metadata_builders = [] + model_runner.initialize_kv_cache(model_runner.kv_cache_config) + + # Shape is unchanged, but layout may differ + kv_cache_shape = model_runner.kv_caches[0].shape + assert list(kv_cache_shape) == expected_kv_cache_shape + if default_stride == rnd_stride: + assert all(kv.is_contiguous() for kv in model_runner.kv_caches) + else: + assert all(not kv.is_contiguous() for kv in model_runner.kv_caches) + + +def test_load_model_weights_inplace(dist_init, model_runner, model_runner_2): + # In this test, model_runner loads model + weights in one go, while + # model_runner_2 loads dummy weights first then load real weights inplace + model_runner.load_model() + original_load_format = model_runner_2.load_config.load_format + model_runner_2.load_config.load_format = "dummy" + model_runner_2.load_model() # Initial model loading with dummy weights + assert str(model_runner.get_model().state_dict()) != str( + model_runner_2.get_model().state_dict()) + model_runner_2.load_config.load_format = original_load_format + model_runner_2.load_model() # Load real weights inplace + assert str(model_runner.get_model().state_dict()) == str( + model_runner_2.get_model().state_dict()) diff --git a/tools/check_triton_import.py b/tools/check_triton_import.py new file mode 100644 index 
0000000000000..18c9726a11ac0 --- /dev/null +++ b/tools/check_triton_import.py @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: Apache-2.0 +import subprocess +import sys + +import regex as re + +FORBIDDEN_IMPORT_RE = re.compile(r"^(from|import)\s+triton(\s|\.|$)") + +# the way allowed to import triton +ALLOWED_LINES = { + "from vllm.triton_utils import triton", + "from vllm.triton_utils import tl", + "from vllm.triton_utils import tl, triton", +} + + +def is_forbidden_import(line: str) -> bool: + stripped = line.strip() + return bool( + FORBIDDEN_IMPORT_RE.match(stripped)) and stripped not in ALLOWED_LINES + + +def parse_diff(diff: str) -> list[str]: + violations = [] + current_file = None + current_lineno = None + + for line in diff.splitlines(): + if line.startswith("+++ b/"): + current_file = line[6:] + elif line.startswith("@@"): + match = re.search(r"\+(\d+)", line) + if match: + current_lineno = int( + match.group(1)) - 1 # next "+ line" is here + elif line.startswith("+") and not line.startswith("++"): + current_lineno += 1 + code_line = line[1:] + if is_forbidden_import(code_line): + violations.append( + f"{current_file}:{current_lineno}: {code_line.strip()}") + return violations + + +def get_diff(diff_type: str) -> str: + if diff_type == "staged": + return subprocess.check_output( + ["git", "diff", "--cached", "--unified=0"], text=True) + elif diff_type == "unstaged": + return subprocess.check_output(["git", "diff", "--unified=0"], + text=True) + else: + raise ValueError(f"Unknown diff_type: {diff_type}") + + +def main(): + all_violations = [] + for diff_type in ["staged", "unstaged"]: + try: + diff_output = get_diff(diff_type) + violations = parse_diff(diff_output) + all_violations.extend(violations) + except subprocess.CalledProcessError as e: + print(f"[{diff_type}] Git diff failed: {e}", file=sys.stderr) + + if all_violations: + print("❌ Forbidden direct `import triton` detected."
+ " ➤ Use `from vllm.triton_utils import triton` instead.\n") + for v in all_violations: + print(f"❌ {v}") + return 1 + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tools/enforce_regex_import.py b/tools/enforce_regex_import.py new file mode 100644 index 0000000000000..6c201dd2543e9 --- /dev/null +++ b/tools/enforce_regex_import.py @@ -0,0 +1,86 @@ +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import subprocess +from pathlib import Path + +import regex as re + +FORBIDDEN_PATTERNS = re.compile( + r'^\s*(?:import\s+re(?:$|\s|,)|from\s+re\s+import)') +ALLOWED_PATTERNS = [ + re.compile(r'^\s*import\s+regex\s+as\s+re\s*$'), + re.compile(r'^\s*import\s+regex\s*$'), +] + + +def get_staged_python_files() -> list[str]: + try: + result = subprocess.run( + ['git', 'diff', '--cached', '--name-only', '--diff-filter=AM'], + capture_output=True, + text=True, + check=True) + files = result.stdout.strip().split( + '\n') if result.stdout.strip() else [] + return [f for f in files if f.endswith('.py')] + except subprocess.CalledProcessError: + return [] + + +def is_forbidden_import(line: str) -> bool: + line = line.strip() + return bool( + FORBIDDEN_PATTERNS.match(line) + and not any(pattern.match(line) for pattern in ALLOWED_PATTERNS)) + + +def check_file(filepath: str) -> list[tuple[int, str]]: + violations = [] + try: + with open(filepath, encoding='utf-8') as f: + for line_num, line in enumerate(f, 1): + if is_forbidden_import(line): + violations.append((line_num, line.strip())) + except (OSError, UnicodeDecodeError): + pass + return violations + + +def main() -> int: + files = get_staged_python_files() + if not files: + return 0 + + total_violations = 0 + + for filepath in files: + if not Path(filepath).exists(): + continue + + if filepath == "setup.py": + continue + + violations = check_file(filepath) + if violations: + print(f"\n❌ {filepath}:") + for line_num, line in violations: + print(f" Line {line_num}:
{line}") + total_violations += 1 + + if total_violations > 0: + print(f"\n💡 Found {total_violations} violation(s).") + print("❌ Please replace 'import re' with 'import regex as re'") + print( + " Also replace 'from re import ...' with 'from regex import ...'" + ) # noqa: E501 + print("✅ Allowed imports:") + print(" - import regex as re") + print(" - import regex") # noqa: E501 + return 1 + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/install_nixl.sh b/tools/install_nixl.sh new file mode 100644 index 0000000000000..56717cfb77f7b --- /dev/null +++ b/tools/install_nixl.sh @@ -0,0 +1,109 @@ +#!/bin/bash +# Usage: ./install_nixl.sh [--force] + +FORCE=false +if [ "$1" == "--force" ]; then + FORCE=true +fi + +SUDO=false +if command -v sudo >/dev/null 2>&1 && sudo -n true 2>/dev/null; then + SUDO=true +fi + +ARCH=$(uname -m) + +ROOT_DIR="/usr/local" +mkdir -p "$ROOT_DIR" +GDR_HOME="$ROOT_DIR/gdrcopy" +UCX_HOME="$ROOT_DIR/ucx" +NIXL_HOME="$ROOT_DIR/nixl" +CUDA_HOME=/usr/local/cuda + +export PATH="$GDR_HOME/bin:$UCX_HOME/bin:$NIXL_HOME/bin:$PATH" +export LD_LIBRARY_PATH="$GDR_HOME/lib:$UCX_HOME/lib:$NIXL_HOME/lib/$ARCH-linux-gnu:$LD_LIBRARY_PATH" + +TEMP_DIR="nixl_installer" +mkdir -p "$TEMP_DIR" +cd "$TEMP_DIR" + +pip install meson ninja pybind11 + +if [ ! -e "/dev/gdrdrv" ] || [ "$FORCE" = true ]; then + echo "Installing gdrcopy\n" + wget https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v2.5.tar.gz + tar xzf v2.5.tar.gz; rm v2.5.tar.gz + cd gdrcopy-2.5 + make prefix=$GDR_HOME CUDA=$CUDA_HOME all install + + if $SUDO; then + echo "Running insmod.sh with sudo" + sudo ./insmod.sh + else + echo "Skipping insmod.sh - sudo not available" + echo "Please run 'sudo ./gdrcopy-2.5/insmod.sh' manually if needed" + fi + + cd .. +else + echo "Found /dev/gdrdrv. Skipping gdrcopy installation" +fi + +if !
command -v ucx_info &> /dev/null || [ "$FORCE" = true ]; then + echo "Installing UCX" + wget https://github.com/openucx/ucx/releases/download/v1.18.0/ucx-1.18.0.tar.gz + tar xzf ucx-1.18.0.tar.gz; rm ucx-1.18.0.tar.gz + cd ucx-1.18.0 + + # Checking Mellanox NICs + MLX_OPTS="" + if lspci | grep -i mellanox > /dev/null || command -v ibstat > /dev/null; then + echo "Mellanox NIC detected, adding Mellanox-specific options" + MLX_OPTS="--with-rdmacm \ + --with-mlx5-dv \ + --with-ib-hw-tm" + fi + + ./configure --prefix=$UCX_HOME \ + --enable-shared \ + --disable-static \ + --disable-doxygen-doc \ + --enable-optimizations \ + --enable-cma \ + --enable-devel-headers \ + --with-cuda=$CUDA_HOME \ + --with-dm \ + --with-gdrcopy=$GDR_HOME \ + --with-verbs \ + --enable-mt \ + $MLX_OPTS + make -j + make -j install-strip + + if $SUDO; then + echo "Running ldconfig with sudo" + sudo ldconfig + else + echo "Skipping ldconfig - sudo not available" + echo "Please run 'sudo ldconfig' manually if needed" + fi + + cd .. +else + echo "Found existing UCX. Skipping UCX installation" +fi + +if ! command -v nixl_test &> /dev/null || [ "$FORCE" = true ]; then + echo "Installing NIXL" + wget https://github.com/ai-dynamo/nixl/archive/refs/tags/0.2.0.tar.gz + tar xzf 0.2.0.tar.gz; rm 0.2.0.tar.gz + cd nixl-0.2.0 + meson setup build --prefix=$NIXL_HOME -Ducx_path=$UCX_HOME + cd build + ninja + ninja install + + cd ../.. +else + echo "Found existing NIXL. 
Skipping NIXL installation" +fi diff --git a/tools/update-dockerfile-graph.sh b/tools/update-dockerfile-graph.sh index a1e22a69cdc7b..88189e8ab2087 100755 --- a/tools/update-dockerfile-graph.sh +++ b/tools/update-dockerfile-graph.sh @@ -24,7 +24,7 @@ if printf '%s\n' "${FILES[@]}" | grep -q "^docker/Dockerfile$"; then fi # Define the target file path - TARGET_GRAPH_FILE="docs/source/assets/contributing/dockerfile-stages-dependency.png" + TARGET_GRAPH_FILE="docs/assets/contributing/dockerfile-stages-dependency.png" # Ensure target directory exists mkdir -p "$(dirname "$TARGET_GRAPH_FILE")" diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index e74d139ab980f..3c8e6b95ce763 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1085,7 +1085,6 @@ def scaled_fp4_experts_quant( blockscale_offsets: torch.Tensor, topk: int, expert_map: Optional[torch.Tensor] = None, - MAX_TOKENS_PER_EXPERT: int = 163840, ) -> tuple[torch.Tensor, torch.Tensor]: """ Quantize input tensor to FP4 and return quantized tensor and scale, for @@ -1107,9 +1106,16 @@ def scaled_fp4_experts_quant( input_tensor = input_tensor[ expert_map] if expert_map is not None else input_tensor m_numtopk, k = input_tensor.shape + # Control the maximum number of tokens per expert supported by the + # NVFP4 MoE Expert Quantization. This is used to prevent the kernel + # from running out of memory. This value can also be increased to support + # larger models. + MAX_TOKENS_PER_EXPERT = envs.VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE assert (m_numtopk <= MAX_TOKENS_PER_EXPERT * topk), ( - f"m_numtopk must be less than MAX_TOKENS_PER_EXPERT * topk for" - f" scaled_fp4_experts_quant kernel, observed m_numtopk = {m_numtopk}") + f"m_numtopk must be less than MAX_TOKENS_PER_EXPERT(" + f"{MAX_TOKENS_PER_EXPERT})" + f" for cutlass_moe_fp4, observed m_numtopk = {m_numtopk}. 
Use" + f" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE to set this value.") scales_k = k // 16 padded_k = (scales_k + (4 - 1)) // 4 diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py index d48462684906a..1007140ef3863 100644 --- a/vllm/attention/backends/mla/common.py +++ b/vllm/attention/backends/mla/common.py @@ -1093,10 +1093,6 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): if isinstance(attn_out, tuple): attn_out, *rest = attn_out - # unpad if necessary - if self._pad_v: - attn_out = attn_out[..., :v.shape[-1]] - # Remain consistent with old `flash_attn_varlen_func` where there # is only one output tensor if `return_softmax_lse` is False. if return_softmax_lse: @@ -1294,6 +1290,10 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): suffix_lse=suffix_lse, ) + # unpad if necessary + if self._pad_v: + output = output[..., :v.shape[-1]] + return output.flatten(start_dim=-2) @abstractmethod diff --git a/vllm/attention/backends/rocm_aiter_mla.py b/vllm/attention/backends/rocm_aiter_mla.py index b048220020f14..c974f2a15a0ef 100644 --- a/vllm/attention/backends/rocm_aiter_mla.py +++ b/vllm/attention/backends/rocm_aiter_mla.py @@ -132,8 +132,6 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): def __init__(self, input_builder: "ModelInputForGPUBuilder"): super().__init__(input_builder) - assert self.runner.model_config.max_model_len == 32768,\ - "AITER MLA requires max model len to be set to 32768" assert self.block_size == 1, "AITER MLA requires only block size 1." 
def prepare(self): diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 8076c4791d3c0..7134472daa605 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -770,8 +770,9 @@ class ROCmFlashAttentionImpl(AttentionImpl): and layer._v_scale and layer._prob_scale and self.kv_cache_dtype == "fp8") full_scales = ( - layer._q_scale, layer._k_scale, layer._v_scale, - layer._prob_scale) if use_fp8_scales else None + layer._q_scale.item(), layer._k_scale.item(), + layer._v_scale.item(), + layer._prob_scale.item()) if use_fp8_scales else None self.triton_attn_func( query, key, @@ -861,7 +862,8 @@ class ROCmFlashAttentionImpl(AttentionImpl): gqa_ratio = num_heads // self.num_kv_heads use_custom = use_rocm_custom_paged_attention( decode_query.dtype, head_size, block_size, gqa_ratio, - decode_meta.max_decode_seq_len, self.sliding_window) + decode_meta.max_decode_seq_len, self.sliding_window, + self.kv_cache_dtype, self.alibi_slopes) if use_custom: max_seq_len = (decode_meta.max_decode_seq_len if self.attn_type != AttentionType.ENCODER_DECODER else diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py index 217db3bf965de..6ca2a64145bd6 100644 --- a/vllm/attention/ops/chunked_prefill_paged_decode.py +++ b/vllm/attention/ops/chunked_prefill_paged_decode.py @@ -264,8 +264,8 @@ def chunked_prefill_paged_decode( # Conversion of FP8 Tensor from uint8 storage to # appropriate torch.dtype for interpretation by Triton if "fp8" in kv_cache_dtype: - assert key_cache.dtype == torch.uint8 - assert value_cache.dtype == torch.uint8 + assert key_cache.dtype in [torch.uint8, current_platform.fp8_dtype()] + assert value_cache.dtype in [torch.uint8, current_platform.fp8_dtype()] if kv_cache_dtype in ("fp8", "fp8_e4m3"): target_dtype = current_platform.fp8_dtype() @@ -283,7 +283,8 @@ def chunked_prefill_paged_decode( use_custom = 
use_rocm_custom_paged_attention(query.dtype, head_size, block_size, num_queries_per_kv, - max_seq_len, sliding_window) + max_seq_len, sliding_window, + kv_cache_dtype, alibi_slopes) if use_custom: _PARTITION_SIZE_ROCM = 256 max_num_partitions = ((max_seq_len + _PARTITION_SIZE_ROCM - 1) // diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index 86d256b630bf5..729b61b029063 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -744,8 +744,8 @@ def context_attention_fwd(q, # Conversion of FP8 Tensor from uint8 storage to # appropriate torch.dtype for interpretation by Triton if "fp8" in kv_cache_dtype: - assert (k_cache.dtype == torch.uint8) - assert (v_cache.dtype == torch.uint8) + assert k_cache.dtype in [torch.uint8, current_platform.fp8_dtype()] + assert v_cache.dtype in [torch.uint8, current_platform.fp8_dtype()] if kv_cache_dtype in ("fp8", "fp8_e4m3"): target_dtype = current_platform.fp8_dtype() diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py index 8940d0b662258..62cfb813d5f94 100644 --- a/vllm/attention/ops/triton_flash_attention.py +++ b/vllm/attention/ops/triton_flash_attention.py @@ -1,236 +1,33 @@ +#!/usr/bin/env python # SPDX-License-Identifier: Apache-2.0 """ Fused Attention =============== -This is a Triton implementation of the Flash Attention v2 algorithm -See https://tridao.me/publications/flash2/flash2.pdf +This is a Triton implementation of the Flash Attention v2 algorithm from Tri Dao +(https://tridao.me/publications/flash2/flash2.pdf) +Credits: OpenAI kernel team, AMD ML Frameworks Triton team -Credits: -AMD Triton kernels team -OpenAI kernel team - -Currently only the forward kernel is supported, and contains these features: +Features supported: 1) Fwd with causal masking -2) Arbitrary Q and KV sequence lengths -3) Arbitrary head sizes -4) Multi and grouped query attention -5) Variable sequence lengths -6) 
ALiBi and matrix bias -7) FP8 support +2) Any sequence lengths without padding (currently fwd kernel only) +3) Support for different sequence lengths for q and k +4) Nested tensor API currently does not support dropout or bias. + +Not currently supported: + +1) Non power of two head dims """ -from typing import Optional - import torch -from vllm import _custom_ops as ops from vllm.platforms import current_platform +from vllm.platforms.rocm import on_gfx1x from vllm.triton_utils import tl, triton -SUPPORTED_LAYOUTS = ['thd', 'bhsd', 'bshd'] - -default_eight_bit_dtype_triton = tl.float8e4b8 -default_eight_bit_dtype_torch = current_platform.fp8_dtype() -default_float8_info = torch.finfo(default_eight_bit_dtype_torch) - -FP8_MIN = triton.language.constexpr(default_float8_info.min) - -# According to https://github.com/vllm-project/vllm/blob/main -# /csrc/quantization/utils.cuh#L31, -# need to make the max for the uz datatype be 224.0 for accuracy reasons. -FP8_MAX = triton.language.constexpr( - default_float8_info.max if default_eight_bit_dtype_torch != - torch.float8_e4m3fnuz else 224.0) - - -class MetaData: - cu_seqlens_q = None - cu_seqlens_k = None - max_seqlens_q = 0 - max_seqlens_k = 0 - bias = None - alibi_slopes = None - causal = False - num_contexts = 0 - varlen = False - eight_bit = False - layout = None - return_encoded_softmax = False - eight_bit_dtype_triton = default_eight_bit_dtype_triton - eight_bit_dtype_torch = default_eight_bit_dtype_torch - output_dtype = None - - # Note about layouts: - # - # thd - [num_tokens, num_heads, head_size] - # bshd - [batch_size, seq_len, num_heads, head_size] - # bhsd - [batch_size, num_heads, seq_len, head_size] - # - # This is for each tensor, all tensors must have same layout. - # Q can have num_heads and seq_len differ from from K and V, - # however K and V must agree on this. - # - # Notes about varlen and bias: - # Only one or the other is implemented, meaning can't combine - # both varlen and bias right now. 
- # - # Note about quantization: - # Only 8-bit quantization supported (for now) and specifically fp8. - # Scales must be tensors. - # o_scale: This is 'output scaling', but comes from parameter called - # 'input_scale', this is applied to the output from the kernel. - # o_scale should be None if none of the other quantization parameters - # are used. - # - # NOTE: Object is in a tentatively good state after initialized, however, - # to verify, call check_args(q,k,v,o) where o is the output tensor. - def __init__( - self, - sm_scale=1.0, - layout=None, # layout can be 'bshd', 'bhsd', or 'thd' - output_dtype=None, - max_seqlens_q=0, - max_seqlens_k=0, - # varlen params - cu_seqlens_q=None, # only 'thd' layout supported for varlen - cu_seqlens_k=None, - # quant params - q_descale=None, - k_descale=None, - v_descale=None, - p_scale=None, - o_scale=None, - # bias params - bias=None, # varlen not implemented for bias - seqlen_q=None, - seqlen_k=None, - # alibi params - alibi_slopes=None, - alibi_batch=None, - alibi_nheads=None, - # causal - causal=None, - ): - self.sm_scale = sm_scale - self.output_dtype = output_dtype - self.max_seqlens_q = max_seqlens_q - self.max_seqlens_k = max_seqlens_k - self.layout = layout - if cu_seqlens_q is not None or cu_seqlens_k is not None: - assert cu_seqlens_q is not None and cu_seqlens_k is not None - assert layout is None or layout not in [ - 'bshd', 'bhsd' - ], "Varlen only implemented for thd layout" - self.set_varlen_params(cu_seqlens_q, cu_seqlens_k) - quant_params = [q_descale, k_descale, v_descale, p_scale, o_scale] - if any(x is not None for x in quant_params): - p_descale = 1.0 / p_scale if p_scale is not None else None - self.set_eight_bit_params(q_descale, k_descale, v_descale, p_scale, - p_descale, o_scale) - if bias is not None: - self.need_bias(bias, seqlen_q, seqlen_k) - if alibi_slopes is not None: - self.need_alibi(alibi_slopes, alibi_batch, alibi_nheads) - if causal is not None and causal: - self.need_causal() - - def 
set_varlen_params(self, cu_seqlens_q, cu_seqlens_k): - self.varlen = True - self.layout = 'thd' - self.cu_seqlens_q = cu_seqlens_q - self.cu_seqlens_k = cu_seqlens_k - # Without "varlen", there should still be one sequence. - assert len(cu_seqlens_q) >= 2 - assert len(cu_seqlens_q) == len(cu_seqlens_k) - self.num_contexts = len(cu_seqlens_q) - 1 - for i in range(0, self.num_contexts): - self.max_seqlens_q = max( - cu_seqlens_q[i + 1].item() - cu_seqlens_q[i].item(), - self.max_seqlens_q) - self.max_seqlens_k = max( - cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item(), - self.max_seqlens_k) - - def set_eight_bit_params(self, q_descale, k_descale, v_descale, p_scale, - p_descale, o_scale): - self.eight_bit = True - self.q_descale = q_descale - self.k_descale = k_descale - self.v_descale = v_descale - self.p_scale = p_scale - self.p_descale = p_descale - self.o_scale = o_scale - self.use_p_scale = (p_scale is not None) and ( - p_descale is not None) and (v_descale is not None) - self.eight_bit_kv = ((q_descale is None) and (k_descale is not None) - and (v_descale is not None)) - self.eight_bit_dtype_torch = default_eight_bit_dtype_torch - - def need_bias(self, bias, seqlen_q, seqlen_k): - assert bias is not None - assert bias.is_cuda - assert bias.dim() == 4 - assert bias.shape[0] == 1 - assert bias.shape[2:] == (seqlen_q, seqlen_k) - self.bias = bias - - def need_alibi(self, alibi_slopes, batch, nheads): - assert alibi_slopes.is_cuda - assert alibi_slopes.dim() == 2 - assert alibi_slopes.shape[0] == batch - assert alibi_slopes.shape[1] == nheads - self.alibi_slopes = alibi_slopes - - def need_causal(self): - self.causal = True - - def check_args(self, q, k, v, o): - assert q.dim() == k.dim() and q.dim() == v.dim() - - batch, nheads_q, nheads_k, head_size = get_shape_from_layout( - q, k, self) - if self.varlen: - assert q.dim() == 3 - assert self.cu_seqlens_q is not None - assert self.cu_seqlens_k is not None - assert len(self.cu_seqlens_q) == len(self.cu_seqlens_k) 
- # TODO: Remove once bias is supported with varlen - assert self.bias is None - assert not self.return_encoded_softmax - else: - assert q.dim() == 4 - assert self.max_seqlens_q > 0 and self.max_seqlens_k > 0 - assert self.cu_seqlens_q is None and self.cu_seqlens_k is None - assert k.shape == v.shape - assert q.shape[-1] == k.shape[-1] and q.shape[-1] == v.shape[-1] - # TODO: Change assert if we support qkl f8 and v f16 - if self.eight_bit: - if self.eight_bit_kv: - assert (v.dtype == k.dtype - and k.dtype == self.eight_bit_dtype_torch) - assert q.dtype != k.dtype - assert (self.v_descale is not None) and (self.k_descale - is not None) - else: - assert (q.dtype == k.dtype and q.dtype == v.dtype - and q.dtype == self.eight_bit_dtype_torch) - assert (self.q_descale - is not None) and (self.k_descale - is not None) and (self.v_descale - is not None) - if self.use_p_scale: - assert (self.p_scale is not None) and (self.p_descale - is not None) - else: - assert (q.dtype == k.dtype) and (q.dtype == v.dtype) - assert head_size <= 256 - assert o.shape == q.shape - assert (nheads_q % nheads_k) == 0 - assert self.layout is not None - assert self.layout == 'thd' or not self.varlen +torch_dtype: tl.constexpr = torch.float16 @triton.jit @@ -243,155 +40,103 @@ def max_fn(x, y): return tl.math.max(x, y) -# Convenience function to load with optional boundary checks. -# "First" is the major dim, "second" is the minor dim. 
@triton.jit -def masked_load(ptrs, offset_first, offset_second, boundary_first, - boundary_second): - if offset_first is not None and offset_second is not None: - mask = (offset_first[:, None] < boundary_first) & \ - (offset_second[None, :] < boundary_second) - tensor = tl.load(ptrs, mask=mask, other=0.0) - elif offset_first is not None: - mask = offset_first[:, None] < boundary_first - tensor = tl.load(ptrs, mask=mask, other=0.0) - elif offset_second is not None: - mask = offset_second[None, :] < boundary_second - tensor = tl.load(ptrs, mask=mask, other=0.0) +def dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride): + ms = tl.arange(0, m) + ns = tl.arange(0, n) + return philox_offset + ms[:, None] * stride + ns[None, :] + + +@triton.jit +def dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride): + rng_offsets = dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, + stride).to(tl.uint32) + # TODO: use tl.randint for better performance + return tl.rand(philox_seed, rng_offsets) + + +@triton.jit +def dropout_mask(philox_seed, philox_offset, dropout_p, m, n, stride): + rng_output = dropout_rng(philox_seed, philox_offset, dropout_p, m, n, + stride) + rng_keep = rng_output > dropout_p + return rng_keep + + +@triton.jit +def load_fn(block_ptr, first, second, pad): + if first and second: + tensor = tl.load(block_ptr, boundary_check=(0, 1), padding_option=pad) + elif first: + tensor = tl.load(block_ptr, boundary_check=(0, ), padding_option=pad) + elif second: + tensor = tl.load(block_ptr, boundary_check=(1, ), padding_option=pad) else: - tensor = tl.load(ptrs) + tensor = tl.load(block_ptr) return tensor -@triton.jit -def compute_alibi_block(alibi_slope, - seqlen_q, - seqlen_k, - offs_m, - offs_n, - transpose=False): - # when seqlen_k and seqlen_q are different we want the diagonal to stick to - # the bottom right of the attention matrix - # for casual mask we want something like this where (1 is kept and 0 is - # masked) - # seqlen_q = 2 
and seqlen_k = 5 - # 1 1 1 1 0 - # 1 1 1 1 1 - # seqlen_q = 5 and seqlen_k = 2 - # 0 0 - # 0 0 - # 0 0 - # 1 0 - # 1 1 - # for alibi the diagonal is 0 indicating no penalty for attending to that - # spot and increasing penalty for attending further from the diagonal - # e.g. alibi_slope = 1, seqlen_q = 2, seqlen_k = 5, - # offs_m = [0, 1, 2, 3], offs_n = [0, 1, 2, 3, 4], transpose = False - # 1. offs_m[:,None] = [[0], - # [1], - # 2. offs_m[:,None] + seqlen_k = [[5], - # [6], - # 3. offs_m[:,None] + seqlen_k - seqlen_q = [[3], - # [4], - # 4. offs_m[:,None] + seqlen_k - seqlen_q - offs_n[None,:] = - # [[3], - [[0, 1, 2, 3, 4]] = [[ 3, 2, 1, 0,-1], [4], [ 4, 3, 2, 1, 0]] - # 5. -1 * alibi_slope * tl.abs(relative_pos_block) = [[ -3, -2, -1, 0,-1], - # [ -4, -3, -2, -1, 0]], - relative_pos_block = (offs_m[:, None] + seqlen_k - seqlen_q - - offs_n[None, :]) - alibi_block = -1 * alibi_slope * tl.abs(relative_pos_block) - if transpose: - return alibi_block.T - else: - return alibi_block - - -def compute_alibi_tensor(alibi_slopes, seqlen_q, seqlen_k): - q_idx = torch.arange(seqlen_q, dtype=torch.int32, - device="cuda").unsqueeze(-1) # (N_CTX_Q, 1) - k_idx = torch.arange(seqlen_k, dtype=torch.int32, - device="cuda").unsqueeze(0) # (1, N_CTX_K) - relative_pos = torch.abs(q_idx + seqlen_k - seqlen_q - - k_idx) # (N_CTX_Q, N_CTX_K) - return -1 * alibi_slopes.unsqueeze(-1).unsqueeze( - -1) * relative_pos # (Z, H, N_CTX_Q, N_CTX_K) - - -@triton.jit -def quant_fp8(x, scale): - x *= scale - x = tl.clamp(x, FP8_MIN, FP8_MAX) - return x - - @triton.jit def _attn_fwd_inner( acc, l_i, m_i, q, - k_ptrs, - v_ptrs, - bias_ptrs, - stride_kn, - stride_vk, - stride_bn, + K_block_ptr, + V_block_ptr, start_m, actual_seqlen_k, - actual_seqlen_q, + dropout_p, philox_seed, batch_philox_offset, - encoded_sm_ptrs, + encoded_softmax_block_ptr, block_min, block_max, offs_n_causal, masked_blocks, n_extra_tokens, - alibi_slope, - q_descale, - k_descale, - v_descale, - p_scale, + bias_ptr, IS_CAUSAL: 
tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr, OFFS_M: tl.constexpr, OFFS_N: tl.constexpr, - SHOULD_PRE_LOAD_V: tl.constexpr, - SHOULD_MASK_STEPS: tl.constexpr, - SHOULD_RETURN_ENCODED_SOFTMAX: tl.constexpr, - USE_PADDED_HEAD: tl.constexpr, - IS_ACTUAL_BLOCK_DMODEL: tl.constexpr, - QK_SCALE: tl.constexpr, - IS_EIGHT_BIT_GEMM: tl.constexpr, - USE_P_SCALE: tl.constexpr, - IS_EIGHT_BIT_KV: tl.constexpr, - QUANT_DTYPE: tl.constexpr = default_eight_bit_dtype_triton, + PRE_LOAD_V: tl.constexpr, + MASK_STEPS: tl.constexpr, + ENABLE_DROPOUT: tl.constexpr, + RETURN_ENCODED_SOFTMAX: tl.constexpr, + PADDED_HEAD: tl.constexpr, + USE_FP8: tl.constexpr, + qk_scale, + p_descale, ): - # loop over k, v, and update accumulator for start_n in range(block_min, block_max, BLOCK_N): # For padded blocks, we will overrun the tensor size if # we load all BLOCK_N. For others, the blocks are all within range. - k_offs_n = start_n + tl.arange(0, - BLOCK_N) if SHOULD_MASK_STEPS else None - k_offs_k = None if not USE_PADDED_HEAD else tl.arange(0, BLOCK_DMODEL) - k = masked_load(k_ptrs, k_offs_k, k_offs_n, IS_ACTUAL_BLOCK_DMODEL, - actual_seqlen_k) - if SHOULD_PRE_LOAD_V: - # We can use the same offsets as k, just with dims transposed. - v = masked_load(v_ptrs, k_offs_n, k_offs_k, actual_seqlen_k, - IS_ACTUAL_BLOCK_DMODEL) + k = load_fn( + K_block_ptr, + PADDED_HEAD, + MASK_STEPS and (n_extra_tokens != 0), + "zero", + ) + if PRE_LOAD_V: + v = load_fn( + V_block_ptr, + MASK_STEPS and (n_extra_tokens != 0), + PADDED_HEAD, + "zero", + ) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) # We start from end of seqlen_k so only the first iteration would need # to be checked for padding if it is not a multiple of block_n # TODO: This can be optimized to only be true for the padded block. 
- if SHOULD_MASK_STEPS: # noqa: SIM102 + if MASK_STEPS: # noqa: SIM102 # If this is the last block / iteration, we want to # mask if the sequence length is not a multiple of block size - # a solution is to always do BLOCK_M // BLOCK_N + 1 steps if not - # is_modulo_mn. last step might get wasted but that is okay. + # a solution is to always do BLOCK_M // BLOCK_N + 1 steps + # if not is_modulo_mn. last step might get wasted but that is okay. # check if this masking works for that case. if (start_n + BLOCK_N == block_max) and (n_extra_tokens != 0): boundary_m = tl.full([BLOCK_M], @@ -404,97 +149,112 @@ def _attn_fwd_inner( causal_boundary = start_n + offs_n_causal causal_mask = OFFS_M[:, None] >= causal_boundary[None, :] qk = tl.where(causal_mask, qk, float("-inf")) - # -- compute qk ---- - if IS_EIGHT_BIT_GEMM: - qk += ((((tl.dot(q, k).to(tl.float32) * q_descale)) * k_descale) * - QK_SCALE) - else: - if IS_EIGHT_BIT_KV: - k = (k * k_descale).to(q.type.element_ty) - qk += (tl.dot(q, k) * QK_SCALE) - - if bias_ptrs is not None: - bias_offs_n = start_n + tl.arange( - 0, BLOCK_N) if SHOULD_MASK_STEPS else None - bias = masked_load(bias_ptrs, OFFS_M, bias_offs_n, actual_seqlen_q, - actual_seqlen_k) - # While bias is added after multiplying qk with sm_scale, - # our optimization to use 2^x instead of e^x results in an - # additional scale factor of log2(e) which we must also multiply - # the bias with. 
- qk += (bias * 1.44269504089) - - if alibi_slope is not None: - # Compute the global position of each token within the sequence - global_m_positions = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - global_n_positions = start_n + tl.arange(0, BLOCK_N) - alibi_block = compute_alibi_block(alibi_slope, actual_seqlen_q, - actual_seqlen_k, - global_m_positions, - global_n_positions) - qk += (alibi_block * 1.44269504089) # scale factor of log2(e) - - # softmax + qk += tl.dot(q, k) + if USE_FP8: + qk *= qk_scale + if bias_ptr is not None: + bias = load_fn(bias_ptr, False, MASK_STEPS + and (n_extra_tokens != 0), "zero") + # While bias is added after multiplying qk with sm_scale, our + # optimization to use 2^x instead of e^x results in an additional + # scale factor of log2(e) which we must also multiply the bias with. + qk += bias * 1.44269504089 m_ij = tl.maximum(m_i, tl.max(qk, 1)) qk = qk - m_ij[:, None] p = tl.math.exp2(qk) # CAVEAT: Must update l_ij before applying dropout l_ij = tl.sum(p, 1) - if SHOULD_RETURN_ENCODED_SOFTMAX: - tl.store(encoded_sm_ptrs, p.to(encoded_sm_ptrs.type.element_ty)) + if ENABLE_DROPOUT: + philox_offset = (batch_philox_offset + + start_m * BLOCK_M * actual_seqlen_k + start_n - + BLOCK_N) + keep = dropout_mask( + philox_seed, + philox_offset, + dropout_p, + BLOCK_M, + BLOCK_N, + actual_seqlen_k, + ) + if RETURN_ENCODED_SOFTMAX: + tl.store( + encoded_softmax_block_ptr, + tl.where(keep, p, + -p).to(encoded_softmax_block_ptr.type.element_ty), + ) + p = tl.where(keep, p, 0.0) + elif RETURN_ENCODED_SOFTMAX: + tl.store( + encoded_softmax_block_ptr, + p.to(encoded_softmax_block_ptr.type.element_ty), + ) # -- update output accumulator -- alpha = tl.math.exp2(m_i - m_ij) acc = acc * alpha[:, None] - if not SHOULD_PRE_LOAD_V: - v = masked_load(v_ptrs, k_offs_n, k_offs_k, actual_seqlen_k, - IS_ACTUAL_BLOCK_DMODEL) + if not PRE_LOAD_V: + v = load_fn( + V_block_ptr, + MASK_STEPS and (n_extra_tokens != 0), + PADDED_HEAD, + "zero", + ) # -- update m_i and l_i 
l_i = l_i * alpha + l_ij # update m_i and l_i m_i = m_ij - if IS_EIGHT_BIT_GEMM: - if USE_P_SCALE: - p = quant_fp8(p, p_scale).to(QUANT_DTYPE) - acc += tl.dot(p, v) - else: - # v is in eight_bit but p is not, we want the gemm in p's type - acc += tl.dot(p, v.to(p.type.element_ty)) - else: - if IS_EIGHT_BIT_KV: - v = (v * v_descale).to(p.type.element_ty) - acc += tl.dot(p.to(v.type.element_ty), v) + if USE_FP8: + p *= p_descale - k_ptrs += BLOCK_N * stride_kn - v_ptrs += BLOCK_N * stride_vk - if bias_ptrs is not None: - bias_ptrs += BLOCK_N * stride_bn - if SHOULD_RETURN_ENCODED_SOFTMAX: - encoded_sm_ptrs += BLOCK_N + acc += tl.dot(p.to(V_block_ptr.type.element_ty), v) + + V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0)) + K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N)) + if bias_ptr is not None: + bias_ptr = tl.advance(bias_ptr, (0, BLOCK_N)) + if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.advance(encoded_softmax_block_ptr, + (0, BLOCK_N)) return acc, l_i, m_i def get_cdna_autotune_configs(): return [ + triton.Config( + { + 'BLOCK_M': 256, + 'BLOCK_N': 64, + 'waves_per_eu': 2, + 'PRE_LOAD_V': False + }, + num_stages=1, + num_warps=8), triton.Config( { 'BLOCK_M': 128, 'BLOCK_N': 128, 'waves_per_eu': 2, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 + 'PRE_LOAD_V': False }, num_stages=1, num_warps=4), + triton.Config( + { + 'BLOCK_M': 256, + 'BLOCK_N': 128, + 'waves_per_eu': 2, + 'PRE_LOAD_V': False + }, + num_stages=1, + num_warps=8), triton.Config( { 'BLOCK_M': 128, 'BLOCK_N': 64, - 'waves_per_eu': 2, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 + 'waves_per_eu': 1, + 'PRE_LOAD_V': False }, num_stages=1, num_warps=4), @@ -503,8 +263,7 @@ def get_cdna_autotune_configs(): 'BLOCK_M': 128, 'BLOCK_N': 64, 'waves_per_eu': 3, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 + 'PRE_LOAD_V': True }, num_stages=1, num_warps=4), @@ -512,26 +271,45 @@ def get_cdna_autotune_configs(): { 'BLOCK_M': 128, 'BLOCK_N': 64, - 'waves_per_eu': 1, - 
'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 + 'waves_per_eu': 3, + 'PRE_LOAD_V': False }, num_stages=1, num_warps=4), triton.Config( { - 'BLOCK_M': 128, - 'BLOCK_N': 32, - 'waves_per_eu': 2, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 + 'BLOCK_M': 64, + 'BLOCK_N': 64, + 'waves_per_eu': 4, + 'PRE_LOAD_V': False }, num_stages=1, - num_warps=4), - ], [ - 'IS_CAUSAL', 'MAX_SEQLENS_Q', 'MAX_SEQLENS_K', - 'IS_ACTUAL_BLOCK_DMODEL', 'VARLEN', 'HQ', 'HK' - ] + num_warps=8), + triton.Config( + { + 'BLOCK_M': 32, + 'BLOCK_N': 32, + 'waves_per_eu': 4, + 'PRE_LOAD_V': False + }, + num_stages=1, + num_warps=8), + # TODO: This config fails with head_size not pow2 with data mismatches. + # triton.Config({'BLOCK_M': 32, 'BLOCK_N': 16, 'waves_per_eu': 1, + # 'PRE_LOAD_V': False}, num_stages=1, num_warps=4), + + # Fails in AccelerateAMDMatmul (Triton) assert when using FP8: + # triton.Config( + # { + # "BLOCK_M": 16, + # "BLOCK_N": 16, + # "waves_per_eu": 1, + # "PRE_LOAD_V": False, + # }, + # num_stages=1, + # num_warps=4, + # ), + ], ['IS_CAUSAL', 'dropout_p', 'BLOCK_DMODEL', 'USE_FP8'] def get_rdna_autotune_configs(): @@ -541,8 +319,7 @@ def get_rdna_autotune_configs(): 'BLOCK_M': 32, 'BLOCK_N': 32, 'waves_per_eu': 4, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 + 'PRE_LOAD_V': False }, num_stages=1, num_warps=2), @@ -551,8 +328,7 @@ def get_rdna_autotune_configs(): 'BLOCK_M': 32, 'BLOCK_N': 32, 'waves_per_eu': 2, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 + 'PRE_LOAD_V': False }, num_stages=1, num_warps=2), @@ -561,8 +337,7 @@ def get_rdna_autotune_configs(): 'BLOCK_M': 32, 'BLOCK_N': 16, 'waves_per_eu': 4, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 + 'PRE_LOAD_V': False }, num_stages=1, num_warps=2), @@ -571,109 +346,57 @@ def get_rdna_autotune_configs(): 'BLOCK_M': 32, 'BLOCK_N': 16, 'waves_per_eu': 2, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 + 'PRE_LOAD_V': False }, num_stages=1, num_warps=2), - triton.Config( - { - 'BLOCK_M': 
16, - 'BLOCK_N': 16, - 'waves_per_eu': 4, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 - }, - num_stages=1, - num_warps=2), - triton.Config( - { - 'BLOCK_M': 16, - 'BLOCK_N': 16, - 'waves_per_eu': 2, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 - }, - num_stages=1, - num_warps=2), - # Fall-back config. - triton.Config( - { - 'BLOCK_M': 16, - 'BLOCK_N': 16, - 'waves_per_eu': 1, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 - }, - num_stages=1, - num_warps=2), - ], [ - 'IS_CAUSAL', 'MAX_SEQLENS_Q', 'MAX_SEQLENS_K', - 'IS_ACTUAL_BLOCK_DMODEL', 'VARLEN', 'HQ', 'HK' - ] - - -def get_general_autotune_configs(): - return [ - triton.Config( - { - 'BLOCK_M': 128, - 'BLOCK_N': 128, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 - }, - num_stages=1, - num_warps=4), - triton.Config( - { - 'BLOCK_M': 128, - 'BLOCK_N': 64, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 - }, - num_stages=1, - num_warps=4), - triton.Config( - { - 'BLOCK_M': 128, - 'BLOCK_N': 32, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 - }, - num_stages=1, - num_warps=4), - ], [ - 'IS_CAUSAL', 'MAX_SEQLENS_Q', 'MAX_SEQLENS_K', - 'IS_ACTUAL_BLOCK_DMODEL', 'VARLEN', 'HQ', 'HK' - ] - - -def has_cdna_target(): - ROCM_CDNA_TARGETS = ["gfx942", "gfx90a", "gfx908"] - return triton.runtime.driver.active.get_current_target( - ).arch in ROCM_CDNA_TARGETS - - -def is_rocm_cdna(): - return current_platform.is_rocm() and has_cdna_target() + # Fails in AccelerateAMDMatmul (Triton) assert when using FP8: + # triton.Config( + # { + # 'BLOCK_M': 16, + # 'BLOCK_N': 16, + # 'waves_per_eu': 4, + # 'PRE_LOAD_V': False + # }, + # num_stages=1, + # num_warps=2), + # triton.Config( + # { + # 'BLOCK_M': 16, + # 'BLOCK_N': 16, + # 'waves_per_eu': 2, + # 'PRE_LOAD_V': False + # }, + # num_stages=1, + # num_warps=2), + # # Fall-back config. 
+ # triton.Config( + # { + # 'BLOCK_M': 16, + # 'BLOCK_N': 16, + # 'waves_per_eu': 1, + # 'PRE_LOAD_V': False + # }, + # num_stages=1, + # num_warps=2), + ], ['IS_CAUSAL', 'dropout_p', 'BLOCK_DMODEL', 'USE_FP8'] def get_autotune_configs(): - if is_rocm_cdna(): - return get_cdna_autotune_configs() - elif current_platform.is_rocm(): + if on_gfx1x(): return get_rdna_autotune_configs() else: - return get_general_autotune_configs() + return get_cdna_autotune_configs() autotune_configs, autotune_keys = get_autotune_configs() +float8_info = torch.finfo(current_platform.fp8_dtype()) + @triton.autotune( configs=autotune_configs, key=autotune_keys, - use_cuda_graph=True, ) @triton.jit def attn_fwd( @@ -681,7 +404,13 @@ def attn_fwd( K, V, bias, - SM_SCALE: tl.constexpr, + sm_scale, + q_scale, + k_scale, + v_scale, + p_scale, + p_descale, + o_descale, L, Out, stride_qz: tl.int64, @@ -704,70 +433,44 @@ def attn_fwd( stride_bh: tl.int64, stride_bm: tl.int64, stride_bn: tl.int64, - stride_az: tl.int64, - stride_ah: tl.int64, - q_descale_ptr, - k_descale_ptr, - p_scale_ptr, - p_descale_ptr, - o_descale_ptr, - v_descale_ptr, - q_descale_has_singleton: tl.constexpr, - k_descale_has_singleton: tl.constexpr, - p_descale_has_singleton: tl.constexpr, - v_descale_has_singleton: tl.constexpr, cu_seqlens_q, cu_seqlens_k, + dropout_p, philox_seed, - NUM_CU: tl.constexpr, - GRID_CU_MULTIP: tl.constexpr, - B: tl.constexpr, philox_offset_base, encoded_softmax, - alibi_slopes, HQ: tl.constexpr, HK: tl.constexpr, - IS_ACTUAL_BLOCK_DMODEL: tl.constexpr, + ACTUAL_BLOCK_DMODEL: tl.constexpr, MAX_SEQLENS_Q: tl.constexpr, MAX_SEQLENS_K: tl.constexpr, VARLEN: tl.constexpr, IS_CAUSAL: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, + USE_FP8: tl.constexpr, + USE_FP8_OUT: tl.constexpr, BLOCK_N: tl.constexpr, - SHOULD_PRE_LOAD_V: tl.constexpr, - USE_BIAS: tl.constexpr, - SHOULD_RETURN_ENCODED_SOFTMAX: tl.constexpr, - USE_ALIBI: tl.constexpr, - IS_EIGHT_BIT: tl.constexpr, - USE_P_SCALE: 
tl.constexpr, - IS_EIGHT_BIT_KV: tl.constexpr, - QUANT_DTYPE: tl.constexpr = default_eight_bit_dtype_triton, + PRE_LOAD_V: tl.constexpr, + BIAS_TYPE: tl.constexpr, + ENABLE_DROPOUT: tl.constexpr, + RETURN_ENCODED_SOFTMAX: tl.constexpr, + FP8_MIN: tl.constexpr = float8_info.min, + FP8_MAX: tl.constexpr = float8_info.max, ): - - if o_descale_ptr is not None: - o_descale = tl.load(o_descale_ptr) - - start_m: tl.int64 = tl.program_id(0) - off_h_q: tl.int64 = tl.program_id(1) - off_z: tl.int64 = tl.program_id(2) - - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M).to(tl.int64) - offs_n = tl.arange(0, BLOCK_N).to(tl.int64) - offs_d = tl.arange(0, BLOCK_DMODEL).to(tl.int64) - - # as we can't have return statements inside while loop in Triton - continue_condition = True - + start_m = tl.program_id(0) + off_h_q = tl.program_id(1) + off_z = tl.program_id(2) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_n = tl.arange(0, BLOCK_N) if VARLEN: cu_seqlens_q_start = tl.load(cu_seqlens_q + off_z) cu_seqlens_q_end = tl.load(cu_seqlens_q + off_z + 1) seqlen_q = cu_seqlens_q_end - cu_seqlens_q_start - # We have a one-size-fits-all grid in id(0). Some seqlens might be - # too small for all start_m so for those we return early. + # We have a one-size-fits-all grid in id(0). Some seqlens might be too + # small for all start_m so for those we return early. if start_m * BLOCK_M > seqlen_q: - continue_condition = False - # return + return cu_seqlens_k_start = tl.load(cu_seqlens_k + off_z) cu_seqlens_k_end = tl.load(cu_seqlens_k + off_z + 1) seqlen_k = cu_seqlens_k_end - cu_seqlens_k_start @@ -777,598 +480,499 @@ def attn_fwd( seqlen_q = MAX_SEQLENS_Q seqlen_k = MAX_SEQLENS_K - if continue_condition: - # Now we compute whether we need to exit early due to causal - # masking. This is because for seqlen_q > seqlen_k, M rows of the - # attn scores are completely masked, resulting in 0s written to the - # output, and inf written to LSE. 
We don't need to do any GEMMs in - # this case. This block of code determines what N is, and if this - # WG is operating on those M rows. - n_blocks = cdiv_fn(seqlen_k, BLOCK_N) - if (IS_CAUSAL): - # If seqlen_q == seqlen_k, the attn scores are a square matrix. - # If seqlen_q != seqlen_k, attn scores are rectangular which - # means the causal mask boundary is bottom right aligned, and - # ends at either the top edge (seqlen_q < seqlen_k) or left - # edge. This captures the decrease in n_blocks if we have a - # rectangular attn matrix - n_blocks_seqlen = cdiv_fn( - (start_m + 1) * BLOCK_M + seqlen_k - seqlen_q, BLOCK_N) - # This is what adjusts the block_max for the current WG, only - # if IS_CAUSAL. Otherwise we want to always iterate through all - # n_blocks - n_blocks = min(n_blocks, n_blocks_seqlen) - # If we have no blocks after adjusting for seqlen deltas, this - # WG is part of the blocks that are all 0. We exit early. - if n_blocks <= 0: - o_offset = (Out + off_z * stride_oz + off_h_q * stride_oh + - cu_seqlens_q_start * stride_om) - o_ptrs = (o_offset + offs_m[:, None] * stride_om + - offs_d[None, :] * stride_on) - acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - o_ptrs_mask = (offs_m[:, None] < seqlen_q).broadcast_to( - [BLOCK_M, BLOCK_DMODEL]) - # We still need to write 0s to the result - tl.store(o_ptrs, acc, mask=o_ptrs_mask) - # The tensor allocated for L is based on MAX_SEQLENS_Q as - # that is statically known. - l_ptrs = (L + off_z * HQ * MAX_SEQLENS_Q + - off_h_q * MAX_SEQLENS_Q + offs_m) - # We store inf to LSE, not -inf because in the bwd pass, - # we subtract this from qk which makes it -inf, such that - # exp(qk - inf) = 0 for these masked blocks. - l_value = tl.full([BLOCK_M], - value=float("inf"), - dtype=tl.float32) - l_ptrs_mask = offs_m < MAX_SEQLENS_Q - tl.store(l_ptrs, l_value, mask=l_ptrs_mask) - # TODO: Should dropout and return encoded softmax be - # handled here too? 
- continue_condition = False - # return + # Now we compute whether we need to exit early due to causal masking. + # This is because for seqlen_q > seqlen_k, M rows of the attn scores + # are completely masked, resulting in 0s written to the output, and + # inf written to LSE. We don't need to do any GEMMs in this case. + # This block of code determines what N is, and if this WG is operating + # on those M rows. + n_blocks = cdiv_fn(seqlen_k, BLOCK_N) + if IS_CAUSAL: + # If seqlen_q == seqlen_k, the attn scores are a square matrix. + # If seqlen_q != seqlen_k, attn scores are rectangular which means + # the causal mask boundary is bottom right aligned, and ends at either + # the top edge (seqlen_q < seqlen_k) or left edge. + # This captures the decrease in n_blocks if we have a rectangular attn + # matrix + n_blocks_seqlen = cdiv_fn( + (start_m + 1) * BLOCK_M + seqlen_k - seqlen_q, BLOCK_N) + # This is what adjusts the block_max for the current WG, only + # if IS_CAUSAL. Otherwise we want to always iterate through all n_blocks + n_blocks = min(n_blocks, n_blocks_seqlen) + # If we have no blocks after adjusting for seqlen deltas, this WG is + # part of the blocks that are all 0. We exit early. + if n_blocks <= 0: + o_offset = (off_z * stride_oz + cu_seqlens_q_start * stride_om + + off_h_q * stride_oh) + O_block_ptr = tl.make_block_ptr( + base=Out + o_offset, + shape=(seqlen_q, BLOCK_DMODEL), + strides=(stride_om, stride_on), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0), + ) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=Out.type.element_ty) + # We still need to write 0s to the result + # tl.store(O_block_ptr, + # acc.to(Out.type.element_ty), boundary_check=(0,1)) + # l_ptrs = L + off_z * HQ * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + # + offs_m + # We store inf to LSE, not -inf because in the bwd pass, + # we subtract this + # from qk which makes it -inf, such that exp(qk - inf) = 0 + # for these masked blocks. 
+ # l = tl.full([BLOCK_M], value=float("inf"), dtype=tl.float32) + # tl.store(l_ptrs, l) + # TODO: Should dropout and return encoded softmax be handled here? + return - if continue_condition: - # If MQA / GQA, set the K and V head offsets appropriately. - GROUP_SIZE: tl.constexpr = HQ // HK - off_h_k = off_h_q // GROUP_SIZE if GROUP_SIZE != 1 else off_h_q - n_extra_tokens = 0 - if seqlen_k < BLOCK_N: - n_extra_tokens = BLOCK_N - seqlen_k - elif seqlen_k % BLOCK_N: - n_extra_tokens = seqlen_k % BLOCK_N - USE_PADDED_HEAD: tl.constexpr = (IS_ACTUAL_BLOCK_DMODEL - != BLOCK_DMODEL) + # If MQA / GQA, set the K and V head offsets appropriately. + GROUP_SIZE: tl.constexpr = HQ // HK + off_h_k = off_h_q // GROUP_SIZE if GROUP_SIZE != 1 else off_h_q - # Compute pointers for all the tensors used in this kernel. - q_offset = (Q + off_z * stride_qz + off_h_q * stride_qh + - cu_seqlens_q_start * stride_qm) - q_ptrs = (q_offset + offs_m[:, None] * stride_qm + - offs_d[None, :] * stride_qk) - k_offset = (K + off_z * stride_kz + off_h_k * stride_kh + - cu_seqlens_k_start * stride_kn) - k_ptrs = (k_offset + offs_d[:, None] * stride_kk + - offs_n[None, :] * stride_kn) - v_offset = (V + off_z * stride_vz + off_h_k * stride_vh + - cu_seqlens_k_start * stride_vk) - v_ptrs = (v_offset + offs_n[:, None] * stride_vk + - offs_d[None, :] * stride_vn) - # Compute pointers for all scale tensors used in this kernel. + n_extra_tokens = 0 + if seqlen_k < BLOCK_N: + n_extra_tokens = BLOCK_N - seqlen_k + elif seqlen_k % BLOCK_N: + n_extra_tokens = seqlen_k % BLOCK_N + padded_head = ACTUAL_BLOCK_DMODEL != BLOCK_DMODEL - IS_EIGHT_BIT_GEMM: tl.constexpr = IS_EIGHT_BIT & ( - not IS_EIGHT_BIT_KV) - if IS_EIGHT_BIT: - if k_descale_has_singleton: - k_descale_ptrs = k_descale_ptr - else: - k_descale_ptrs = k_descale_ptr + off_h_k + # Compute pointers for all the tensors used in this kernel. 
+ q_offset = (off_z * stride_qz + off_h_q * stride_qh + + cu_seqlens_q_start * stride_qm) + Q_block_ptr = tl.make_block_ptr( + base=Q + q_offset, + shape=(seqlen_q, ACTUAL_BLOCK_DMODEL), + strides=(stride_qm, stride_qk), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0), + ) + k_offset = (off_z * stride_kz + off_h_k * stride_kh + + cu_seqlens_k_start * stride_kn) + K_block_ptr = tl.make_block_ptr( + base=K + k_offset, + shape=(ACTUAL_BLOCK_DMODEL, seqlen_k), + strides=(stride_kk, stride_kn), + offsets=(0, 0), + block_shape=(BLOCK_DMODEL, BLOCK_N), + order=(0, 1), + ) + v_offset = (off_z * stride_vz + off_h_k * stride_vh + + cu_seqlens_k_start * stride_vk) + V_block_ptr = tl.make_block_ptr( + base=V + v_offset, + shape=(seqlen_k, ACTUAL_BLOCK_DMODEL), + strides=(stride_vk, stride_vn), + offsets=(0, 0), + block_shape=(BLOCK_N, BLOCK_DMODEL), + order=(1, 0), + ) + if BIAS_TYPE != 0: + bias_ptr = tl.make_block_ptr( + base=bias + off_h_q * stride_bh, + shape=(seqlen_q, seqlen_k), + strides=(stride_bm, stride_bn), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_N), + order=(1, 0), + ) + else: + bias_ptr = None + if ENABLE_DROPOUT: + batch_philox_offset = philox_offset_base \ + + (off_z * HQ + off_h_q) \ + * seqlen_q * seqlen_k + else: + batch_philox_offset = 0 + # We can ask to return the dropout mask without actually doing any dropout. + # In this case, we return an invalid pointer so indicate the mask is not i + # valid. + # TODO: Fix encoded softmax. It currently uses just h_q in the base offset. 
+ if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.make_block_ptr( + base=encoded_softmax + off_h_q * seqlen_q * seqlen_k, + shape=(seqlen_q, seqlen_k), + strides=(seqlen_k, 1), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_N), + order=(1, 0), + ) + else: + encoded_softmax_block_ptr = 0 + # initialize pointer to m and l + m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + # scale sm_scale by log_2(e) and use 2^x in the loop as we do not + # have native e^x support in HW. + qk_scale = sm_scale * 1.44269504089 + # Q is loaded once at the beginning and shared by all N blocks. + q = load_fn(Q_block_ptr, True, padded_head, "zero") + if not USE_FP8: + q = (q * qk_scale).to(Q_block_ptr.type.element_ty) + acc_scale = 1.0 + else: + qk_scale *= q_scale * k_scale + acc_scale = p_scale * v_scale - if v_descale_has_singleton: - v_descale_ptrs = v_descale_ptr - else: - v_descale_ptrs = v_descale_ptr + off_h_k + # Here we compute how many full and masked blocks we have. + padded_block_k = n_extra_tokens != 0 + is_modulo_mn = not padded_block_k and (seqlen_q % BLOCK_M == 0) + if IS_CAUSAL: + # There are always at least BLOCK_M // BLOCK_N masked blocks. + # Additionally there might be one more due to dissimilar seqlens. + masked_blocks = BLOCK_M // BLOCK_N + (not is_modulo_mn) + else: + # Padding on Q does not need to be masked in the FA loop. + masked_blocks = padded_block_k + # if IS_CAUSAL, not is_modulo_mn does not always result in an additional + # block. In this case we might exceed n_blocks so pick the min. + masked_blocks = min(masked_blocks, n_blocks) + n_full_blocks = n_blocks - masked_blocks + block_min = 0 + block_max = n_blocks * BLOCK_N + # Compute for full blocks. Here we set causal to false regardless of its + # value because there is no masking. Similarly we do not need padding. 
+ if n_full_blocks > 0: + block_max = (n_blocks - masked_blocks) * BLOCK_N + acc, l_i, m_i = _attn_fwd_inner( + acc, + l_i, + m_i, + q, + K_block_ptr, + V_block_ptr, + start_m, + seqlen_k, + dropout_p, + philox_seed, + batch_philox_offset, + encoded_softmax_block_ptr, + # _, _, offs_n_causal, masked_blocks, n_extra_tokens, _ + block_min, + block_max, + 0, + 0, + 0, + bias_ptr, + # IS_CAUSAL, .... + False, + BLOCK_M, + BLOCK_DMODEL, + BLOCK_N, + offs_m, + offs_n, + # _, MASK_STEPS, ... + PRE_LOAD_V, + False, + ENABLE_DROPOUT, + RETURN_ENCODED_SOFTMAX, + padded_head, + USE_FP8, + qk_scale, + p_descale, + ) + block_min = block_max + block_max = n_blocks * BLOCK_N - if not IS_EIGHT_BIT_KV: - if q_descale_has_singleton: - q_descale_ptrs = q_descale_ptr - else: - q_descale_ptrs = q_descale_ptr + off_h_q - if USE_P_SCALE: - if p_descale_has_singleton: - p_scale_ptrs = p_scale_ptr - p_descale_ptrs = p_descale_ptr - else: - p_scale_ptrs = p_scale_ptr + off_h_q - p_descale_ptrs = p_descale_ptr + off_h_q + tl.debug_barrier() + # Remaining blocks, if any, are full / not masked. + if masked_blocks > 0: + offs_n_causal = offs_n + (seqlen_q - seqlen_k) if IS_CAUSAL else 0 + K_block_ptr = tl.advance(K_block_ptr, (0, n_full_blocks * BLOCK_N)) + V_block_ptr = tl.advance(V_block_ptr, (n_full_blocks * BLOCK_N, 0)) + if bias_ptr is not None: + bias_ptr = tl.advance(bias_ptr, (0, n_full_blocks * BLOCK_N)) + if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.advance(encoded_softmax_block_ptr, + (0, n_full_blocks)) + acc, l_i, m_i = _attn_fwd_inner( + acc, + l_i, + m_i, + q, + K_block_ptr, + V_block_ptr, + start_m, + seqlen_k, + dropout_p, + philox_seed, + batch_philox_offset, + encoded_softmax_block_ptr, + block_min, + block_max, + offs_n_causal, + masked_blocks, + n_extra_tokens, + bias_ptr, + IS_CAUSAL, + BLOCK_M, + BLOCK_DMODEL, + BLOCK_N, + offs_m, + offs_n, + # _, MASK_STEPS, ... 
+ PRE_LOAD_V, + True, + ENABLE_DROPOUT, + RETURN_ENCODED_SOFTMAX, + padded_head, + USE_FP8, + qk_scale, + p_descale, + ) + # epilogue - if USE_BIAS: - bias_offset = off_h_q * stride_bh - bias_ptrs = (bias + bias_offset + offs_m[:, None] * stride_bm + - offs_n[None, :] * stride_bn) - else: - bias_ptrs = None + if USE_FP8: + acc *= acc_scale + acc = acc / l_i[:, None] + if ENABLE_DROPOUT: + acc = acc / (1 - dropout_p) + # If seqlen_q > seqlen_k but the delta is not a multiple of BLOCK_M, + # then we have one block with a row of all NaNs which come from computing + # softmax over a row of all -infs (-inf - inf = NaN). We check for that here + # and store 0s where there are NaNs as these rows should've been zeroed out. + end_m_idx = (start_m + 1) * BLOCK_M + start_m_idx = start_m * BLOCK_M + causal_start_idx = seqlen_q - seqlen_k + if USE_FP8_OUT: + acc *= o_descale + acc = tl.clamp(acc, FP8_MIN, FP8_MAX) + acc = acc.to(Out.type.element_ty) + if IS_CAUSAL: # noqa: SIM102 + if causal_start_idx > start_m_idx and causal_start_idx < end_m_idx: + out_mask_boundary = tl.full((BLOCK_DMODEL, ), + causal_start_idx, + dtype=tl.int32) + mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M) + out_ptrs_mask = (mask_m_offsets[:, None] + >= out_mask_boundary[None, :]) + z = tl.zeros((1, ), tl.float32) + acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty)) + # write back LSE + # l_ptrs = L + off_z * HQ * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m + # If seqlen_q not multiple of BLOCK_M, we need to mask out the last + # few rows. This is only true for the last M block. For others, + # overflow_size will be -ve + # overflow_size = end_m_idx - seqlen_q + # if overflow_size > 0: + # boundary = tl.full((BLOCK_M,), BLOCK_M - overflow_size, dtype=tl.int32) + # # This is a > check because mask being 0 blocks the store. 
+ # l_ptrs_mask = boundary > tl.arange(0, BLOCK_M) + # tl.store(l_ptrs, m_i + tl.math.log2(l_i), mask=l_ptrs_mask) + # else: + # tl.store(l_ptrs, m_i + tl.math.log2(l_i)) - if USE_ALIBI: - a_offset = off_z * stride_az + off_h_q * stride_ah - alibi_slope = tl.load(alibi_slopes + a_offset) - else: - alibi_slope = None - - batch_philox_offset = 0 - # We can ask to return the dropout mask without doing any - # dropout. In this case, we return an invalid pointer so - # indicate the mask is not valid. - if SHOULD_RETURN_ENCODED_SOFTMAX: - encoded_sm_base = (encoded_softmax + - off_h_q * seqlen_q * seqlen_k) - encoded_sm_ptrs = (encoded_sm_base + - offs_m[:, None] * seqlen_k + - offs_n[None, :]) - else: - encoded_sm_ptrs = None - # initialize pointer to m and l - m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) - l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32) - acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - # scale sm_scale by log_2(e) and use 2^x in the loop as we do - # not have native e^x support in HW. - QK_SCALE: tl.constexpr = SM_SCALE * 1.44269504089 - # Q is loaded once at the beginning and shared by all N blocks. - q_ptrs_mask = offs_m[:, None] < seqlen_q - if USE_PADDED_HEAD: - q_ptrs_mask = q_ptrs_mask & (offs_d[None, :] - < IS_ACTUAL_BLOCK_DMODEL) - q = tl.load(q_ptrs, mask=q_ptrs_mask, other=0.0) - - if IS_EIGHT_BIT: - k_descale = tl.load(k_descale_ptrs) - v_descale = tl.load(v_descale_ptrs) - q_descale = None if IS_EIGHT_BIT_KV else tl.load( - q_descale_ptrs) - if USE_P_SCALE: - p_scale = tl.load(p_scale_ptrs) - p_descale = tl.load(p_descale_ptrs) - else: - p_scale = None - p_descale = None - else: - q_descale = None - k_descale = None - v_descale = None - p_scale = None - p_descale = None - # Here we compute how many full and masked blocks we have. - padded_block_k = n_extra_tokens != 0 - is_modulo_mn = not padded_block_k and (seqlen_q % BLOCK_M == 0) - if IS_CAUSAL: - # There are always at least BLOCK_M // BLOCK_N masked - # blocks. 
Additionally there might be one more due to - # dissimilar seqlens. - masked_blocks = BLOCK_M // BLOCK_N + (not is_modulo_mn) - else: - # Padding on Q does not need to be masked in the FA loop. - masked_blocks = padded_block_k - # if IS_CAUSAL, not is_modulo_mn does not always result in an - # additional block. In this case we might exceed n_blocks so - # pick the min. - masked_blocks = min(masked_blocks, n_blocks) - n_full_blocks = n_blocks - masked_blocks - block_min = 0 - block_max = n_blocks * BLOCK_N - # Compute for full blocks. Here we set causal to false - # regardless of its actual value because there is no masking. - # Similarly we do not need padding. - if n_full_blocks > 0: - block_max = (n_blocks - masked_blocks) * BLOCK_N - acc, l_i, m_i = _attn_fwd_inner( - acc, - l_i, - m_i, - q, - k_ptrs, - v_ptrs, - bias_ptrs, - stride_kn, - stride_vk, - stride_bn, - start_m, - seqlen_k, - seqlen_q, - philox_seed, - batch_philox_offset, - encoded_sm_ptrs, - # _, _, offs_n_causal, masked_blocks, n_extra_tokens, _ - block_min, - block_max, - 0, - 0, - 0, - alibi_slope, - q_descale, - k_descale, - v_descale, - p_scale, - # IS_CAUSAL, .... - False, - BLOCK_M, - BLOCK_DMODEL, - BLOCK_N, - offs_m, - offs_n, - # _, SHOULD_MASK_STEPS, ... - SHOULD_PRE_LOAD_V, - False, - SHOULD_RETURN_ENCODED_SOFTMAX, - USE_PADDED_HEAD, - IS_ACTUAL_BLOCK_DMODEL, - QK_SCALE, - IS_EIGHT_BIT_GEMM, - USE_P_SCALE, - IS_EIGHT_BIT_KV, - QUANT_DTYPE) - block_min = block_max - block_max = n_blocks * BLOCK_N - - tl.debug_barrier() - # Remaining blocks, if any, are full / not masked. 
- if (masked_blocks > 0): - if IS_CAUSAL: - offs_n_causal = offs_n + (seqlen_q - seqlen_k) - else: - offs_n_causal = 0 - k_ptrs += n_full_blocks * BLOCK_N * stride_kn - v_ptrs += n_full_blocks * BLOCK_N * stride_vk - if USE_BIAS: - bias_ptrs += n_full_blocks * BLOCK_N * stride_bn - if SHOULD_RETURN_ENCODED_SOFTMAX: - encoded_sm_ptrs += n_full_blocks * BLOCK_N - acc, l_i, m_i = _attn_fwd_inner( - acc, - l_i, - m_i, - q, - k_ptrs, - v_ptrs, - bias_ptrs, - stride_kn, - stride_vk, - stride_bn, - start_m, - seqlen_k, - seqlen_q, - philox_seed, - batch_philox_offset, - encoded_sm_ptrs, - block_min, - block_max, - offs_n_causal, - masked_blocks, - n_extra_tokens, - alibi_slope, - q_descale, - k_descale, - v_descale, - p_scale, - IS_CAUSAL, - BLOCK_M, - BLOCK_DMODEL, - BLOCK_N, - offs_m, - offs_n, - # _, SHOULD_MASK_STEPS, ... - SHOULD_PRE_LOAD_V, - True, - SHOULD_RETURN_ENCODED_SOFTMAX, - USE_PADDED_HEAD, - IS_ACTUAL_BLOCK_DMODEL, - QK_SCALE, - IS_EIGHT_BIT_GEMM, - USE_P_SCALE, - IS_EIGHT_BIT_KV, - QUANT_DTYPE) - - if IS_EIGHT_BIT and not IS_EIGHT_BIT_KV: - if USE_P_SCALE: - acc *= p_descale - acc *= v_descale - - # epilogue - # This helps the compiler do Newton Raphson on l_i vs on acc - # which is much larger. - l_recip = 1 / l_i[:, None] - acc = acc * l_recip - - # If seqlen_q > seqlen_k but the delta is not a multiple of - # BLOCK_M, then we have one block with a row of all NaNs which - # come from computing softmax over a row of all - # -infs (-inf - inf = NaN). We check for that here and store 0s - # where there are NaNs as these rows should've been zeroed out. 
- end_m_idx = (start_m + 1) * BLOCK_M - start_m_idx = start_m * BLOCK_M - causal_start_idx = seqlen_q - seqlen_k - if IS_EIGHT_BIT and not IS_EIGHT_BIT_KV: # noqa: SIM102 - if o_descale_ptr is not None: - acc = quant_fp8(acc, o_descale) - - acc = acc.to(Out.type.element_ty) - if IS_CAUSAL: # noqa: SIM102 - if (causal_start_idx > start_m_idx - and causal_start_idx < end_m_idx): - out_mask_boundary = tl.full((BLOCK_DMODEL, ), - causal_start_idx, - dtype=tl.int32) - mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M) - out_ptrs_mask = (mask_m_offsets[:, None] - >= out_mask_boundary[None, :]) - z = tl.zeros((1, ), tl.float32) - acc = tl.where(out_ptrs_mask, acc, - z.to(acc.type.element_ty)) - # write back LSE - l_ptrs = (L + off_z * HQ * MAX_SEQLENS_Q + - off_h_q * MAX_SEQLENS_Q + offs_m) - # If seqlen_q not multiple of BLOCK_M, we need to mask out the - # last few rows. This is only true for the last M block. - # For others, overflow_size will be -ve - overflow_size = end_m_idx - seqlen_q - if overflow_size > 0: - boundary = tl.full((BLOCK_M, ), - BLOCK_M - overflow_size, - dtype=tl.int32) - l_ptrs_mask = tl.arange(0, BLOCK_M) < boundary - tl.store(l_ptrs, m_i + tl.math.log2(l_i), mask=l_ptrs_mask) - else: - tl.store(l_ptrs, m_i + tl.math.log2(l_i)) - - # write back O - o_offset = (Out + off_z * stride_oz + off_h_q * stride_oh + - cu_seqlens_q_start * stride_om) - o_ptrs = (o_offset + offs_m[:, None] * stride_om + - offs_d[None, :] * stride_on) - o_ptrs_mask = tl.full([BLOCK_M, BLOCK_DMODEL], 1, dtype=tl.int1) - if overflow_size > 0: - o_ptrs_mask = o_ptrs_mask & (offs_m[:, None] < seqlen_q) - if USE_PADDED_HEAD: - o_ptrs_mask = o_ptrs_mask & (offs_d[None, :] - < IS_ACTUAL_BLOCK_DMODEL) - tl.store(o_ptrs, acc.to(Out.dtype.element_ty), mask=o_ptrs_mask) + # write back O + o_offset = (off_z * stride_oz + cu_seqlens_q_start * stride_om + + off_h_q * stride_oh) + O_block_ptr = tl.make_block_ptr( + base=Out + o_offset, + shape=(seqlen_q, ACTUAL_BLOCK_DMODEL), + 
def check_args(
    q,
    k,
    v,
    o,
    varlen=True,
    max_seqlens=None,
    cu_seqlens_q=None,
    cu_seqlens_k=None,
):
    """Validate the tensors handed to the Triton attention forward pass.

    Args:
        q, k, v: Query/key/value tensors. They must share rank and dtype,
            ``k`` and ``v`` must have identical shapes, and all three must
            agree on the last (head-size) dimension. The number of heads is
            read from dimension 1.
        o: Output tensor; must have exactly the shape of ``q``.
        varlen: When true the tensors must be 3-D and both cumulative
            sequence-length arguments are required. When false the tensors
            must be 4-D and ``max_seqlens`` must be a positive number.
        max_seqlens: Only consulted when ``varlen`` is false.
        cu_seqlens_q, cu_seqlens_k: Only consulted when ``varlen`` is true;
            must be non-``None`` and of equal length.

    Raises:
        AssertionError: If any of the constraints above is violated.
    """
    assert q.dim() == k.dim() and q.dim() == v.dim(), (
        "q, k and v must have the same number of dimensions")
    if varlen:
        assert q.dim() == 3, "varlen inputs must be 3-dimensional"
        assert cu_seqlens_q is not None, "cu_seqlens_q is required"
        assert cu_seqlens_k is not None, "cu_seqlens_k is required"
        assert len(cu_seqlens_q) == len(cu_seqlens_k), (
            "cu_seqlens_q and cu_seqlens_k must have the same length")
    else:
        assert q.dim() == 4, "non-varlen inputs must be 4-dimensional"
        # Guard against the ``None`` default explicitly: comparing ``None``
        # with an int raises TypeError rather than a validation failure.
        assert max_seqlens is not None and max_seqlens > 0, (
            "max_seqlens must be a positive number for non-varlen inputs")

    # Both layouts keep the head count in dim 1 and the head size last.
    nheads_q, head_size = q.shape[1], q.shape[-1]
    nheads_k = k.shape[1]

    assert k.shape == v.shape, "k and v must have identical shapes"
    assert q.shape[-1] == k.shape[-1] and q.shape[-1] == v.shape[-1], (
        "q, k and v must share the head size")
    # TODO: Change assert if we support qkl f8 and v f16
    assert q.dtype == k.dtype and q.dtype == v.dtype, (
        "q, k and v must share a dtype")
    assert head_size <= 256, "head size above 256 is not supported"
    assert o.shape == q.shape, "o must have the same shape as q"
    assert (nheads_q % nheads_k) == 0, (
        "number of query heads must be a multiple of the number of kv heads")


class _attention(torch.autograd.Function):
    """Autograd wrapper that launches the ``attn_fwd`` Triton kernel.

    Only the forward pass is defined here, and it always runs in
    variable-length (packed) mode.
    """

    @staticmethod
    def forward(
        ctx,
        q,
        k,
        v,
        o,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seqlens_q,
        max_seqlens_k,
        causal=False,
        sm_scale=1.0,
        bias=None,
        fp8_scales=None,
        fp8_out_scale=None,
    ):
        """Run variable-length attention and return ``(o, encoded_softmax)``.

        Args:
            ctx: Autograd context; launch parameters are stashed on it.
            q, k, v: 3-D tensors whose dims are read as
                (tokens, heads, head_size).
            o: Output tensor, or ``None`` to allocate one shaped like ``q``
                with the dtype of ``v``.
            cu_seqlens_q, cu_seqlens_k: Cumulative sequence lengths; the
                batch size is ``len(cu_seqlens_q) - 1``.
            max_seqlens_q, max_seqlens_k: Maximum sequence lengths; the
                query one sizes the first grid axis.
            causal: Forwarded to the kernel as ``IS_CAUSAL``.
            sm_scale: Softmax scale forwarded to the kernel.
            bias: Optional tensor; its first four strides are forwarded.
            fp8_scales: Optional ``(q_scale, k_scale, v_scale, p_scale)``.
                When given, any of q/k/v not already in the platform fp8
                dtype is divided by its scale, clamped and cast to fp8.
            fp8_out_scale: Optional tensor; ``1 / fp8_out_scale.item()`` is
                passed to the kernel as the output descale.

        Returns:
            A tuple ``(o, encoded_softmax)``; ``encoded_softmax`` is always
            ``None`` in this implementation.
        """
        if fp8_scales is not None:
            use_fp8 = True
            (q_scale, k_scale, v_scale, p_scale) = fp8_scales
            float8 = current_platform.fp8_dtype()

            def check_and_convert(t, scale):
                # NOTE(review): float8_info is not defined in this block;
                # presumably a module-level finfo for the fp8 dtype - confirm.
                if t.dtype != float8:
                    descale = 1.0 / scale
                    ts = (t * descale).clamp(min=float8_info.min,
                                             max=float8_info.max)
                    return ts.to(float8)
                else:
                    return t

            q = check_and_convert(q, q_scale)
            k = check_and_convert(k, k_scale)
            v = check_and_convert(v, v_scale)
        else:
            use_fp8 = False
            q_scale = k_scale = v_scale = p_scale = 1.0

        if o is None:
            o = torch.empty_like(q, dtype=v.dtype)

        check_args(
            q,
            k,
            v,
            o,
            varlen=True,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
        )

        # Variable-length layout: (tokens, heads, head_size).
        nheads_q, head_size = q.shape[1], q.shape[2]
        nheads_k = k.shape[1]
        batch = len(cu_seqlens_q) - 1
        # Strides are handed over as (0, head, token, dim); the leading 0
        # presumably stands in for a batch stride - TODO confirm against the
        # attn_fwd signature.
        q_strides = (0, q.stride(1), q.stride(0), q.stride(2))
        k_strides = (0, k.stride(1), k.stride(0), k.stride(2))
        v_strides = (0, v.stride(1), v.stride(0), v.stride(2))
        o_strides = (0, o.stride(1), o.stride(0), o.stride(2))

        # Round the head size up to the smallest size the kernel runs
        # unpadded. An ordered tuple (rather than a set) guarantees the
        # smallest fitting size is chosen regardless of iteration order.
        unpadded_head_dims = (32, 64, 128, 256)
        padded_d_model = next(
            (dim for dim in unpadded_head_dims if dim >= head_size), None)
        assert padded_d_model is not None

        grid = lambda META: (
            triton.cdiv(max_seqlens_q, META["BLOCK_M"]),
            nheads_q,
            batch,
        )

        encoded_softmax = None

        # Seed the RNG so we get reproducible results for testing.
        philox_seed = 0x1BF52
        philox_offset = 0x1D4B42

        if bias is not None:
            bias_strides = (
                bias.stride(0),
                bias.stride(1),
                bias.stride(2),
                bias.stride(3),
            )
        else:
            bias_strides = (0, 0, 0, 0)

        p_descale = 1.0 / p_scale
        o_descale = 1.0 / fp8_out_scale.item(
        ) if fp8_out_scale is not None else 1.0

        # On gfx1x the MAX_SEQLENS_* kernel arguments are passed as 0.
        arg_max_seqlens_q = 0 if on_gfx1x() else max_seqlens_q
        arg_max_seqlens_k = 0 if on_gfx1x() else max_seqlens_k

        attn_fwd[grid](
            q,
            k,
            v,
            bias,
            sm_scale,
            q_scale,
            k_scale,
            v_scale,
            p_scale,
            p_descale,
            o_descale,
            # NOTE(review): positional slot left empty; looks like an
            # optional kernel output that is unused here - confirm.
            None,
            o,
            *q_strides,
            *k_strides,
            *v_strides,
            *o_strides,
            *bias_strides,
            cu_seqlens_q,
            cu_seqlens_k,
            dropout_p=0.0,
            philox_seed=philox_seed,
            philox_offset_base=philox_offset,
            encoded_softmax=encoded_softmax,
            HQ=nheads_q,
            HK=nheads_k,
            ACTUAL_BLOCK_DMODEL=head_size,
            MAX_SEQLENS_Q=arg_max_seqlens_q,
            MAX_SEQLENS_K=arg_max_seqlens_k,
            IS_CAUSAL=causal,
            VARLEN=True,
            BLOCK_DMODEL=padded_d_model,
            BIAS_TYPE=0 if bias is None else 1,
            ENABLE_DROPOUT=False,
            RETURN_ENCODED_SOFTMAX=False,
            USE_FP8=use_fp8,
            USE_FP8_OUT=fp8_out_scale is not None,
        )

        ctx.grid = grid
        ctx.sm_scale = sm_scale
        ctx.BLOCK_DMODEL = head_size
        ctx.causal = causal
        ctx.dropout_p = 0.0
        ctx.philox_seed = philox_seed
        ctx.philox_offset = philox_offset
        ctx.encoded_softmax = encoded_softmax
        ctx.return_encoded_softmax = False
        return o, encoded_softmax
q_scale) - k = maybe_quantize_fp8(k, k_scale) - v = maybe_quantize_fp8(v, v_scale) - - return q, k, v - - -# query - [num_tokens, num_heads, head_size] -# key - [num_tokens, num_kv_heads, head_size] -# value - [num_tokens, num_kv_heads, head_size -# output - [num_tokens, num_heads, head_size] -def triton_attention( - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - o: torch.Tensor, - cu_seqlens_q: torch.Tensor, - cu_seqlens_k: torch.Tensor, - max_seqlens_q: int, - max_seqlens_k: int, - causal: bool = False, - sm_scale: float = 1.0, - bias: Optional[torch.Tensor] = None, - fp8_scales: Optional[tuple[float, ...]] = None, - input_scale: Optional[torch.Tensor] = None, -) -> torch.Tensor: - if fp8_scales is not None: - q_descale, k_descale, v_descale, p_scale = fp8_scales - else: - q_descale = k_descale = v_descale = p_scale = None - - attn_metadata = MetaData(sm_scale=sm_scale, - max_seqlens_q=max_seqlens_q, - max_seqlens_k=max_seqlens_k, - causal=causal, - bias=bias, - output_dtype=q.dtype, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - q_descale=q_descale, - k_descale=k_descale, - v_descale=v_descale, - p_scale=p_scale, - o_scale=input_scale) - - if fp8_scales is not None: - q, k, v = check_and_maybe_quantize_qkv(q, k, v, fp8_scales) - - return triton_attention_rocm(q, k, v, o, attn_metadata) +triton_attention = _attention.apply diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index 4bced779785af..87cf333f7f0a1 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -29,41 +29,42 @@ def apply_softcap(S, x): @triton.jit def kernel_unified_attention_2d( - output_ptr, # [num_tokens, num_query_heads, head_size] - query_ptr, # [num_tokens, num_query_heads, head_size] - key_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] - value_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] - block_tables_ptr, # [num_seqs, 
max_num_blocks_per_seq] - seq_lens_ptr, # [num_seqs] - alibi_slopes_ptr, # [num_query_heads] - scale, # float32 - k_scale, # float32 - v_scale, # float32 - softcap, # float32 - num_query_heads: tl.constexpr, # int - num_queries_per_kv: tl.constexpr, # int - block_table_stride: tl.int64, # int - query_stride_0: tl.int64, # int - query_stride_1: tl.int64, # int, should be equal to head_size - output_stride_0: tl.int64, # int - output_stride_1: tl.int64, # int, should be equal to head_size - BLOCK_SIZE: tl.constexpr, # int - HEAD_SIZE: tl.constexpr, # int - HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 - USE_ALIBI_SLOPES: tl.constexpr, # bool - USE_SOFTCAP: tl.constexpr, # bool - SLIDING_WINDOW: tl.constexpr, # int - stride_k_cache_0: tl.int64, # int - stride_k_cache_1: tl.int64, # int - stride_k_cache_2: tl.int64, # int - stride_k_cache_3: tl.constexpr, # int - stride_v_cache_0: tl.int64, # int - stride_v_cache_1: tl.int64, # int - stride_v_cache_2: tl.int64, # int - stride_v_cache_3: tl.constexpr, # int - query_start_len_ptr, # [num_seqs+1] - BLOCK_Q: tl.constexpr, # int - num_seqs: tl.int32, + output_ptr, # [num_tokens, num_query_heads, head_size] + query_ptr, # [num_tokens, num_query_heads, head_size] + key_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + value_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes_ptr, # [num_query_heads] + scale, # float32 + k_scale, # float32 + v_scale, # float32 + softcap, # float32 + num_query_heads: tl.constexpr, # int + num_queries_per_kv: tl.constexpr, # int + block_table_stride: tl.int64, # int + query_stride_0: tl.int64, # int + query_stride_1: tl.int64, # int, should be equal to head_size + output_stride_0: tl.int64, # int + output_stride_1: tl.int64, # int, should be equal to head_size + BLOCK_SIZE: tl.constexpr, # int + HEAD_SIZE: tl.constexpr, # int + HEAD_SIZE_PADDED: tl.constexpr, # int, 
must be power of 2 + USE_ALIBI_SLOPES: tl.constexpr, # bool + USE_SOFTCAP: tl.constexpr, # bool + SLIDING_WINDOW: tl.constexpr, # int + stride_k_cache_0: tl.int64, # int + stride_k_cache_1: tl.int64, # int + stride_k_cache_2: tl.int64, # int + stride_k_cache_3: tl.constexpr, # int + stride_v_cache_0: tl.int64, # int + stride_v_cache_1: tl.int64, # int + stride_v_cache_2: tl.int64, # int + stride_v_cache_3: tl.constexpr, # int + query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + num_seqs: tl.int32, + BLOCK_M: tl.constexpr, # int ): q_block_global_idx = tl.program_id(0) @@ -94,15 +95,13 @@ def kernel_unified_attention_2d( if q_block_local_idx * BLOCK_Q >= cur_batch_query_len: return - offs_m = tl.arange(0, BLOCK_Q * num_queries_per_kv) + offs_m = tl.arange(0, BLOCK_M) offs_d = tl.arange(0, HEAD_SIZE_PADDED) - query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv query_offset_0 = cur_batch_in_all_start_index + query_pos query_offset_1 = kv_head_idx * num_queries_per_kv + \ offs_m % num_queries_per_kv - query_offset = (query_offset_0[:, None] * query_stride_0 + query_offset_1[:, None] * query_stride_1 + offs_d[None, :]) @@ -110,7 +109,7 @@ def kernel_unified_attention_2d( query_mask_0 = tl.where(query_pos < cur_batch_query_len, 1, 0).to(tl.int1) query_mask_1 = tl.where(query_offset_1 < num_query_heads, 1, 0).to(tl.int1) - # Q : (BLOCK_Q * num_queries_per_kv, HEAD_SIZE,) + # Q : (BLOCK_M, HEAD_SIZE_PADDED) Q = tl.load( query_ptr + query_offset, mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], @@ -119,12 +118,9 @@ def kernel_unified_attention_2d( block_table_offset = seq_idx * block_table_stride - M = tl.full([BLOCK_Q * num_queries_per_kv], - float("-inf"), - dtype=tl.float32) - L = tl.full([BLOCK_Q * num_queries_per_kv], 1.0, dtype=tl.float32) - acc = tl.zeros([BLOCK_Q * num_queries_per_kv, HEAD_SIZE_PADDED], - dtype=tl.float32) + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + L = tl.full([BLOCK_M], 
1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32) # sequence len for this particular sequence seq_len = tl.load(seq_lens_ptr + seq_idx) @@ -183,13 +179,12 @@ def kernel_unified_attention_2d( else: V = V_load - seq_offset = j * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + seq_offset = j * BLOCK_SIZE + offs_n seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1 - # S : (BLOCK_Q * num_queries_per_kv, BLOCK_SIZE,) - S = tl.zeros(shape=(BLOCK_Q * num_queries_per_kv, BLOCK_SIZE), - dtype=tl.float32) + # S : (BLOCK_M, BLOCK_SIZE) + S = tl.zeros(shape=(BLOCK_M, BLOCK_SIZE), dtype=tl.float32) S += scale * tl.dot(Q, K) @@ -207,29 +202,29 @@ def kernel_unified_attention_2d( S += alibi_slope[:, None] * (seq_offset - context_len) # compute running maximum - # m_j : (BLOCK_Q * num_queries_per_kv,) + # m_j : (BLOCK_M,) m_j = tl.maximum(M, tl.max(S, axis=1)) # For sliding window there's a chance the max is -inf due to masking of # the entire row. In this case we need to set m_j 0 to avoid NaN m_j = tl.where(m_j > float("-inf"), m_j, 0.0) - # P : (BLOCK_Q * num_queries_per_kv, BLOCK_SIZE,) + # P : (BLOCK_M, BLOCK_SIZE) P = tl.exp(S - m_j[:, None]) - # l_j : (BLOCK_Q * num_queries_per_kv,) + # l_j : (BLOCK_M,) l_j = tl.sum(P, axis=1) - # alpha : (BLOCK_Q * num_queries_per_kv, ) + # alpha : (BLOCK_M, ) alpha = tl.exp(M - m_j) - # acc : (BLOCK_Q * num_queries_per_kv, BLOCK_SIZE,) + # acc : (BLOCK_M, HEAD_SIZE_PADDED) acc = acc * alpha[:, None] # update constants L = L * alpha + l_j M = m_j - # acc : (BLOCK_Q * num_queries_per_kv, BLOCK_SIZE,) + # acc : (BLOCK_M, HEAD_SIZE_PADDED) acc += tl.dot(P.to(V.dtype), V) # epilogue @@ -334,4 +329,5 @@ def unified_attention( query_start_len_ptr=cu_seqlens_q, BLOCK_Q=BLOCK_Q, num_seqs=num_seqs, + BLOCK_M=BLOCK_M, ) diff --git a/vllm/beam_search.py b/vllm/beam_search.py index 967510abaeb9b..ddacc669551b9 100644 --- a/vllm/beam_search.py +++ b/vllm/beam_search.py @@ -3,6 +3,7 @@ from dataclasses 
import dataclass from typing import TYPE_CHECKING, Any, Optional, Union +from vllm.lora.request import LoRARequest from vllm.sequence import Logprob if TYPE_CHECKING: @@ -19,6 +20,7 @@ class BeamSearchSequence: # The tokens includes the prompt. tokens: list[int] logprobs: list[dict[int, Logprob]] + lora_request: Optional[LoRARequest] = None cum_logprob: float = 0.0 text: Optional[str] = None finish_reason: Optional[str] = None @@ -41,6 +43,7 @@ class BeamSearchInstance: def __init__( self, prompt_tokens: list[int], + lora_request: Optional[LoRARequest] = None, logprobs: Optional[list[dict[int, Logprob]]] = None, **kwargs, ): @@ -48,6 +51,7 @@ class BeamSearchInstance: BeamSearchSequence( tokens=prompt_tokens, logprobs=[] if logprobs is None else list(logprobs), + lora_request=lora_request, **kwargs, ) ] diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index fab44fb6062d1..35cc303f60eeb 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -9,11 +9,7 @@ generation. Supported dataset types include: - BurstGPT - HuggingFace - VisionArena - -TODO: Implement CustomDataset to parse a JSON file and convert its contents into -SampleRequest instances, similar to the approach used in ShareGPT. 
""" - import base64 import io import json @@ -27,12 +23,14 @@ from io import BytesIO from typing import Any, Callable, Optional, Union import numpy as np +import pandas as pd from PIL import Image from transformers import PreTrainedTokenizerBase from vllm.lora.request import LoRARequest from vllm.lora.utils import get_adapter_absolute_path from vllm.multimodal import MultiModalDataDict +from vllm.multimodal.image import convert_image_mode from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer logger = logging.getLogger(__name__) @@ -62,6 +60,7 @@ class SampleRequest: class BenchmarkDataset(ABC): DEFAULT_SEED = 0 + IS_MULTIMODAL = False def __init__( self, @@ -129,16 +128,17 @@ class BenchmarkDataset(ABC): Args: tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no - LoRA is selected. max_loras (Optional[int]): The maximum number of - LoRAs available. If None, LoRA is not used. lora_path - (Optional[str]): Path to the LoRA parameters on disk. If None, LoRA - is not used. + LoRA is selected. + max_loras (Optional[int]): The maximum number of LoRAs available. + If `None`, LoRA is not used. + lora_path (Optional[str]): Path to the LoRA parameters on disk. + If `None`, LoRA is not used. Returns: - tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first - element is a LoRARequest (or None if not applicable) and the second - element is the tokenizer associated with the LoRA request (or the - base tokenizer). + A tuple with the following elements: + - A new [LoRARequest][] (or `None` if not applicable). + - The tokenizer associated with the LoRA request + (or the base tokenizer). """ if max_loras is None or lora_path is None: return None, tokenizer @@ -167,7 +167,7 @@ class BenchmarkDataset(ABC): Args: tokenizer (PreTrainedTokenizerBase): The tokenizer to be used - for processing the dataset's text. + for processing the dataset's text. num_requests (int): The number of sample requests to generate. 
Returns: @@ -184,7 +184,8 @@ class BenchmarkDataset(ABC): Args: requests (List[SampleRequest]): The current list of sampled - requests. num_requests (int): The target number of requests. + requests. + num_requests (int): The target number of requests. """ if len(requests) < num_requests: random.seed(self.random_seed) @@ -259,7 +260,7 @@ def process_image(image: Any) -> Mapping[str, Any]: if isinstance(image, dict) and 'bytes' in image: image = Image.open(BytesIO(image['bytes'])) if isinstance(image, Image.Image): - image = image.convert("RGB") + image = convert_image_mode(image, "RGB") with io.BytesIO() as image_data: image.save(image_data, format="JPEG") image_base64 = base64.b64encode( @@ -314,13 +315,15 @@ class RandomDataset(BenchmarkDataset): ) vocab_size = tokenizer.vocab_size + num_special_tokens = tokenizer.num_special_tokens_to_add() + real_input_len = input_len - num_special_tokens prefix_token_ids = (np.random.randint( 0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else []) # New sampling logic: [X * (1 - b), X * (1 + b)] - input_low = int(input_len * (1 - range_ratio)) - input_high = int(input_len * (1 + range_ratio)) + input_low = int(real_input_len * (1 - range_ratio)) + input_high = int(real_input_len * (1 + range_ratio)) output_low = int(output_len * (1 - range_ratio)) output_high = int(output_len * (1 + range_ratio)) @@ -343,6 +346,17 @@ class RandomDataset(BenchmarkDataset): vocab_size).tolist() token_sequence = prefix_token_ids + inner_seq prompt = tokenizer.decode(token_sequence) + # After decoding the prompt we have to encode and decode it again. + # This is done because in some cases N consecutive tokens + # give a string tokenized into != N number of tokens. + # For example for GPT2Tokenizer: + # [6880, 6881] -> ['Ġcalls', 'here'] -> + # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere'] + # To avoid uncontrolled change of the prompt length, + # the encoded sequence is truncated before being decoded again.
class CustomDataset(BenchmarkDataset):
    """
    Benchmark dataset backed by a user-supplied JSONL file.

    Every line of the file is a JSON object with a "prompt" field, and one
    sample request is generated per line. E.g.,
    ```
    {"prompt": "What is the capital of India?"}
    {"prompt": "What is the capital of Iran?"}
    {"prompt": "What is the capital of China?"}
    ```
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        """Read ``dataset_path`` into ``self.data`` and shuffle it.

        ``self.data`` ends up as a list of dictionaries, one per row, e.g.
        ``[{"prompt": "What is the capital of India?"}, ...]``. This is the
        standardized format that ``sample()`` relies on.

        Raises:
            ValueError: If no path was given or the file lacks a "prompt"
                column.
            NotImplementedError: If the path does not end in ".jsonl".
        """
        path = self.dataset_path
        if path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        self.data = []

        # Only the JSONL file type is understood for now.
        if not path.endswith(".jsonl"):
            raise NotImplementedError(
                "Only JSONL format is supported for CustomDataset.")

        frame = pd.read_json(path_or_buf=path, lines=True)
        if "prompt" not in frame.columns:
            raise ValueError("JSONL file must contain a 'prompt' column.")

        # One dictionary per DataFrame row.
        self.data.extend(row.to_dict() for _, row in frame.iterrows())

        random.seed(self.random_seed)
        random.shuffle(self.data)

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        lora_path: Optional[str] = None,
        max_loras: Optional[int] = None,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        skip_chat_template: bool = False,
        **kwargs,
    ) -> list:
        """Build up to ``num_requests`` sample requests from the loaded rows.

        Unless ``skip_chat_template`` is set, each prompt is wrapped as a
        single user message and rendered through the tokenizer's chat
        template. The prompt length is the token count of the final text.
        """

        def render(raw_prompt):
            # Leave the prompt untouched when templating is disabled.
            if skip_chat_template:
                return raw_prompt
            return tokenizer.apply_chat_template(
                [{
                    "role": "user",
                    "content": raw_prompt
                }],
                add_generation_prompt=True,
                tokenize=False,
            )

        sampled_requests = []
        for record in self.data:
            if len(sampled_requests) >= num_requests:
                break
            text = render(record["prompt"])
            token_count = len(tokenizer(text).input_ids)
            request = SampleRequest(
                prompt=text,
                prompt_len=token_count,
                expected_output_len=output_len,
            )
            sampled_requests.append(request)

        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
class MTBenchDataset(HuggingFaceDataset):
    """
    MT-Bench Dataset.
    https://huggingface.co/datasets/philschmid/mt-bench

    Only the first turn of every item is used, giving a single-turn
    dataset. This mirrors the spec decoding benchmark setup in vLLM
    https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18
    """  # noqa: E501

    DEFAULT_OUTPUT_LEN = 256  # avg len used in SD bench in vLLM
    SUPPORTED_DATASET_PATHS = {
        "philschmid/mt-bench",
    }

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        **kwargs,
    ) -> list:
        """Return up to ``num_requests`` chat-templated first-turn prompts.

        ``output_len`` falls back to ``DEFAULT_OUTPUT_LEN`` when ``None``.
        """
        if output_len is None:
            output_len = self.DEFAULT_OUTPUT_LEN

        sampled_requests = []
        for entry in self.data:
            if len(sampled_requests) >= num_requests:
                break

            # Wrap the first turn as a lone user message and render it.
            messages = [{
                "role": "user",
                "content": entry["turns"][0]
            }]
            text = tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=False,
            )

            token_count = len(tokenizer(text).input_ids)
            request = SampleRequest(
                prompt=text,
                prompt_len=token_count,
                expected_output_len=output_len,
            )
            sampled_requests.append(request)

        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
class ASRDataset(HuggingFaceDataset):
    """
    Dataset class for processing an ASR dataset for transcription.
    Tested on the following set:

    +----------------+----------------------------------------+--------------------------+-----------------------------+
    | Dataset        | Domain                                 | Speaking Style           | hf-subset                   |
    +----------------+----------------------------------------+--------------------------+-----------------------------+
    | TED-LIUM       | TED talks                              | Oratory                  | release1, release2, release3|
    |                |                                        |                          | release3-speaker-adaptation |
    | VoxPopuli      | European Parliament                    | Oratory                  | en, de, it, fr, ...         |
    | LibriSpeech    | Audiobook                              | Narrated                 | "LIUM/tedlium"              |
    | GigaSpeech     | Audiobook, podcast, YouTube            | Narrated, spontaneous    | xs, s, m, l, xl, dev, test  |
    | SPGISpeech     | Financial meetings                     | Oratory, spontaneous     | S, M, L, dev, test          |
    | AMI            | Meetings                               | Spontaneous              | ihm, sdm                    |
    +----------------+----------------------------------------+--------------------------+-----------------------------+

    """  # noqa: E501

    SUPPORTED_DATASET_PATHS = {
        "openslr/librispeech_asr",
        "facebook/voxpopuli",
        "LIUM/tedlium",
        "edinburghcstr/ami",
        "speechcolab/gigaspeech",
        "kensho/spgispeech",
    }

    DEFAULT_OUTPUT_LEN = 128
    IS_MULTIMODAL = True

    # TODO Whisper-specific. Abstract interface when more models are supported.
    TRANSCRIPTION_PREAMBLE = (
        "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>")
    # When true, clips longer than MAX_AUDIO_DURATION_S are dropped.
    skip_long_audios: bool = True
    # Whisper max supported duration, in seconds. Kept as a class attribute
    # so subclasses targeting other models can override it.
    MAX_AUDIO_DURATION_S = 30

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: Optional[int] = None,
        **kwargs,
    ) -> list:
        """Build up to ``num_requests`` transcription requests.

        Every request shares the fixed transcription preamble as its prompt
        and carries the clip as ``{"audio": (samples, sampling_rate)}``.
        Clips longer than ``MAX_AUDIO_DURATION_S`` are skipped while
        ``skip_long_audios`` is true, and the number skipped is logged.

        Raises:
            ImportError: If ``librosa`` is not installed.
        """
        try:
            import librosa
        except ImportError as e:
            raise ImportError(
                "librosa is required for ASRDataset. Please install it "
                "using `pip install librosa`.") from e

        output_len = (output_len
                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
        prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
        # The prompt is identical for all requests, so tokenize it once.
        prompt_len = len(tokenizer(prompt).input_ids)
        sampled_requests = []
        skipped = 0
        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            audio = item["audio"]
            y, sr = audio["array"], audio["sampling_rate"]
            duration_s = librosa.get_duration(y=y, sr=sr)
            if self.skip_long_audios and duration_s > self.MAX_AUDIO_DURATION_S:
                skipped += 1
                continue

            mm_content = {"audio": (y, sr)}
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
                ))
        if skipped:
            logger.warning(
                "%d samples discarded from dataset due to"
                " their length being greater than"
                " what Whisper supports.",
                skipped,
            )
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
async def async_request_openai_chat_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Send one streaming chat-completions request and time the response.

    The prompt (plus optional multi-modal content) is posted as a single
    user message. While the stream is consumed, the time to the first
    chunk carrying ``choices``, the gaps between later such chunks, the
    concatenated text and the reported completion-token count are recorded.
    Any exception is captured as a traceback string in ``output.error``
    instead of being raised.

    Args:
        request_func_input: Request description (URL, model, prompt, ...).
        pbar: Optional progress bar, advanced by one when the request ends.

    Returns:
        The populated ``RequestFuncOutput``.
    """
    api_url = request_func_input.api_url
    assert api_url.endswith(("chat/completions", "profile")), (
        "OpenAI Chat Completions API URL must end with 'chat/completions'.")

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        message_parts = [{"type": "text", "text": request_func_input.prompt}]
        if request_func_input.multi_modal_content:
            message_parts.append(request_func_input.multi_modal_content)

        model_id = request_func_input.model_name or request_func_input.model
        payload = {
            "model": model_id,
            "messages": [
                {
                    "role": "user",
                    "content": message_parts
                },
            ],
            "temperature": 0.0,
            "max_completion_tokens": request_func_input.output_len,
            "stream": True,
            "stream_options": {
                "include_usage": True,
            },
        }
        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)

        api_key = os.environ.get('OPENAI_API_KEY')
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        }

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        text_so_far = ""
        first_token_latency = 0.0
        start = time.perf_counter()
        last_seen = start
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status != 200:
                    output.error = response.reason or ""
                    output.success = False
                else:
                    async for raw_line in response.content:
                        raw_line = raw_line.strip()
                        if not raw_line:
                            continue

                        event = raw_line.decode("utf-8").removeprefix(
                            "data: ")
                        if event == "[DONE]":
                            continue

                        now = time.perf_counter()
                        data = json.loads(event)

                        if choices := data.get("choices"):
                            delta_text = choices[0]["delta"].get("content")
                            if first_token_latency == 0.0:
                                # First token
                                first_token_latency = now - start
                                output.ttft = first_token_latency
                            else:
                                # Decoding phase
                                output.itl.append(now - last_seen)

                            text_so_far += delta_text or ""
                        elif usage := data.get("usage"):
                            output.output_tokens = usage.get(
                                "completion_tokens")

                        last_seen = now

                    output.generated_text = text_so_far
                    output.success = True
                    output.latency = last_seen - start
        except Exception:
            output.success = False
            output.error = traceback.format_exc()

    if pbar:
        pbar.update(1)
    return output
+ + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: + content = [{"type": "text", "text": request_func_input.prompt}] + payload = { + "model": + request_func_input.model_name + if request_func_input.model_name else request_func_input.model, + "temperature": + 0.0, + "max_completion_tokens": + request_func_input.output_len, + "stream": + True, + "language": + "en", + # Flattened due to multipart/form-data + "stream_include_usage": + True, + "stream_continuous_usage_stats": + True, + } + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + + # Send audio file + def to_bytes(y, sr): + buffer = io.BytesIO() + soundfile.write(buffer, y, sr, format="WAV") + buffer.seek(0) + return buffer + + with to_bytes(*request_func_input.multi_modal_content["audio"]) as f: + form = aiohttp.FormData() + form.add_field("file", f, content_type="audio/wav") + for key, value in payload.items(): + form.add_field(key, str(value)) + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, + data=form, + headers=headers) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data: ") + if chunk != "[DONE]": + timestamp = time.perf_counter() + data = json.loads(chunk) + + if choices := data.get("choices"): + content = choices[0]["delta"].get( + "content") + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append( + timestamp - most_recent_timestamp) + + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + 
"completion_tokens") + + most_recent_timestamp = timestamp + + output.generated_text = generated_text + output.success = True + output.latency = most_recent_timestamp - st + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + # TODO: Add more request functions for different API protocols. ASYNC_REQUEST_FUNCS = { - "openai-comp": async_request_openai_completions, + "vllm": async_request_openai_completions, + "openai": async_request_openai_completions, + "openai-chat": async_request_openai_chat_completions, + "openai-audio": async_request_openai_audio, } + +OPENAI_COMPATIBLE_BACKENDS = [ + k for k, v in ASYNC_REQUEST_FUNCS.items() + if v in (async_request_openai_completions, + async_request_openai_chat_completions) +] diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py index 06f6848f50cb4..c9e03cc3bf781 100644 --- a/vllm/benchmarks/latency.py +++ b/vllm/benchmarks/latency.py @@ -6,13 +6,12 @@ import dataclasses import json import os import time -from pathlib import Path from typing import Any, Optional import numpy as np -import torch from tqdm import tqdm +import vllm.envs as envs from vllm import LLM, SamplingParams from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format, write_to_json) @@ -59,13 +58,6 @@ def add_cli_args(parser: argparse.ArgumentParser): action="store_true", help="profile the generation process of a single batch", ) - parser.add_argument( - "--profile-result-dir", - type=str, - default=None, - help=("path to save the pytorch profiler output. 
Can be visualized " - "with ui.perfetto.dev or Tensorboard."), - ) parser.add_argument( "--output-json", type=str, @@ -80,11 +72,17 @@ def add_cli_args(parser: argparse.ArgumentParser): ) parser = EngineArgs.add_cli_args(parser) + # V1 enables prefix caching by default which skews the latency + # numbers. We need to disable prefix caching by default. + parser.set_defaults(enable_prefix_caching=False) def main(args: argparse.Namespace): print(args) - + if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: + raise OSError( + "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " + "Please set it to a valid path to use torch profiler.") engine_args = EngineArgs.from_cli_args(args) # NOTE(woosuk): If the request cannot be processed in a single batch, @@ -128,16 +126,9 @@ def main(args: argparse.Namespace): def run_to_completion(profile_dir: Optional[str] = None): if profile_dir: - with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - on_trace_ready=torch.profiler.tensorboard_trace_handler( - str(profile_dir)), - ) as p: - llm_generate() - print(p.key_averages().table(sort_by="self_cuda_time_total")) + llm.start_profile() + llm_generate() + llm.stop_profile() else: start_time = time.perf_counter() llm_generate() @@ -150,10 +141,7 @@ def main(args: argparse.Namespace): run_to_completion(profile_dir=None) if args.profile: - profile_dir = args.profile_result_dir - if not profile_dir: - profile_dir = (Path(".") / "vllm_benchmark_result" / - f"latency_result_{time.time()}") + profile_dir = envs.VLLM_TORCH_PROFILER_DIR print(f"Profiling (results will be saved to '{profile_dir}')...") run_to_completion(profile_dir=profile_dir) return diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index dc0ec32194866..858a0c6a00e4b 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -7,7 +7,7 @@ to launch the vLLM OpenAI API server: On the client side, run: vllm bench 
serve \ - --endpoint-type <endpoint_type. Default 'openi-comp'> \ + --endpoint-type <endpoint_type. Default 'openai'> \ --label <benchmark result label. Default using endpoint_type> \ --model <your_model> \ --dataset-name <dataset_name. Default 'random'> \ @@ -22,7 +22,7 @@ import os import random import time import warnings -from collections.abc import AsyncGenerator +from collections.abc import AsyncGenerator, Iterable from dataclasses import dataclass from datetime import datetime from typing import Any, Optional @@ -31,7 +31,14 @@ import numpy as np from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase +from vllm.benchmarks.datasets import (AIMODataset, ASRDataset, BurstGPTDataset, + ConversationDataset, HuggingFaceDataset, + InstructCoderDataset, MTBenchDataset, + NextEditPredictionDataset, RandomDataset, + SampleRequest, ShareGPTDataset, + SonnetDataset, VisionArenaDataset) from vllm.benchmarks.endpoint_request_func import (ASYNC_REQUEST_FUNCS, + OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput, RequestFuncOutput) from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format, @@ -71,53 +78,18 @@ class BenchmarkMetrics: percentiles_e2el_ms: list[tuple[float, float]] -def sample_random_requests( - prefix_len: int, - input_len: int, - output_len: int, - num_prompts: int, - range_ratio: float, - tokenizer: PreTrainedTokenizerBase, -) -> list[tuple[str, int, int]]: - prefix_token_ids = np.random.randint(0, - tokenizer.vocab_size, - size=prefix_len).tolist() - - input_lens = np.random.randint( - int(input_len * range_ratio), - input_len + 1, - size=num_prompts, - ) - output_lens = np.random.randint( - int(output_len * range_ratio), - output_len + 1, - size=num_prompts, - ) - offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts) - input_requests = [] - for i in range(num_prompts): - prompt = tokenizer.decode(prefix_token_ids + - [(offsets[i] + i + j) % tokenizer.vocab_size - for j in range(input_lens[i])]) - - 
input_requests.append((prompt, int(prefix_len + input_lens[i]), - int(output_lens[i]), None)) - - return input_requests - - async def get_request( - input_requests: list[tuple[str, int, int]], + input_requests: list[SampleRequest], request_rate: float, burstiness: float = 1.0, -) -> AsyncGenerator[tuple[str, int, int], None]: +) -> AsyncGenerator[SampleRequest, None]: """ Asynchronously generates requests at a specified rate with OPTIONAL burstiness. Args: input_requests: - A list of input requests, each represented as a tuple. + A list of input requests, each represented as a SampleRequest. request_rate: The rate at which requests are generated (requests/s). burstiness (optional): @@ -129,7 +101,7 @@ async def get_request( in more bursty requests, while a higher burstiness value (burstiness > 1) results in a more uniform arrival of requests. """ - input_requests = iter(input_requests) + input_requests: Iterable[SampleRequest] = iter(input_requests) # Calculate scale parameter theta to maintain the desired request_rate. 
assert burstiness > 0, ( @@ -151,7 +123,7 @@ async def get_request( def calculate_metrics( - input_requests: list[tuple[str, int, int]], + input_requests: list[SampleRequest], outputs: list[RequestFuncOutput], dur_s: float, tokenizer: PreTrainedTokenizerBase, @@ -184,7 +156,7 @@ def calculate_metrics( if outputs[i].success: output_len = outputs[i].output_tokens - if output_len is None: + if not output_len: # We use the tokenizer to count the number of output tokens # for some serving backends instead of looking at # len(outputs[i].itl) since multiple output tokens may be @@ -194,7 +166,7 @@ def calculate_metrics( tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids) actual_output_lens.append(output_len) - total_input += input_requests[i][1] + total_input += input_requests[i].prompt_len tpot = 0 if output_len > 1: latency_minus_ttft = outputs[i].latency - outputs[i].ttft @@ -277,19 +249,19 @@ async def benchmark( model_id: str, model_name: str, tokenizer: PreTrainedTokenizerBase, - input_requests: list[tuple[str, int, int]], + input_requests: list[SampleRequest], logprobs: Optional[int], - best_of: int, request_rate: float, burstiness: float, disable_tqdm: bool, profile: bool, selected_percentile_metrics: list[str], - selected_percentiles: list[str], + selected_percentiles: list[float], ignore_eos: bool, goodput_config_dict: dict[str, float], max_concurrency: Optional[int], - lora_modules: Optional[list[str]], + lora_modules: Optional[Iterable[str]], + extra_body: Optional[dict], ): if endpoint_type in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[endpoint_type] @@ -298,11 +270,13 @@ async def benchmark( print("Starting initial single prompt test run...") test_prompt, test_prompt_len, test_output_len, test_mm_content = ( - input_requests[0]) - if endpoint_type != "openai-chat" and test_mm_content is not None: - # multi-modal benchmark is only available on OpenAI Chat endpoint. 
- raise ValueError("Multi-modal content is only supported on " - "'openai-chat' endpoint_type.") + input_requests[0].prompt, + input_requests[0].prompt_len, + input_requests[0].expected_output_len, + input_requests[0].multi_modal_data, + ) + + assert test_mm_content is None or isinstance(test_mm_content, dict) test_input = RequestFuncInput( model=model_id, model_name=model_name, @@ -311,9 +285,9 @@ async def benchmark( prompt_len=test_prompt_len, output_len=test_output_len, logprobs=logprobs, - best_of=best_of, multi_modal_content=test_mm_content, ignore_eos=ignore_eos, + extra_body=extra_body, ) test_output = await request_func(request_func_input=test_input) @@ -338,9 +312,9 @@ async def benchmark( prompt_len=test_prompt_len, output_len=test_output_len, logprobs=logprobs, - best_of=best_of, multi_modal_content=test_mm_content, - ignore_eos=ignore_eos) + ignore_eos=ignore_eos, + extra_body=extra_body) profile_output = await request_func(request_func_input=profile_input) if profile_output.success: print("Profiler started") @@ -374,7 +348,12 @@ async def benchmark( benchmark_start_time = time.perf_counter() tasks: list[asyncio.Task] = [] async for request in get_request(input_requests, request_rate, burstiness): - prompt, prompt_len, output_len, mm_content = request + prompt, prompt_len, output_len, mm_content = ( + request.prompt, + request.prompt_len, + request.expected_output_len, + request.multi_modal_data, + ) req_model_id, req_model_name = model_id, model_name if lora_modules: req_lora_module = next(lora_modules) @@ -387,9 +366,9 @@ async def benchmark( prompt_len=prompt_len, output_len=output_len, logprobs=logprobs, - best_of=best_of, multi_modal_content=mm_content, - ignore_eos=ignore_eos) + ignore_eos=ignore_eos, + extra_body=extra_body) tasks.append( asyncio.create_task( limited_request_func(request_func_input=request_func_input, @@ -405,7 +384,6 @@ async def benchmark( prompt_len=test_prompt_len, output_len=test_output_len, logprobs=logprobs, - 
best_of=best_of, ) profile_output = await request_func(request_func_input=profile_input) if profile_output.success: @@ -567,7 +545,7 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--endpoint-type", type=str, - default="openai-comp", + default="openai", choices=list(ASYNC_REQUEST_FUNCS.keys()), ) parser.add_argument( @@ -596,9 +574,16 @@ def add_cli_args(parser: argparse.ArgumentParser): "--dataset-name", type=str, default="random", - choices=["random"], + choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"], help="Name of the dataset to benchmark on.", ) + parser.add_argument( + "--dataset-path", + type=str, + default=None, + help="Path to the sharegpt/sonnet dataset. " + "Or the huggingface dataset ID if using HF dataset.", + ) parser.add_argument( "--max-concurrency", type=int, @@ -624,13 +609,6 @@ def add_cli_args(parser: argparse.ArgumentParser): help= "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 ) - parser.add_argument( - "--best-of", - type=int, - default=1, - help="Generates `best_of` sequences per prompt and " - "returns the best one.", - ) parser.add_argument("--use-beam-search", action="store_true") parser.add_argument( "--num-prompts", @@ -691,6 +669,17 @@ def add_cli_args(parser: argparse.ArgumentParser): action="store_true", help="Specify to save benchmark results to a json file", ) + parser.add_argument( + "--save-detailed", + action="store_true", + help="When saving the results, whether to include per request " + "information such as response, error, ttfs, tpots, etc.", + ) + parser.add_argument( + "--append-result", + action="store_true", + help="Append the benchmark result to the existing json file.", + ) parser.add_argument( "--metadata", metavar="KEY=VALUE", @@ -733,6 +722,7 @@ def add_cli_args(parser: argparse.ArgumentParser): default="99", help="Comma-separated list of percentiles for selected metrics. " "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". 
" + "Default value is \"99\"." "Use \"--percentile-metrics\" to select metrics.", ) parser.add_argument( @@ -745,7 +735,41 @@ def add_cli_args(parser: argparse.ArgumentParser): "separated by spaces. Allowed request level metric names are " "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of " "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " - "and the blog: https://hao-ai-lab.github.io/blogs/distserve") + "and the blog: https://hao-ai-lab.github.io/blogs/distserve", + ) + + # group for dataset specific arguments + sonnet_group = parser.add_argument_group("sonnet dataset options") + sonnet_group.add_argument( + "--sonnet-input-len", + type=int, + default=550, + help= + "Number of input tokens per request, used only for sonnet dataset.", + ) + sonnet_group.add_argument( + "--sonnet-output-len", + type=int, + default=150, + help= + "Number of output tokens per request, used only for sonnet dataset.", + ) + sonnet_group.add_argument( + "--sonnet-prefix-len", + type=int, + default=200, + help= + "Number of prefix tokens per request, used only for sonnet dataset.", + ) + + sharegpt_group = parser.add_argument_group("sharegpt dataset options") + sharegpt_group.add_argument( + "--sharegpt-output-len", + type=int, + default=None, + help="Output length for each request. Overrides the output length " + "from the ShareGPT dataset.", + ) random_group = parser.add_argument_group("random dataset options") random_group.add_argument( @@ -765,9 +789,11 @@ def add_cli_args(parser: argparse.ArgumentParser): random_group.add_argument( "--random-range-ratio", type=float, - default=1.0, - help="Range of sampled ratio of input/output length, " - "used only for random sampling.", + default=0.0, + help="Range ratio for sampling input/output length, " + "used only for random sampling. 
Must be in the range [0, 1) to define " + "a symmetric sampling range" + "[length * (1 - range_ratio), length * (1 + range_ratio)].", ) random_group.add_argument( "--random-prefix-len", @@ -778,6 +804,54 @@ def add_cli_args(parser: argparse.ArgumentParser): " request is [random-prefix-len, " " random-prefix-len + random-prefix-len * random-range-ratio).") + hf_group = parser.add_argument_group("hf dataset options") + hf_group.add_argument("--hf-subset", + type=str, + default=None, + help="Subset of the HF dataset.") + hf_group.add_argument("--hf-split", + type=str, + default=None, + help="Split of the HF dataset.") + hf_group.add_argument( + "--hf-output-len", + type=int, + default=None, + help="Output length for each request. Overrides the output lengths " + "from the sampled HF dataset.", + ) + + sampling_group = parser.add_argument_group("sampling parameters") + sampling_group.add_argument( + "--top-p", + type=float, + default=None, + help="Top-p sampling parameter. Only has effect on " + "openai-compatible backends.", + ) + sampling_group.add_argument( + "--top-k", + type=int, + default=None, + help="Top-k sampling parameter. Only has effect on " + "openai-compatible backends.", + ) + sampling_group.add_argument( + "--min-p", + type=float, + default=None, + help="Min-p sampling parameter. Only has effect on " + "openai-compatible backends.", + ) + sampling_group.add_argument( + "--temperature", + type=float, + default=None, + help="Temperature sampling parameter. Only has effect on " + "openai-compatible backends. If not specified, default to greedy " + "decoding (i.e. temperature==0.0).", + ) + parser.add_argument( '--tokenizer-mode', type=str, @@ -826,27 +900,142 @@ def main(args: argparse.Namespace): tokenizer = get_tokenizer(tokenizer_id, tokenizer_mode=tokenizer_mode, trust_remote_code=args.trust_remote_code) - # TODO: This should be refactored to use the benchmark_dataset.py - # in later PRs. 
+ if args.dataset_name is None: raise ValueError( "Please specify '--dataset-name' and the corresponding " "'--dataset-path' if required.") - elif args.dataset_name == "random": - input_requests = sample_random_requests( - prefix_len=args.random_prefix_len, - input_len=args.random_input_len, - output_len=args.random_output_len, - num_prompts=args.num_prompts, - range_ratio=args.random_range_ratio, + + if args.dataset_name == "sonnet": + dataset = SonnetDataset(dataset_path=args.dataset_path) + # For the "sonnet" dataset, formatting depends on the backend. + if args.backend == "openai-chat": + input_requests = dataset.sample( + num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + return_prompt_formatted=False, + ) + else: + assert tokenizer.chat_template or tokenizer.default_chat_template, ( + "Tokenizer/model must have chat template for sonnet dataset.") + input_requests = dataset.sample( + num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + return_prompt_formatted=True, + ) + + elif args.dataset_name == "hf": + # all following datasets are implemented from the + # HuggingFaceDataset base class + if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: + dataset_class = VisionArenaDataset + args.hf_split = "train" + args.hf_subset = None + elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: + dataset_class = InstructCoderDataset + args.hf_split = "train" + elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS: + dataset_class = MTBenchDataset + args.hf_split = "train" + elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: + dataset_class = ConversationDataset + args.hf_split = "train" + elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: + dataset_class = AIMODataset + 
args.hf_split = "train" + elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS: # noqa: E501 + dataset_class = NextEditPredictionDataset + args.hf_split = "train" + elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS: + dataset_class = ASRDataset + args.hf_split = "train" + else: + supported_datasets = set([ + dataset_name for cls in HuggingFaceDataset.__subclasses__() + for dataset_name in cls.SUPPORTED_DATASET_PATHS + ]) + raise ValueError( + f"Unsupported dataset path: {args.dataset_path}. " + "Huggingface dataset only supports dataset_path" + f" from one of following: {supported_datasets}. " + "Please consider contributing if you would " + "like to add support for additional dataset formats.") + + if dataset_class.IS_MULTIMODAL and endpoint_type not in [ + "openai-chat", + "openai-audio", + ]: + # multi-modal benchmark is only available on OpenAI Chat backend. + raise ValueError( + "Multi-modal content is only supported on 'openai-chat' and " + "'openai-audio' backend.") + input_requests = dataset_class( + dataset_path=args.dataset_path, + dataset_subset=args.hf_subset, + dataset_split=args.hf_split, + random_seed=args.seed, + ).sample( + num_requests=args.num_prompts, tokenizer=tokenizer, + output_len=args.hf_output_len, ) else: - raise ValueError(f"Unknown dataset: {args.dataset_name}") + # For datasets that follow a similar structure, use a mapping. + dataset_mapping = { + "sharegpt": + lambda: ShareGPTDataset(random_seed=args.seed, + dataset_path=args.dataset_path).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + output_len=args.sharegpt_output_len, + ), + "burstgpt": + lambda: BurstGPTDataset(random_seed=args.seed, + dataset_path=args.dataset_path). 
+ sample(tokenizer=tokenizer, num_requests=args.num_prompts), + "random": + lambda: RandomDataset(dataset_path=args.dataset_path).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + prefix_len=args.random_prefix_len, + input_len=args.random_input_len, + output_len=args.random_output_len, + range_ratio=args.random_range_ratio, + ), + } + try: + input_requests = dataset_mapping[args.dataset_name]() + except KeyError as err: + raise ValueError(f"Unknown dataset: {args.dataset_name}") from err goodput_config_dict = check_goodput_args(args) + # Collect the sampling parameters. + sampling_params = { + k: v + for k, v in { + "top_p": args.top_p, + "top_k": args.top_k, + "min_p": args.min_p, + "temperature": args.temperature, + }.items() if v is not None + } + + # Sampling parameters are only supported by openai-compatible backend. + if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS: + raise ValueError("Sampling parameters are only supported by " + "openai-compatible backends.") + + if "temperature" not in sampling_params: + sampling_params["temperature"] = 0.0 # Default to greedy decoding. + # Avoid GC processing "static" data - reduce pause times. 
gc.collect() gc.freeze() @@ -861,7 +1050,6 @@ def main(args: argparse.Namespace): tokenizer=tokenizer, input_requests=input_requests, logprobs=args.logprobs, - best_of=args.best_of, request_rate=args.request_rate, burstiness=args.burstiness, disable_tqdm=args.disable_tqdm, @@ -874,10 +1062,11 @@ def main(args: argparse.Namespace): goodput_config_dict=goodput_config_dict, max_concurrency=args.max_concurrency, lora_modules=args.lora_modules, + extra_body=sampling_params, )) # Save config and results to json - if args.save_result: + if args.save_result or args.append_result: result_json: dict[str, Any] = {} # Setup @@ -887,7 +1076,6 @@ def main(args: argparse.Namespace): result_json["label"] = label result_json["model_id"] = model_id result_json["tokenizer_id"] = tokenizer_id - result_json["best_of"] = args.best_of result_json["num_prompts"] = args.num_prompts # Metadata @@ -910,6 +1098,21 @@ def main(args: argparse.Namespace): # Merge with benchmark result result_json = {**result_json, **benchmark_result} + if not args.save_detailed: + # Remove fields with too many data points + for field in [ + "input_lens", + "output_lens", + "ttfts", + "itls", + "generated_texts", + "errors", + ]: + if field in result_json: + del result_json[field] + if field in benchmark_result: + del benchmark_result[field] + # Save to file base_model_id = model_id.split("/")[-1] max_concurrency_str = (f"-concurrency{args.max_concurrency}" @@ -919,7 +1122,13 @@ def main(args: argparse.Namespace): if args.result_filename: file_name = args.result_filename if args.result_dir: + os.makedirs(args.result_dir, exist_ok=True) file_name = os.path.join(args.result_dir, file_name) - with open(file_name, "w", encoding='utf-8') as outfile: + with open(file_name, + mode="a+" if args.append_result else "w", + encoding="utf-8") as outfile: + # Append a newline. 
+ if args.append_result and outfile.tell() != 0: + outfile.write("\n") json.dump(result_json, outfile) save_to_pytorch_benchmark_format(args, result_json, file_name) diff --git a/vllm/collect_env.py b/vllm/collect_env.py index 85746b7ef606a..86eb465b8f658 100644 --- a/vllm/collect_env.py +++ b/vllm/collect_env.py @@ -815,4 +815,4 @@ def main(): if __name__ == '__main__': - main() + main() \ No newline at end of file diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 0c1381a565c16..b724479a95dee 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -6,23 +6,22 @@ import os import pprint import time from collections.abc import Sequence -from contextlib import ExitStack from typing import Any, Callable, Optional -from unittest.mock import patch import torch import torch.fx as fx +from torch._dispatch.python import enable_python_dispatcher import vllm.envs as envs from vllm.config import CompilationConfig, VllmConfig from vllm.logger import init_logger -from vllm.utils import weak_ref_tensors +from vllm.platforms import current_platform +from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname from .compiler_interface import (CompilerInterface, EagerAdaptor, InductorAdaptor, InductorStandaloneAdaptor) from .counter import compilation_counter from .inductor_pass import InductorPass -from .monitor import end_monitoring_torch_compile from .pass_manager import PostGradPassManager logger = init_logger(__name__) @@ -30,7 +29,8 @@ logger = init_logger(__name__) def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface: if compilation_config.use_inductor: - if envs.VLLM_TEST_STANDALONE_COMPILE: + if envs.VLLM_USE_STANDALONE_COMPILE and is_torch_equal_or_newer( + "2.8.0"): logger.info("Using InductorStandaloneAdaptor") return InductorStandaloneAdaptor() else: @@ -271,7 +271,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): self.fake_mode.from_tensor(t) if isinstance(t, 
torch.Tensor) else t for t in args ] - with self.fake_mode: + with self.fake_mode, enable_python_dispatcher(): return super().run(*fake_args) def call_module(self, target: torch.fx.node.Target, @@ -297,7 +297,9 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): num_graphs=len(self.compile_submod_names), runtime_shape=None) - self.module.__dict__[target] = PiecewiseBackend( + piecewise_backend = resolve_obj_by_qualname( + current_platform.get_piecewise_backend_cls()) + self.module.__dict__[target] = piecewise_backend( submod, self.vllm_config, self.graph_pool, index, len(self.compile_submod_names), sym_shape_indices, compiled_graph_for_general_shape, self.vllm_backend) @@ -341,7 +343,7 @@ class VllmBackend: ): global global_graph_pool if global_graph_pool is None: - global_graph_pool = torch.cuda.graph_pool_handle() + global_graph_pool = current_platform.graph_pool_handle() # TODO: in the future, if we want to use multiple # streams, it might not be safe to share a global pool. @@ -558,197 +560,3 @@ class VllmBackend: return self.split_gm(*list_args) return copy_and_call - - -@dataclasses.dataclass -class ConcreteSizeEntry: - runtime_shape: int - need_to_compile: bool # the size is in compile_sizes - use_cudagraph: bool # the size is in cudagraph_capture_sizes - - compiled: bool = False - runnable: Callable = None # type: ignore - num_finished_warmup: int = 0 - cudagraph: Optional[torch.cuda.CUDAGraph] = None - output: Optional[Any] = None - - # for cudagraph debugging, track the input addresses - # during capture, and check if they are the same during replay - input_addresses: Optional[list[int]] = None - - -class PiecewiseBackend: - - def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, - graph_pool: Any, piecewise_compile_index: int, - total_piecewise_compiles: int, sym_shape_indices: list[int], - compiled_graph_for_general_shape: Callable, - vllm_backend: VllmBackend): - """ - The backend for piecewise compilation. 
- It mainly handles the compilation and cudagraph capturing. - - We will compile `self.graph` once for the general shape, - and then compile for different shapes specified in - `compilation_config.compile_sizes`. - - Independently, we will capture cudagraph for different shapes. - - If a shape needs both compilation and cudagraph, we will - compile it first, and then capture cudagraph. - """ - self.graph = graph - self.vllm_config = vllm_config - self.compilation_config = vllm_config.compilation_config - self.graph_pool = graph_pool - self.piecewise_compile_index = piecewise_compile_index - self.total_piecewise_compiles = total_piecewise_compiles - self.vllm_backend = vllm_backend - - self.is_first_graph = piecewise_compile_index == 0 - self.is_last_graph = ( - piecewise_compile_index == total_piecewise_compiles - 1) - - self.compile_sizes: set[int] = set( - self.compilation_config.compile_sizes) - self.cudagraph_capture_sizes: set[int] = set( - self.compilation_config.cudagraph_capture_sizes - ) if self.compilation_config.use_cudagraph else set() - - self.first_run_finished = False - - self.compiled_graph_for_general_shape = compiled_graph_for_general_shape # noqa - - self.sym_shape_indices = sym_shape_indices - - self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG" - - # the entries for different shapes that we need to either - # compile or capture cudagraph - self.concrete_size_entries: dict[int, ConcreteSizeEntry] = {} - - # to_be_compiled_sizes tracks the remaining sizes to compile, - # and updates during the compilation process, so we need to copy it - self.to_be_compiled_sizes: set[int] = self.compile_sizes.copy() - for shape in self.compile_sizes.union(self.cudagraph_capture_sizes): - self.concrete_size_entries[shape] = ConcreteSizeEntry( - runtime_shape=shape, - need_to_compile=shape in self.compile_sizes, - use_cudagraph=shape in self.cudagraph_capture_sizes, - ) - - def check_for_ending_compilation(self): - if self.is_last_graph and not 
self.to_be_compiled_sizes: - # no specific sizes to compile - # save the hash of the inductor graph for the next run - self.vllm_backend.compiler_manager.save_to_file() - end_monitoring_torch_compile(self.vllm_config) - - def __call__(self, *args) -> Any: - if not self.first_run_finished: - self.first_run_finished = True - self.check_for_ending_compilation() - return self.compiled_graph_for_general_shape(*args) - - runtime_shape = args[self.sym_shape_indices[0]] - if runtime_shape not in self.concrete_size_entries: - # we don't need to do anything for this shape - return self.compiled_graph_for_general_shape(*args) - - entry = self.concrete_size_entries[runtime_shape] - - if entry.runnable is None: - entry.runnable = self.compiled_graph_for_general_shape - - if entry.need_to_compile and not entry.compiled: - entry.compiled = True - self.to_be_compiled_sizes.remove(runtime_shape) - # args are real arguments - entry.runnable = self.vllm_backend.compiler_manager.compile( - self.graph, - args, - self.compilation_config.inductor_compile_config, - self.compilation_config, - graph_index=self.piecewise_compile_index, - num_graphs=self.total_piecewise_compiles, - runtime_shape=runtime_shape) - - # finished compilations for all required shapes - if self.is_last_graph and not self.to_be_compiled_sizes: - self.check_for_ending_compilation() - - if not entry.use_cudagraph: - return entry.runnable(*args) - - if entry.cudagraph is None: - if entry.num_finished_warmup < self.compilation_config.cudagraph_num_of_warmups: # noqa - entry.num_finished_warmup += 1 - if self.is_first_graph: - logger.debug( - "Warming up %s/%s for shape %s", - entry.num_finished_warmup, - self.compilation_config.cudagraph_num_of_warmups, - runtime_shape) - return entry.runnable(*args) - - if self.is_first_graph: - # Since we capture cudagraph for many different shapes and - # capturing is fast, we don't need to log it for every shape. - # We only log it in the debug mode. 
- logger.debug("Capturing a cudagraph for shape %s", - runtime_shape) - - input_addresses = [ - x.data_ptr() for x in args if isinstance(x, torch.Tensor) - ] - entry.input_addresses = input_addresses - cudagraph = torch.cuda.CUDAGraph() - - with ExitStack() as stack: - if not self.is_first_graph: - # during every model forward, we will capture - # many pieces of cudagraphs (roughly one per layer). - # running gc again and again across layers will - # make the cudagraph capture very slow. - # therefore, we only run gc for the first graph, - # and disable gc for the rest of the graphs. - stack.enter_context(patch("gc.collect", lambda: None)) - stack.enter_context( - patch("torch.cuda.empty_cache", lambda: None)) - - # mind-exploding: carefully manage the reference and memory. - with torch.cuda.graph(cudagraph, pool=self.graph_pool): - # `output` is managed by pytorch's cudagraph pool - output = entry.runnable(*args) - if self.is_last_graph: - # by converting it to weak ref, - # the original `output` will immediately be released - # to save memory. It is only safe to do this for - # the last graph, because the output of the last graph - # will not be used by any other cuda graph. - output = weak_ref_tensors(output) - - # here we always use weak ref for the output - # to save memory - entry.output = weak_ref_tensors(output) - entry.cudagraph = cudagraph - - compilation_counter.num_cudagraph_caputured += 1 - - # important: we need to return the output, rather than - # the weak ref of the output, so that pytorch can correctly - # manage the memory during cuda graph capture - return output - - if self.is_debugging_mode: - # check if the input addresses are the same - new_input_addresses = [ - x.data_ptr() for x in args if isinstance(x, torch.Tensor) - ] - assert new_input_addresses == entry.input_addresses, ( - "Input addresses for cudagraphs are different during replay." 
- f" Expected {entry.input_addresses}, got {new_input_addresses}" - ) - - entry.cudagraph.replay() - return entry.output diff --git a/vllm/compilation/base_piecewise_backend.py b/vllm/compilation/base_piecewise_backend.py new file mode 100644 index 0000000000000..84d1e1f77739e --- /dev/null +++ b/vllm/compilation/base_piecewise_backend.py @@ -0,0 +1,71 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Callable, Protocol + +import torch.fx as fx + +from vllm.compilation.backends import VllmBackend +from vllm.config import VllmConfig + + +class AbstractPiecewiseBackend(Protocol): + """ + PiecewiseBackend interface that allows platforms to extend + piecewise static graph. + """ + + def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, + graph_pool: Any, piecewise_compile_index: int, + total_piecewise_compiles: int, sym_shape_indices: list[int], + compiled_graph_for_general_shape: Callable, + vllm_backend: VllmBackend, **kwargs): + """ + Initializes the PiecewiseBackend class with compilation and + execution-related configurations. + + This class handles piecewise compilation, graph capturing, + and dispatching for specific input shapes. + + Args: + graph (fx.GraphModule): The graph represented in fx. + vllm_config (VllmConfig): Global configuration for vLLM. + graph_pool (Any): + Graph memory pool handle, e.g., + `torch.cuda.graph_pool_handle()`. + piecewise_compile_index (int): + Index of the current piecewise subgraph. + total_piecewise_compiles (int): + Total number of piecewise-compiled graphs. + sym_shape_indices (list[int]): + Indices of symbolic shape. + compiled_graph_for_general_shape (Callable): + Callable that executes the graph compiled for general shapes. + vllm_backend (VllmBackend): + Backend compiler that manages compilation and graph runtime + for vLLM. + + Keyword Args: + kwargs: Additional keyword arguments reserved for future + extensions or custom platforms. 
+ """ + raise NotImplementedError + + def __call__(self, *args) -> Any: + """Executes the compiled graph for given input args. + + If this is the first invocation, executes the general compiled graph + and initiates the compilation process tracking. For subsequent calls, + dynamically dispatches execution to either a compiled graph or a static + graph based on the input shape. + + Args: + *args: Variable length input arguments to be passed into the + graph. The symbolic shape is expected to be in position + `sym_shape_indices[0]`. + + Returns: + Any: Output of the executed graph. This can be from the general + compiled graph, a specialized compiled version for the given shape, + or a replayed static graph. + """ + raise NotImplementedError diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py new file mode 100644 index 0000000000000..f651ee6912abb --- /dev/null +++ b/vllm/compilation/collective_fusion.py @@ -0,0 +1,126 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional + +import torch +import torch._inductor.pattern_matcher as pm +import torch.fx as fx +from torch._inductor.pattern_matcher import PatternMatcherPass +from torch.distributed._symmetric_memory import enable_symm_mem_for_group + +from vllm.config import VllmConfig +from vllm.distributed import get_tp_group +from vllm.distributed.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.logger import init_logger + +from .vllm_inductor_pass import VllmInductorPass + +logger = init_logger(__name__) + + +class BasePattern: + + def __init__(self, dtype: torch.dtype, device: str): + self.dtype = dtype + self.device = device + self.tp = get_tp_group() + self.tp_size = get_tensor_model_parallel_world_size() + + +class GEMMReduceScatterPattern(BasePattern): + + def get_inputs(self): + mul = torch.empty([16, 4], device=self.device, dtype=self.dtype) + mm_weight = torch.empty([4, 4], device=self.device, dtype=self.dtype) + return [mul, 
mm_weight] + + def register(self, pm_pass: PatternMatcherPass): + + def pattern(mul: torch.Tensor, mm_weight: torch.Tensor): + mm = torch.ops.aten.mm.default(mul, mm_weight) + reduce_scatter = torch.ops.vllm.reduce_scatter.default( + mm, + dim=0, + world_size=self.tp_size, + group_name=self.tp.unique_name) + return reduce_scatter + + def replacement(mul: torch.Tensor, mm_weight: torch.Tensor): + gemm_rs = torch.ops.symm_mem.fused_matmul_reduce_scatter( + mul, + mm_weight, + "avg", + scatter_dim=0, + group_name=self.tp.device_group.group_name, + ) + + return gemm_rs + + pm.register_replacement(pattern, replacement, self.get_inputs(), + pm.fwd_only, pm_pass) + + +class AllGatherGEMMPattern(BasePattern): + + def get_inputs(self): + x = torch.empty([4, 4], device=self.device, dtype=self.dtype) + weight = torch.empty([4, 4], device=self.device, dtype=self.dtype) + + return [x, weight] + + def register(self, pm_pass: PatternMatcherPass): + + def pattern( + x: torch.Tensor, + weight: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + all_gather = torch.ops.vllm.all_gather.default( + x, + dim=0, + world_size=self.tp_size, + group_name=self.tp.unique_name) + + return torch.ops.aten.mm.default(all_gather, weight) + + def replacement( + x: torch.Tensor, + weight: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + ag_output, mm_outputs = torch.ops.symm_mem.fused_all_gather_matmul( + x, + [weight], + gather_dim=0, + group_name=self.tp.device_group.group_name, + ) + return mm_outputs + + pm.register_replacement(pattern, replacement, self.get_inputs(), + pm.fwd_only, pm_pass) + + +class AsyncTPPass(VllmInductorPass): + + def __init__(self, config: VllmConfig): + super().__init__(config) + + # Enable symmetric memory for the TP process group + enable_symm_mem_for_group(get_tp_group().device_group.group_name) + self.patterns: PatternMatcherPass = PatternMatcherPass( + pass_name="async_tp_pass") + GEMMReduceScatterPattern(self.model_dtype, + 
self.device).register(self.patterns) + + AllGatherGEMMPattern(self.model_dtype, + self.device).register(self.patterns) + + def is_applicable_for_shape(self, shape: Optional[int]) -> bool: + # only do replace for specific shapes + tp_size = get_tensor_model_parallel_world_size() + return shape is not None and shape % tp_size == 0 + + def __call__(self, graph: fx.Graph): + self.begin() + self.dump_graph(graph, "before_async_tp_pass") + count = self.patterns.apply(graph) + logger.debug("Replaced %s patterns", count) + self.dump_graph(graph, "after_async_tp_pass") + self.end_and_log() diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 89a131e8ea24a..9293610cc2469 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -12,6 +12,7 @@ import torch._inductor.compile_fx import torch.fx as fx import vllm.envs as envs +from vllm.compilation.counter import compilation_counter from vllm.config import VllmConfig from vllm.utils import is_torch_equal_or_newer @@ -39,7 +40,8 @@ class CompilerInterface: Gather all the relevant information from the vLLM config, to compute a hash so that we can cache the compiled model. - See {meth}`VllmConfig.compute_hash` to check what information + See [`VllmConfig.compute_hash`][vllm.config.VllmConfig.compute_hash] + to check what information is already considered by default. This function should only consider the information that is specific to the compiler. """ @@ -153,7 +155,7 @@ class InductorStandaloneAdaptor(CompilerInterface): This is not on by default yet, but we plan to turn it on by default for PyTorch 2.8. - Use VLLM_TEST_STANDALONE_COMPILE to toggle this on or off. + Use VLLM_USE_STANDALONE_COMPILE to toggle this on or off. 
""" name = "inductor_standalone" @@ -174,6 +176,7 @@ class InductorStandaloneAdaptor(CompilerInterface): runtime_shape: Optional[int] = None, key: Optional[str] = None, ) -> tuple[Optional[Callable], Optional[Any]]: + compilation_counter.num_inductor_compiles += 1 current_config = {} if compiler_config is not None: current_config.update(compiler_config) @@ -261,6 +264,7 @@ class InductorAdaptor(CompilerInterface): runtime_shape: Optional[int] = None, key: Optional[str] = None, ) -> tuple[Optional[Callable], Optional[Any]]: + compilation_counter.num_inductor_compiles += 1 from torch._inductor.compile_fx import compile_fx current_config = {} if compiler_config is not None: @@ -411,8 +415,14 @@ class InductorAdaptor(CompilerInterface): # compilation cache. So turn off the checks if we disable the # compilation cache. if not envs.VLLM_DISABLE_COMPILE_CACHE: - assert hash_str is not None, ( - "failed to get the hash of the compiled graph") + if hash_str is None: + raise RuntimeError( + "vLLM failed to compile the model. The most " + "likely reason for this is that a previous compilation " + "failed, leading to a corrupted compilation artifact. " + "We recommend trying to " + "remove ~/.cache/vllm/torch_compile_cache and try again " + "to see the real issue. ") assert file_path is not None, ( "failed to get the file path of the compiled graph") return compiled_graph, (hash_str, file_path) @@ -527,6 +537,7 @@ class EagerAdaptor(CompilerInterface): runtime_shape: Optional[int] = None, key: Optional[str] = None, ) -> tuple[Optional[Callable], Optional[Any]]: + compilation_counter.num_eager_compiles += 1 # we don't need to compile the graph, just return the graph itself. # It does not support caching, return None for the handle. 
return graph, None diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py index 5be452593c620..2200671b8848b 100644 --- a/vllm/compilation/counter.py +++ b/vllm/compilation/counter.py @@ -15,6 +15,10 @@ class CompilationCounter: num_piecewise_capturable_graphs_seen: int = 0 num_backend_compilations: int = 0 num_cudagraph_caputured: int = 0 + # InductorAdaptor.compile / InductorStandaloneAdaptor.compile calls + num_inductor_compiles: int = 0 + # EagerAdaptor.compile calls + num_eager_compiles: int = 0 def clone(self) -> "CompilationCounter": return copy.deepcopy(self) diff --git a/vllm/compilation/cuda_piecewise_backend.py b/vllm/compilation/cuda_piecewise_backend.py new file mode 100644 index 0000000000000..0ad480e28cd70 --- /dev/null +++ b/vllm/compilation/cuda_piecewise_backend.py @@ -0,0 +1,213 @@ +# SPDX-License-Identifier: Apache-2.0 + +import dataclasses +from contextlib import ExitStack +from typing import Any, Callable, Optional +from unittest.mock import patch + +import torch +import torch.fx as fx + +import vllm.envs as envs +from vllm.compilation.backends import VllmBackend +from vllm.compilation.counter import compilation_counter +from vllm.compilation.monitor import end_monitoring_torch_compile +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.utils import weak_ref_tensors + +logger = init_logger(__name__) + + +@dataclasses.dataclass +class ConcreteSizeEntry: + runtime_shape: int + need_to_compile: bool # the size is in compile_sizes + use_cudagraph: bool # the size is in cudagraph_capture_sizes + + compiled: bool = False + runnable: Callable = None # type: ignore + num_finished_warmup: int = 0 + cudagraph: Optional[torch.cuda.CUDAGraph] = None + output: Optional[Any] = None + + # for cudagraph debugging, track the input addresses + # during capture, and check if they are the same during replay + input_addresses: Optional[list[int]] = None + + +class CUDAPiecewiseBackend: + + def __init__(self, graph: fx.GraphModule, vllm_config: 
VllmConfig, + graph_pool: Any, piecewise_compile_index: int, + total_piecewise_compiles: int, sym_shape_indices: list[int], + compiled_graph_for_general_shape: Callable, + vllm_backend: VllmBackend): + """ + The backend for piecewise compilation. + It mainly handles the compilation and cudagraph capturing. + + We will compile `self.graph` once for the general shape, + and then compile for different shapes specified in + `compilation_config.compile_sizes`. + + Independently, we will capture cudagraph for different shapes. + + If a shape needs both compilation and cudagraph, we will + compile it first, and then capture cudagraph. + """ + self.graph = graph + self.vllm_config = vllm_config + self.compilation_config = vllm_config.compilation_config + self.graph_pool = graph_pool + self.piecewise_compile_index = piecewise_compile_index + self.total_piecewise_compiles = total_piecewise_compiles + self.vllm_backend = vllm_backend + + self.is_first_graph = piecewise_compile_index == 0 + self.is_last_graph = ( + piecewise_compile_index == total_piecewise_compiles - 1) + + self.compile_sizes: set[int] = set( + self.compilation_config.compile_sizes) + self.cudagraph_capture_sizes: set[int] = set( + self.compilation_config.cudagraph_capture_sizes + ) if self.compilation_config.use_cudagraph else set() + + self.first_run_finished = False + + self.compiled_graph_for_general_shape = compiled_graph_for_general_shape # noqa + + self.sym_shape_indices = sym_shape_indices + + self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG" + + # the entries for different shapes that we need to either + # compile or capture cudagraph + self.concrete_size_entries: dict[int, ConcreteSizeEntry] = {} + + # to_be_compiled_sizes tracks the remaining sizes to compile, + # and updates during the compilation process, so we need to copy it + self.to_be_compiled_sizes: set[int] = self.compile_sizes.copy() + for shape in self.compile_sizes.union(self.cudagraph_capture_sizes): + 
self.concrete_size_entries[shape] = ConcreteSizeEntry( + runtime_shape=shape, + need_to_compile=shape in self.compile_sizes, + use_cudagraph=shape in self.cudagraph_capture_sizes, + ) + + def check_for_ending_compilation(self): + if self.is_last_graph and not self.to_be_compiled_sizes: + # no specific sizes to compile + # save the hash of the inductor graph for the next run + self.vllm_backend.compiler_manager.save_to_file() + end_monitoring_torch_compile(self.vllm_config) + + def __call__(self, *args) -> Any: + if not self.first_run_finished: + self.first_run_finished = True + self.check_for_ending_compilation() + return self.compiled_graph_for_general_shape(*args) + + runtime_shape = args[self.sym_shape_indices[0]] + if runtime_shape not in self.concrete_size_entries: + # we don't need to do anything for this shape + return self.compiled_graph_for_general_shape(*args) + + entry = self.concrete_size_entries[runtime_shape] + + if entry.runnable is None: + entry.runnable = self.compiled_graph_for_general_shape + + if entry.need_to_compile and not entry.compiled: + entry.compiled = True + self.to_be_compiled_sizes.remove(runtime_shape) + # args are real arguments + entry.runnable = self.vllm_backend.compiler_manager.compile( + self.graph, + args, + self.compilation_config.inductor_compile_config, + self.compilation_config, + graph_index=self.piecewise_compile_index, + num_graphs=self.total_piecewise_compiles, + runtime_shape=runtime_shape) + + # finished compilations for all required shapes + if self.is_last_graph and not self.to_be_compiled_sizes: + self.check_for_ending_compilation() + + if not entry.use_cudagraph: + return entry.runnable(*args) + + if entry.cudagraph is None: + if entry.num_finished_warmup < self.compilation_config.cudagraph_num_of_warmups: # noqa + entry.num_finished_warmup += 1 + if self.is_first_graph: + logger.debug( + "Warming up %s/%s for shape %s", + entry.num_finished_warmup, + self.compilation_config.cudagraph_num_of_warmups, + 
runtime_shape) + return entry.runnable(*args) + + if self.is_first_graph: + # Since we capture cudagraph for many different shapes and + # capturing is fast, we don't need to log it for every shape. + # We only log it in the debug mode. + logger.debug("Capturing a cudagraph for shape %s", + runtime_shape) + + input_addresses = [ + x.data_ptr() for x in args if isinstance(x, torch.Tensor) + ] + entry.input_addresses = input_addresses + cudagraph = torch.cuda.CUDAGraph() + + with ExitStack() as stack: + if not self.is_first_graph: + # during every model forward, we will capture + # many pieces of cudagraphs (roughly one per layer). + # running gc again and again across layers will + # make the cudagraph capture very slow. + # therefore, we only run gc for the first graph, + # and disable gc for the rest of the graphs. + stack.enter_context(patch("gc.collect", lambda: None)) + stack.enter_context( + patch("torch.cuda.empty_cache", lambda: None)) + + # mind-exploding: carefully manage the reference and memory. + with torch.cuda.graph(cudagraph, pool=self.graph_pool): + # `output` is managed by pytorch's cudagraph pool + output = entry.runnable(*args) + if self.is_last_graph: + # by converting it to weak ref, + # the original `output` will immediately be released + # to save memory. It is only safe to do this for + # the last graph, because the output of the last graph + # will not be used by any other cuda graph. 
+ output = weak_ref_tensors(output) + + # here we always use weak ref for the output + # to save memory + entry.output = weak_ref_tensors(output) + entry.cudagraph = cudagraph + + compilation_counter.num_cudagraph_caputured += 1 + + # important: we need to return the output, rather than + # the weak ref of the output, so that pytorch can correctly + # manage the memory during cuda graph capture + return output + + if self.is_debugging_mode: + # check if the input addresses are the same + new_input_addresses = [ + x.data_ptr() for x in args if isinstance(x, torch.Tensor) + ] + assert new_input_addresses == entry.input_addresses, ( + "Input addresses for cudagraphs are different during replay." + f" Expected {entry.input_addresses}, got {new_input_addresses}" + ) + + entry.cudagraph.replay() + return entry.output diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index f4d3fd9b457fc..07ebd3e1b7dde 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -6,6 +6,7 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from .activation_quant_fusion import ActivationQuantFusionPass +from .collective_fusion import AsyncTPPass from .fix_functionalization import FixFunctionalizationPass from .fusion import FusionPass from .inductor_pass import CustomGraphPass, InductorPass, get_pass_context @@ -54,6 +55,8 @@ class PostGradPassManager(CustomGraphPass): if self.pass_config.enable_sequence_parallelism: self.passes += [SequenceParallelismPass(config)] + if self.pass_config.enable_async_tp: + self.passes += [AsyncTPPass(config)] self.fix_functionalization = FixFunctionalizationPass(config) diff --git a/vllm/compilation/sequence_parallelism.py b/vllm/compilation/sequence_parallelism.py index f0476bfcb65af..17dded87fe8dc 100644 --- a/vllm/compilation/sequence_parallelism.py +++ b/vllm/compilation/sequence_parallelism.py @@ -243,24 +243,25 @@ class SequenceParallelismPass(VllmInductorPass): 
pass_name="sequence_parallelism_pass") for epsilon in [1e-5, 1e-6]: EmbeddingAllReduceRMSNormPattern( - epsilon, self.dtype, self.device).register(self.patterns) + epsilon, self.model_dtype, self.device).register(self.patterns) - MiddleAllReduceRMSNormPattern(epsilon, self.dtype, + MiddleAllReduceRMSNormPattern(epsilon, self.model_dtype, self.device).register(self.patterns) - LastAllReduceRMSNormPattern(epsilon, self.dtype, + LastAllReduceRMSNormPattern(epsilon, self.model_dtype, self.device).register(self.patterns) # WARNING: This is a hack to clear the pattern matcher cache # and allow multiple values of epsilon. torch._inductor.pattern_matcher._seen_patterns.clear() def is_applicable_for_shape(self, shape: Optional[int]) -> bool: - # only do replace for specific shapes tp_size = get_tensor_model_parallel_world_size() return shape is not None and shape % tp_size == 0 def __call__(self, graph: fx.Graph): + self.begin() self.dump_graph(graph, "before_sequence_parallelism_pass") count = self.patterns.apply(graph) logger.debug("Replaced %s patterns", count) self.dump_graph(graph, "after_sequence_parallelism_pass") + self.end_and_log() diff --git a/vllm/compilation/vllm_inductor_pass.py b/vllm/compilation/vllm_inductor_pass.py index c95e0bce5f2e1..0fe73b72b1dee 100644 --- a/vllm/compilation/vllm_inductor_pass.py +++ b/vllm/compilation/vllm_inductor_pass.py @@ -26,7 +26,8 @@ class VllmInductorPass(InductorPass): def __init__(self, config: VllmConfig): self.pass_config = config.compilation_config.pass_config - self.dtype = config.model_config.dtype if config.model_config else None + self.model_dtype = config.model_config.dtype if config.model_config \ + else None self.device = config.device_config.device if config.device_config \ else None self.pass_name = self.__class__.__name__ diff --git a/vllm/config.py b/vllm/config.py index 5382e9a16829d..d0891d670b76d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -6,24 +6,28 @@ import enum import hashlib import inspect 
import json -import re import textwrap import uuid import warnings from collections import Counter from contextlib import contextmanager -from dataclasses import (MISSING, Field, asdict, dataclass, field, fields, - is_dataclass, replace) +from dataclasses import (MISSING, Field, asdict, field, fields, is_dataclass, + replace) from functools import cached_property from importlib.util import find_spec from pathlib import Path from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Literal, Optional, Protocol, TypeVar, Union, cast, get_args, get_origin) +import regex as re import torch +from pydantic import (ConfigDict, SkipValidation, TypeAdapter, field_validator, + model_validator) +from pydantic.dataclasses import dataclass +from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE from torch.distributed import ProcessGroup, ReduceOp from transformers import PretrainedConfig -from typing_extensions import deprecated +from typing_extensions import deprecated, runtime_checkable import vllm.envs as envs from vllm import version @@ -39,12 +43,16 @@ from vllm.transformers_utils.config import ( ConfigFormat, get_config, get_hf_image_processor_config, get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, - try_get_generation_config, uses_mrope) + try_get_generation_config, try_get_safetensors_metadata, uses_mrope) from vllm.transformers_utils.s3_utils import S3Model from vllm.transformers_utils.utils import is_s3, maybe_model_redirect -from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless, - get_cpu_memory, get_open_port, is_torch_equal_or_newer, - random_uuid, resolve_obj_by_qualname) +from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, + MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, + POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes, + LayerBlockType, common_broadcastable_dtype, + cuda_device_count_stateless, get_cpu_memory, + get_open_port, is_torch_equal_or_newer, random_uuid, + 
resolve_obj_by_qualname) if TYPE_CHECKING: from _typeshed import DataclassInstance @@ -54,22 +62,21 @@ if TYPE_CHECKING: from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.model_loader import BaseModelLoader + from vllm.model_executor.model_loader.tensorizer import TensorizerConfig ConfigType = type[DataclassInstance] else: + PlacementGroup = Any + ExecutorBase = Any QuantizationConfig = Any + BaseModelLoader = Any + TensorizerConfig = Any ConfigType = type logger = init_logger(__name__) ConfigT = TypeVar("ConfigT", bound=ConfigType) -# This value is chosen to have a balance between ITL and TTFT. Note it is -# not optimized for throughput. -_DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048 -_POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768 -_MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120 - TaskOption = Literal["auto", "generate", "embedding", "embed", "classify", "score", "reward", "transcription"] @@ -95,6 +102,7 @@ HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig], PretrainedConfig]] +@runtime_checkable class SupportsHash(Protocol): def compute_hash(self) -> str: @@ -226,7 +234,7 @@ ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] @config -@dataclass +@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) class ModelConfig: """Configuration for the model.""" @@ -239,7 +247,7 @@ class ModelConfig: task, even if the same model can be used for multiple tasks. When the model only supports one task, "auto" can be used to select it; otherwise, you must specify explicitly which task to use.""" - tokenizer: str = None # type: ignore + tokenizer: SkipValidation[str] = None # type: ignore """Name or path of the Hugging Face tokenizer to use. If unspecified, model name or path will be used.""" tokenizer_mode: TokenizerMode = "auto" @@ -287,7 +295,7 @@ class ModelConfig: """The specific revision to use for the tokenizer on the Hugging Face Hub. 
It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.""" - max_model_len: int = None # type: ignore + max_model_len: SkipValidation[int] = None # type: ignore """Model context length (prompt and output). If unspecified, will be automatically derived from the model config. @@ -298,7 +306,7 @@ class ModelConfig: - 25.6k -> 25,600""" spec_target_max_model_len: Optional[int] = None """Specify the maximum length for spec decoding draft models.""" - quantization: Optional[QuantizationMethods] = None + quantization: SkipValidation[Optional[QuantizationMethods]] = None """Method used to quantize the weights. If `None`, we first check the `quantization_config` attribute in the model config file. If that is `None`, we assume the model weights are not quantized and use `dtype` to @@ -374,7 +382,7 @@ class ModelConfig: """Initialize non-default neuron config or override default neuron config that are specific to Neuron devices, this argument will be used to configure the neuron config that can not be gathered from the vllm - arguments. e.g. `{"cast_logits_dtype": "bloat16"}`.""" + arguments. e.g. 
`{"cast_logits_dtype": "bfloat16"}`.""" pooler_config: Optional["PoolerConfig"] = field(init=False) """Pooler config which controls the behaviour of output pooling in pooling models.""" @@ -534,15 +542,38 @@ class ModelConfig: self.encoder_config = self._get_encoder_config() self.hf_image_processor_config = get_hf_image_processor_config( self.model, hf_token=self.hf_token, revision=self.revision) - self.dtype = _get_and_verify_dtype(self.hf_config, self.dtype) - interleaved_attn_models = ["gemma2", "gemma3_text", "cohere2"] + supported_tasks, task = self._resolve_task(self.task) + self.supported_tasks = supported_tasks + self.task = task + if self.task in ("draft", "generate"): + self.truncation_side = "left" + else: + self.truncation_side = "right" + + self.pooler_config = self._init_pooler_config() + + self.dtype = _get_and_verify_dtype( + self.model, + self.hf_config, + self.dtype, + is_pooling_model=self.runner_type == "pooling", + revision=self.revision, + ) + + # Workaround for Gemma 2 which uses interleaved sliding window + # attention, but it's not specified in its config. TODO: remove this + # when Gemma 2 is fixed in Transformers. 
+ if self.hf_text_config.model_type == "gemma2": + self.hf_text_config.sliding_window_pattern = 2 + sliding_window = getattr(self.hf_text_config, "sliding_window", None) - has_interleaved_attention = (sliding_window is not None) and ( - isinstance(sliding_window, list) or - (self.hf_text_config.model_type in interleaved_attn_models)) + sliding_window_pattern = getattr(self.hf_text_config, + "sliding_window_pattern", None) + has_interleaved_attention = sliding_window_pattern is not None or ( + isinstance(sliding_window, list)) - if (not self.disable_sliding_window and has_interleaved_attention): + if not self.disable_sliding_window and has_interleaved_attention: if (backend := envs.VLLM_ATTENTION_BACKEND) in ("XFORMERS", "FLASHINFER"): sliding_window_len_min = get_min_sliding_window( @@ -562,16 +593,14 @@ class ModelConfig: # only the attention layer itself is aware of the sliding # window, and use the window size to compute the attention. self.hf_text_config.interleaved_sliding_window = sliding_window - delattr(self.hf_text_config, "sliding_window") + + if hasattr(self.hf_text_config, "sliding_window"): + delattr(self.hf_text_config, "sliding_window") + sliding_window = None - self.max_model_len = _get_and_verify_max_len( - hf_config=self.hf_text_config, - max_model_len=self.max_model_len, - disable_sliding_window=self.disable_sliding_window, - sliding_window_len=self.get_hf_config_sliding_window(), - spec_target_max_model_len=self.spec_target_max_model_len, - encoder_config=self.encoder_config) + self.original_max_model_len = self.max_model_len + self.max_model_len = self.get_and_verify_max_len(self.max_model_len) self.served_model_name = get_served_model_name(self.model, self.served_model_name) self.multimodal_config = self._init_multimodal_config() @@ -587,20 +616,26 @@ class ModelConfig: raise ValueError( "`override_neuron_config` is only supported on Neuron.") - supported_tasks, task = self._resolve_task(self.task) - self.supported_tasks = supported_tasks - 
self.task = task - if self.task in ("draft", "generate"): - self.truncation_side = "left" - else: - self.truncation_side = "right" - - self.pooler_config = self._init_pooler_config() - self._verify_quantization() self._verify_cuda_graph() self._verify_bnb_config() + @field_validator("quantization", mode="before") + @classmethod + def validate_quantization_before(cls, value: Any) -> Any: + if isinstance(value, str): + return value.lower() + return value + + @model_validator(mode="after") + def validate_model_config_after(self: "ModelConfig") -> "ModelConfig": + if not isinstance(self.tokenizer, str): + raise ValueError("tokenizer must be a string after __post_init__.") + if not isinstance(self.max_model_len, int): + raise ValueError( + "max_model_len must be an integer after __post_init__.") + return self + @property def registry(self): return ModelRegistry @@ -666,7 +701,6 @@ class ModelConfig: self.model, self.revision) def _init_pooler_config(self) -> Optional["PoolerConfig"]: - if self.runner_type == "pooling": if isinstance(self.override_pooler_config, dict): self.override_pooler_config = PoolerConfig( @@ -790,17 +824,12 @@ class ModelConfig: else: # Aliases if task_option == "embedding": - preferred_task = self._get_preferred_task( - architectures, supported_tasks) - if preferred_task != "embed": - msg = ("The 'embedding' task will be restricted to " - "embedding models in a future release. Please " - "pass `--task classify`, `--task score`, or " - "`--task reward` explicitly for other pooling " - "models.") - warnings.warn(msg, DeprecationWarning, stacklevel=2) + msg = ("The 'embedding' task has been renamed to " + "'embed', please use the new name. 
The old name " + "will be removed in v1.0.") + warnings.warn(msg, DeprecationWarning, stacklevel=2) - task_option = preferred_task or "embed" + task_option = "embed" if task_option not in supported_tasks: msg = ( @@ -827,8 +856,7 @@ class ModelConfig: "quark", "modelopt_fp4", "bitblas", "gptq_bitblas" ] if self.quantization is not None: - self.quantization = cast(QuantizationMethods, - self.quantization.lower()) + self.quantization = cast(QuantizationMethods, self.quantization) # Parse quantization method from the HF model config, if available. quant_cfg = self._parse_quant_hf_config() @@ -987,7 +1015,7 @@ class ModelConfig: self.use_async_output_proc = False return - # Reminder: Please update docs/source/features/compatibility_matrix.md + # Reminder: Please update docs/features/compatibility_matrix.md # If the feature combo become valid from vllm.platforms import current_platform if not current_platform.is_async_output_supported(self.enforce_eager): @@ -1003,7 +1031,7 @@ class ModelConfig: if self.runner_type == "pooling": self.use_async_output_proc = False - # Reminder: Please update docs/source/features/compatibility_matrix.md + # Reminder: Please update docs/features/compatibility_matrix.md # If the feature combo become valid if speculative_config: self.use_async_output_proc = False @@ -1340,6 +1368,16 @@ class ModelConfig: @property def is_encoder_decoder(self) -> bool: """Extract the HF encoder/decoder model flag.""" + """ + For Mllama, VLLM overrides HF's is_encoder_decoder flag and sets it to + True to enable cross-attention + Neuron needs all multimodal data to be in the decoder and does not + need to explicitly enable cross-attention + """ + if (current_platform.is_neuron() + and self.hf_config.model_type == "mllama"): + return False + return is_encoder_decoder(self.hf_config) @property @@ -1380,6 +1418,16 @@ class ModelConfig: def matryoshka_dimensions(self): return getattr(self.hf_config, "matryoshka_dimensions", None) + def get_and_verify_max_len(self, 
max_model_len: int): + max_model_len = _get_and_verify_max_len( + hf_config=self.hf_text_config, + max_model_len=max_model_len, + disable_sliding_window=self.disable_sliding_window, + sliding_window_len=self.get_hf_config_sliding_window(), + spec_target_max_model_len=self.spec_target_max_model_len, + encoder_config=self.encoder_config) + return max_model_len + BlockSize = Literal[1, 8, 16, 32, 64, 128] CacheDType = Literal["auto", "fp8", "fp8_e4m3", "fp8_e5m2"] @@ -1391,7 +1439,7 @@ PrefixCachingHashAlgo = Literal["builtin", "sha256"] class CacheConfig: """Configuration for the KV cache.""" - block_size: BlockSize = None # type: ignore + block_size: SkipValidation[BlockSize] = None # type: ignore """Size of a contiguous cache block in number of tokens. This is ignored on neuron devices and set to `--max-model-len`. On CUDA devices, only block sizes up to 32 are supported. On HPU devices, block size defaults to 128. @@ -1613,7 +1661,8 @@ class LoadConfig: download_dir: Optional[str] = None """Directory to download and load the weights, default to the default cache directory of Hugging Face.""" - model_loader_extra_config: dict = field(default_factory=dict) + model_loader_extra_config: Union[dict, TensorizerConfig] = field( + default_factory=dict) """Extra config for model loader. This will be passed to the model loader corresponding to the chosen load_format.""" ignore_patterns: Optional[Union[list[str], str]] = None @@ -1741,6 +1790,10 @@ class ParallelConfig: rank: int = 0 """Global rank in distributed setup.""" + enable_multimodal_encoder_data_parallel: bool = False + """ Use data parallelism instead of tensor parallelism for vision encoder. 
+ Only support LLama4 for now""" + @property def world_size_across_dp(self) -> int: """world_size_across_dp is TPxPPxDP, it is the size of the world @@ -1923,19 +1976,19 @@ class SchedulerConfig: runner_type: RunnerType = "generate" """The runner type to launch for the model.""" - max_num_batched_tokens: int = None # type: ignore + max_num_batched_tokens: SkipValidation[int] = None # type: ignore """Maximum number of tokens to be processed in a single iteration. This config has no static default. If left unspecified by the user, it will be set in `EngineArgs.create_engine_config` based on the usage context.""" - max_num_seqs: int = None # type: ignore + max_num_seqs: SkipValidation[int] = None # type: ignore """Maximum number of sequences to be processed in a single iteration. This config has no static default. If left unspecified by the user, it will be set in `EngineArgs.create_engine_config` based on the usage context.""" - max_model_len: int = None # type: ignore + max_model_len: SkipValidation[int] = None # type: ignore """Maximum length of a sequence (including prompt and generated text). This is primarily set in `ModelConfig` and that value should be manually duplicated here.""" @@ -1974,7 +2027,7 @@ class SchedulerConfig: """Apply a delay (of delay factor multiplied by previous prompt latency) before scheduling next prompt.""" - enable_chunked_prefill: bool = None # type: ignore + enable_chunked_prefill: SkipValidation[bool] = None # type: ignore """If True, prefill requests can be chunked based on the remaining max_num_batched_tokens.""" @@ -2074,28 +2127,28 @@ class SchedulerConfig: # so we don't reject sequences on account of a short # max_num_batched_tokens. 
self.max_num_batched_tokens = max( - self.max_model_len, _DEFAULT_MAX_NUM_BATCHED_TOKENS) + self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS) else: self.max_num_batched_tokens = ( - _DEFAULT_MAX_NUM_BATCHED_TOKENS) + DEFAULT_MAX_NUM_BATCHED_TOKENS) else: # If max_model_len is too short, use - # _DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value + # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value # for higher throughput. self.max_num_batched_tokens = max( - self.max_model_len, _DEFAULT_MAX_NUM_BATCHED_TOKENS) + self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS) if self.runner_type == "pooling": # Choose specific value for higher throughput self.max_num_batched_tokens = max( self.max_num_batched_tokens, - _POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, + POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, ) if self.is_multimodal_model: # The value needs to be at least the number of multimodal tokens self.max_num_batched_tokens = max( self.max_num_batched_tokens, - _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, + MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, ) # When using default settings, @@ -2196,12 +2249,16 @@ Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu", "hpu"] @config -@dataclass +@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) class DeviceConfig: """Configuration for the device to use for vLLM execution.""" - device: Union[Device, torch.device] = "auto" - """Device type for vLLM execution.""" + device: SkipValidation[Union[Device, torch.device]] = "auto" + """Device type for vLLM execution. + This parameter is deprecated and will be + removed in a future release. + It will now be set automatically based + on the current platform.""" device_type: str = field(init=False) """Device type from the current platform. 
This is set in `__post_init__`.""" @@ -2250,8 +2307,8 @@ class DeviceConfig: self.device = torch.device(self.device_type) -SpeculativeMethod = Literal["ngram", "eagle", "medusa", "mlp_speculator", - "draft_model"] +SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa", + "mlp_speculator", "draft_model", "deepseek_mtp"] SpeculativeAcceptanceMethod = Literal["rejection_sampler", "typical_acceptance_sampler"] @@ -2262,8 +2319,7 @@ class SpeculativeConfig: """Configuration for speculative decoding.""" # General speculative decoding control - num_speculative_tokens: int = field(default=None, - init=True) # type: ignore + num_speculative_tokens: SkipValidation[int] = None # type: ignore """The number of speculative tokens, if provided. It will default to the number in the draft model config if present, otherwise, it is required.""" model: Optional[str] = None @@ -2339,26 +2395,23 @@ class SpeculativeConfig: """Specifies the tree structure for speculative token generation. """ # required configuration params passed from engine - target_model_config: ModelConfig = field(default=None, - init=True) # type: ignore + target_model_config: SkipValidation[ModelConfig] = None # type: ignore """The configuration of the target model.""" - target_parallel_config: ParallelConfig = field(default=None, - init=True) # type: ignore + target_parallel_config: SkipValidation[ + ParallelConfig] = None # type: ignore """The parallel configuration for the target model.""" - enable_chunked_prefill: bool = field(default=None, - init=True) # type: ignore + enable_chunked_prefill: SkipValidation[bool] = None # type: ignore """Whether vLLM is configured to use chunked prefill or not. 
Used for raising an error since it's not yet compatible with speculative decode.""" - disable_log_stats: bool = field(default=None, init=True) # type: ignore + disable_log_stats: SkipValidation[bool] = None # type: ignore """Whether to disable the periodic printing of stage times in speculative decoding.""" # params generated in the post-init stage - draft_model_config: ModelConfig = field(default=None, - init=True) # type: ignore + draft_model_config: SkipValidation[ModelConfig] = None # type: ignore """The configuration of the draft model initialized internal.""" - draft_parallel_config: ParallelConfig = field(default=None, - init=True) # type: ignore + draft_parallel_config: SkipValidation[ + ParallelConfig] = None # type: ignore """The parallel configuration for the draft model initialized internal.""" def compute_hash(self) -> str: @@ -2515,6 +2568,15 @@ class SpeculativeConfig: elif (self.draft_model_config.hf_config.model_type == "mlp_speculator"): self.method = "mlp_speculator" + elif (self.draft_model_config.hf_config.model_type == + "deepseek_mtp"): + self.method = "deepseek_mtp" + if self.num_speculative_tokens > 1: + logger.warning( + "All Deepseek MTP models only have " \ + "one layer. Might need some code changes " \ + "to support multiple layers." 
+ ) else: self.method = "draft_model" @@ -2525,16 +2587,16 @@ class SpeculativeConfig: "Chunked prefill and EAGLE are not compatible " "when using V0.") - from vllm.platforms import current_platform from vllm.transformers_utils.configs.eagle import ( EAGLEConfig) if isinstance(self.draft_model_config.hf_config, - EAGLEConfig) or current_platform.is_neuron(): + EAGLEConfig): pass else: eagle_config = EAGLEConfig( self.draft_model_config.hf_config, - method=self.method) + method=self.method, + model_type="eagle") self.draft_model_config.hf_config = eagle_config if (self.num_speculative_tokens is not None @@ -2735,7 +2797,7 @@ class SpeculativeConfig: return self.num_speculative_tokens def use_eagle(self) -> bool: - return self.method in ("eagle", "eagle3") + return self.method in ("eagle", "eagle3", "deepseek_mtp") def __repr__(self) -> str: method = self.method @@ -2748,7 +2810,7 @@ LoRADType = Literal["auto", "float16", "bfloat16"] @config -@dataclass +@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) class LoRAConfig: """Configuration for LoRA.""" @@ -2845,7 +2907,7 @@ class LoRAConfig: @config -@dataclass +@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) class PromptAdapterConfig: """Configuration for PromptAdapters.""" @@ -2968,7 +3030,7 @@ class PoolerConfig: pooling_type: Optional[str] = None """ The pooling method of the pooling model. This should be a key in - {class}`vllm.model_executor.layers.pooler.PoolingType`. + [`vllm.model_executor.layers.pooler.PoolingType`][]. """ normalize: Optional[bool] = None @@ -3025,13 +3087,37 @@ _STR_DTYPE_TO_TORCH_DTYPE = { "bfloat16": torch.bfloat16, } -_ROCM_NOT_SUPPORTED_DTYPE: list[str] = [] # +# model_type -> reason +_FLOAT16_NOT_SUPPORTED_MODELS = { + "gemma2": "Numerical instability. Please use bfloat16 or float32 instead.", + "gemma3": "Numerical instability. Please use bfloat16 or float32 instead.", + "plamo2": "Numerical instability. 
Please use bfloat16 or float32 instead.", + "glm4": "Numerical instability. Please use bfloat16 or float32 instead.", +} -def _get_and_verify_dtype( +def _is_valid_dtype(model_type: str, dtype: torch.dtype): + if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16: # noqa: E501, SIM103 + return False + + return True + + +def _check_valid_dtype(model_type: str, dtype: torch.dtype): + if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16: + reason = _FLOAT16_NOT_SUPPORTED_MODELS[model_type] + raise ValueError(f"The model type {model_type!r} " + f"does not support float16. Reason: {reason}") + + return True + + +def _find_dtype( + model_id: str, config: PretrainedConfig, - dtype: Union[str, torch.dtype], -) -> torch.dtype: + *, + revision: Optional[str], +): # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct # because config.torch_dtype can be None. config_dtype = getattr(config, "torch_dtype", None) @@ -3043,75 +3129,111 @@ def _get_and_verify_dtype( if config_dtype is None and hasattr(config, "vision_config"): config_dtype = getattr(config.vision_config, "torch_dtype", None) + # Try to read the dtype of the weights if they are in safetensors format + if config_dtype is None: + repo_mt = try_get_safetensors_metadata(model_id, revision=revision) + + if repo_mt and (files_mt := repo_mt.files_metadata): + param_dtypes: set[torch.dtype] = { + _SAFETENSORS_TO_TORCH_DTYPE[dtype_str] + for file_mt in files_mt.values() + for dtype_str in file_mt.parameter_count + if dtype_str in _SAFETENSORS_TO_TORCH_DTYPE + } + + if param_dtypes: + return common_broadcastable_dtype(param_dtypes) + if config_dtype is None: config_dtype = torch.float32 + return config_dtype + + +def _resolve_auto_dtype( + model_type: str, + config_dtype: torch.dtype, + *, + is_pooling_model: bool, +): + from vllm.platforms import current_platform + + supported_dtypes = [ + dtype for dtype in current_platform.supported_dtypes + if 
_is_valid_dtype(model_type, dtype) + ] + + if is_pooling_model and torch.float16 in supported_dtypes: + preferred_dtype = torch.float16 + else: + preferred_dtype = supported_dtypes[0] + + # Downcast for float32 models + if config_dtype == torch.float32: + config_dtype = preferred_dtype + + if config_dtype in supported_dtypes: + return config_dtype + + # Ensure device compatibility + device_name = current_platform.get_device_name() + device_capability = current_platform.get_device_capability() + + if device_capability is None: + device_str = f"{device_name!r}" + else: + version_str = device_capability.as_version_str() + device_str = f"{device_name!r} (with compute capability {version_str})" + + logger.warning( + "Your device %s doesn't support %s. " + "Falling back to %s for compatibility.", + device_str, + config_dtype, + preferred_dtype, + ) + + return preferred_dtype + + +def _get_and_verify_dtype( + model_id: str, + config: PretrainedConfig, + dtype: Union[str, torch.dtype], + *, + is_pooling_model: bool, + revision: Optional[str] = None, +) -> torch.dtype: + config_dtype = _find_dtype(model_id, config, revision=revision) + model_type = config.model_type + if isinstance(dtype, str): dtype = dtype.lower() if dtype == "auto": # Set default dtype from model config - if config_dtype == torch.float32: - # Following common practice, we use float16 for float32 models - torch_dtype = torch.float16 - else: - torch_dtype = config_dtype - - if config.model_type == "plamo2": - logger.warning( - "For PLaMo2, we cast models to bfloat16 instead of using " - "float16 by default. This is because float16 does not work." - ) - torch_dtype = torch.bfloat16 - - # Deal with torch dtype fallback for device compatibility. 
- from vllm.platforms import current_platform - if torch_dtype not in current_platform.supported_dtypes: - device_name = current_platform.get_device_name() - - if ((capability := current_platform.get_device_capability()) - is None): - compute_str = "" - else: - version_str = capability.as_version_str() - compute_str = f" (with compute capability {version_str})" - fallback_dtype = current_platform.supported_dtypes[0] - logger.warning( - "Your %s device%s doesn't support %s. " \ - "Falling back to %s for compatibility.", - device_name, compute_str, torch_dtype, fallback_dtype - ) - torch_dtype = fallback_dtype - - if current_platform.is_hpu() and torch_dtype == torch.float16: - logger.warning( - "For HPU, we cast models to bfloat16 instead of " - "using float16 by default. Please specify `dtype` if you " - "want to use float16.") - torch_dtype = torch.bfloat16 - elif dtype == "float16" and config.model_type == "plamo2": - logger.warning( - "For PLaMo2, using float16 is unstable and might cause " - "unexpected behavior. Please use bfloat16 or float32 instead.") - torch_dtype = torch.float16 + torch_dtype = _resolve_auto_dtype( + model_type, + config_dtype, + is_pooling_model=is_pooling_model, + ) else: if dtype not in _STR_DTYPE_TO_TORCH_DTYPE: - raise ValueError(f"Unknown dtype: {dtype}") + raise ValueError(f"Unknown dtype: {dtype!r}") torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype] elif isinstance(dtype, torch.dtype): torch_dtype = dtype else: raise ValueError(f"Unknown dtype: {dtype}") - # Verify the dtype. + _check_valid_dtype(model_type, torch_dtype) + if torch_dtype != config_dtype: if torch_dtype == torch.float32: # Upcasting to float32 is allowed. logger.info("Upcasting %s to %s.", config_dtype, torch_dtype) - pass elif config_dtype == torch.float32: # Downcasting from float32 to float16 or bfloat16 is allowed. logger.info("Downcasting %s to %s.", config_dtype, torch_dtype) - pass else: # Casting between float16 and bfloat16 is allowed with a warning. 
logger.warning("Casting %s to %s.", config_dtype, torch_dtype) @@ -3491,7 +3613,7 @@ class KVTransferConfig: """The KV connector for vLLM to transmit KV caches between vLLM instances. """ - engine_id: str = str(uuid.uuid4()) + engine_id: Optional[str] = None """The engine id for KV transfers.""" kv_buffer_device: Optional[str] = "cuda" @@ -3548,6 +3670,9 @@ class KVTransferConfig: return hash_str def __post_init__(self) -> None: + if self.engine_id is None: + self.engine_id = str(uuid.uuid4()) + if self.kv_role is not None and self.kv_role not in get_args(KVRole): raise ValueError(f"Unsupported kv_role: {self.kv_role}. " f"Supported roles are {get_args(KVRole)}") @@ -3646,6 +3771,8 @@ class PassConfig: """Whether to enable the custom no-op elimination pass.""" enable_sequence_parallelism: bool = False """Whether to enable sequence parallelism.""" + enable_async_tp: bool = False + """Whether to enable async TP.""" def uuid(self): """ @@ -3655,7 +3782,8 @@ class PassConfig: compilation. """ include = { - "enable_fusion", "enable_noop", "enable_sequence_parallelism" + "enable_fusion", "enable_noop", "enable_sequence_parallelism", + "enable_async_tp" } dict_ = {k: v for k, v in asdict(self).items() if k in include} return InductorPass.hash_dict(dict_) @@ -3673,23 +3801,27 @@ class CompilationConfig: """Configuration for compilation. 
It has three parts: - Top-level Compilation control: - - {attr}`level` - - {attr}`debug_dump_path` - - {attr}`cache_dir` - - {attr}`backend` - - {attr}`custom_ops` - - {attr}`splitting_ops` + - [`level`][vllm.config.CompilationConfig.level] + - [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path] + - [`cache_dir`][vllm.config.CompilationConfig.cache_dir] + - [`backend`][vllm.config.CompilationConfig.backend] + - [`custom_ops`][vllm.config.CompilationConfig.custom_ops] + - [`splitting_ops`][vllm.config.CompilationConfig.splitting_ops] - CudaGraph capture: - - {attr}`use_cudagraph` - - {attr}`cudagraph_capture_sizes` - - {attr}`cudagraph_num_of_warmups` - - {attr}`cudagraph_copy_inputs` - - {attr}`full_cuda_graph` + - [`use_cudagraph`][vllm.config.CompilationConfig.use_cudagraph] + - [`cudagraph_capture_sizes`] + [vllm.config.CompilationConfig.cudagraph_capture_sizes] + - [`cudagraph_num_of_warmups`] + [vllm.config.CompilationConfig.cudagraph_num_of_warmups] + - [`cudagraph_copy_inputs`] + [vllm.config.CompilationConfig.cudagraph_copy_inputs] + - [`full_cuda_graph`][vllm.config.CompilationConfig.full_cuda_graph] - Inductor compilation: - - {attr}`use_inductor` - - {attr}`compile_sizes` - - {attr}`inductor_compile_config` - - {attr}`inductor_passes` + - [`use_inductor`][vllm.config.CompilationConfig.use_inductor] + - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes] + - [`inductor_compile_config`] + [vllm.config.CompilationConfig.inductor_compile_config] + - [`inductor_passes`][vllm.config.CompilationConfig.inductor_passes] - custom inductor passes Why we have different sizes for cudagraph and inductor: @@ -3864,17 +3996,11 @@ class CompilationConfig: "pass_config", "traced_files", } - include = dict() - for k, v in asdict(self).items(): - if k in exclude: - continue - f = get_field(CompilationConfig, k) - if (d := f.default) is not MISSING and d == v: - continue - if (df := f.default_factory) is not MISSING and df() == v: - continue - 
include[k] = v - return json.dumps(include) + # The cast to string is necessary because Pydantic is mocked in docs + # builds and sphinx-argparse doesn't know the return type of decode() + return str( + TypeAdapter(CompilationConfig).dump_json( + self, exclude=exclude, exclude_unset=True).decode()) __str__ = __repr__ @@ -3883,7 +4009,7 @@ class CompilationConfig: """Parse the CLI value for the compilation config.""" if cli_value in ["0", "1", "2", "3"]: return cls(level=int(cli_value)) - return cls(**json.loads(cli_value)) + return TypeAdapter(CompilationConfig).validate_json(cli_value) def __post_init__(self) -> None: count_none = self.custom_ops.count("none") @@ -4009,7 +4135,7 @@ class CompilationConfig: @config -@dataclass +@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) class VllmConfig: """Dataclass which contains all vllm-related configuration. This simplifies passing around the distinct configurations in the codebase. @@ -4239,25 +4365,22 @@ class VllmConfig: self.model_config.verify_dual_chunk_attention_config( self.load_config) - if self.cache_config is not None: - self.cache_config.verify_with_parallel_config(self.parallel_config) + self.cache_config.verify_with_parallel_config(self.parallel_config) - if self.lora_config: + if self.lora_config is not None: self.lora_config.verify_with_cache_config(self.cache_config) self.lora_config.verify_with_model_config(self.model_config) self.lora_config.verify_lora_support() - if self.prompt_adapter_config: + if self.prompt_adapter_config is not None: self.prompt_adapter_config.verify_with_model_config( self.model_config) - if self.quant_config is None and \ - self.model_config is not None and self.load_config is not None: + if self.quant_config is None and self.model_config is not None: self.quant_config = VllmConfig._get_quantization_config( self.model_config, self.load_config) from vllm.platforms import current_platform - if self.scheduler_config is not None and \ - self.model_config is not None and 
\ + if self.model_config is not None and \ self.scheduler_config.chunked_prefill_enabled and \ self.model_config.dtype == torch.float32 and \ current_platform.get_device_capability() == (7, 5): @@ -4266,21 +4389,19 @@ class VllmConfig: "To workaround this limitation, vLLM will set 'ieee' input " "precision for chunked prefill triton kernels.") - if self.compilation_config is None: - self.compilation_config = CompilationConfig() + # async tp is built on top of sequence parallelism + # and requires it to be enabled. + if self.compilation_config.pass_config.enable_async_tp: + self.compilation_config.pass_config.enable_sequence_parallelism = \ + True if self.compilation_config.pass_config.enable_sequence_parallelism: self.compilation_config.custom_ops.append("+rms_norm") if envs.VLLM_USE_V1 and self.model_config is not None and \ not self.model_config.enforce_eager: - # NOTE(woosuk): Currently, we use inductor because the piecewise - # CUDA graphs do not work properly with the custom CUDA kernels. - # FIXME(woosuk): Disable inductor to reduce the compilation time - # and avoid any potential issues with the inductor. # FIXME(rob): Add function to set all of these. if not self.compilation_config.custom_ops: self.compilation_config.custom_ops = ["none"] self.compilation_config.use_cudagraph = True - self.compilation_config.use_inductor = True self.compilation_config.cudagraph_num_of_warmups = 1 self.compilation_config.pass_config.enable_fusion = False self.compilation_config.pass_config.enable_noop = False @@ -4289,8 +4410,7 @@ class VllmConfig: self._set_cudagraph_sizes() - if self.cache_config is not None and \ - self.cache_config.cpu_offload_gb > 0 and \ + if self.cache_config.cpu_offload_gb > 0 and \ self.compilation_config.level != CompilationLevel.NO_COMPILATION \ and not envs.VLLM_USE_V1: logger.warning( @@ -4312,28 +4432,16 @@ class VllmConfig: "full_cuda_graph is not supported with " "cascade attention. 
Disabling cascade attention.") self.model_config.disable_cascade_attn = True + self.cache_config.enable_prefix_caching = False - if self.model_config and self.model_config.use_mla and \ - not (current_platform.is_cuda() or current_platform.is_rocm()): - logger.info( - "MLA is enabled on a non-GPU platform; forcing chunked " - "prefill and prefix caching to be disabled.") - self.scheduler_config.enable_chunked_prefill = False - self.scheduler_config.chunked_prefill_enabled = False - self.scheduler_config.max_num_batched_tokens = max( - self.scheduler_config.max_model_len, - _DEFAULT_MAX_NUM_BATCHED_TOKENS) - - if self.cache_config is not None: - self.cache_config.enable_prefix_caching = False - - if (self.kv_events_config + if (self.kv_events_config is not None and self.kv_events_config.enable_kv_cache_events and not self.cache_config.enable_prefix_caching): logger.warning( "KV cache events are on, but prefix caching is not enabled." "Use --enable-prefix-caching to enable.") - if (self.kv_events_config and self.kv_events_config.publisher != "null" + if (self.kv_events_config is not None + and self.kv_events_config.publisher != "null" and not self.kv_events_config.enable_kv_cache_events): logger.warning("KV cache events are disabled," "but the scheduler is configured to publish them." @@ -4449,6 +4557,13 @@ class VllmConfig: self.compilation_config.init_with_cudagraph_sizes( batch_size_capture_list) + def recalculate_max_model_len(self, max_model_len: int): + model_config = self.model_config + max_model_len = model_config.get_and_verify_max_len(max_model_len) + self.model_config.max_model_len = max_model_len + self.scheduler_config.max_model_len = max_model_len + self.compute_hash() + def __str__(self): return ( f"model={self.model_config.model!r}," @@ -4549,7 +4664,7 @@ def contains_object_print(text): text (str): The text to check Returns: - bool: True if a match is found, False otherwise + result (bool): `True` if a match is found, `False` otherwise. 
""" pattern = r'at 0x[a-fA-F0-9]{2,16}>' match = re.search(pattern, text) diff --git a/vllm/connections.py b/vllm/connections.py index 9abc66050e18a..84e32a4d5ca9c 100644 --- a/vllm/connections.py +++ b/vllm/connections.py @@ -167,4 +167,7 @@ class HTTPConnection: global_http_connection = HTTPConnection() -"""The global {class}`HTTPConnection` instance used by vLLM.""" +""" +The global [`HTTPConnection`][vllm.connections.HTTPConnection] instance used +by vLLM. +""" diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index b69647b00586e..a250ec89cd5ba 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -1,44 +1,24 @@ # SPDX-License-Identifier: Apache-2.0 +import importlib.util +from typing import TYPE_CHECKING + import torch +import torch.distributed as dist from vllm.forward_context import get_forward_context +from vllm.logger import init_logger + +from .base_device_communicator import All2AllManagerBase, Cache + +logger = init_logger(__name__) + +if TYPE_CHECKING: + from vllm.model_executor.layers.fused_moe.layer import FusedMoE +else: + FusedMoE = None -class All2AllBase: - - def __init__(self, cpu_group, model): - self.cpu_group = cpu_group - - # compute some common properties - from vllm.distributed.parallel_state import (get_dp_group, - get_ep_group, - get_tp_group, - in_the_same_node_as) - - # all2all lives in ep group, which is merged from dp and tp group - self.dp_group = get_dp_group() - self.tp_group = get_tp_group() - self.ep_group = get_ep_group() - self.dp_rank = self.dp_group.rank_in_group - self.dp_world_size = self.dp_group.world_size - - # all2all communication often has separate implementations for - # intra-node and inter-node communication - self.intranode = in_the_same_node_as(cpu_group, source_rank=0) - self.internode = not self.intranode - - def dispatch(self, hidden_states: torch.Tensor, - router_logits: 
torch.Tensor): - raise NotImplementedError - - def combine(self, hidden_states: torch.Tensor) -> torch.Tensor: - raise NotImplementedError - - def destroy(self): - pass - - -class NaiveAll2All(All2AllBase): +class NaiveAll2AllManager(All2AllManagerBase): """ A naive implementation of all2all communication. It uses all-reduce under the hood, which is not @@ -46,8 +26,8 @@ class NaiveAll2All(All2AllBase): debugging. """ - def __init__(self, cpu_group, model): - super().__init__(cpu_group, model) + def __init__(self, cpu_group): + super().__init__(cpu_group) def naive_multicast(self, x: torch.Tensor, cu_tokens_across_dp_cpu: torch.Tensor): @@ -91,3 +71,56 @@ class NaiveAll2All(All2AllBase): def destroy(self): pass + + +class PPLXAll2AllManager(All2AllManagerBase): + """ + All2All communication based on PPLX kernels. + """ + + def __init__(self, cpu_group): + has_pplx = importlib.util.find_spec("pplx_kernels") is not None + assert has_pplx, "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install pplx_kernels." 
# noqa + super().__init__(cpu_group) + + if self.internode: + # inter-node communication needs nvshmem, + # intra-node communication uses p2p mapping directly + from pplx_kernels.nvshmem import (nvshmem_alloc_empty_unique_id, + nvshmem_get_unique_id, + nvshmem_init) + logger.debug( + "Initialize NVSHMEM for pplx_kernels: " + "rank=%d, world size=%d", self.rank, self.world_size) + uid = nvshmem_get_unique_id( + ) if self.rank == 0 else nvshmem_alloc_empty_unique_id() + dist.broadcast(uid, + src=dist.get_process_group_ranks(self.cpu_group)[0], + group=self.cpu_group) + logger.debug("PPLX NVSHMEM UID = %s", uid) + nvshmem_init(uid, self.rank, self.world_size) + + self.handle_cache = Cache() + + def get_handle(self, kwargs): + import pplx_kernels as pplx + return self.handle_cache.get_or_create( + kwargs, pplx.AllToAll.internode + if self.internode else pplx.AllToAll.intranode) + + def dispatch(self, hidden_states: torch.Tensor, + router_logits: torch.Tensor): + raise NotImplementedError + + def combine(self, hidden_states: torch.Tensor) -> torch.Tensor: + raise NotImplementedError + + def destroy(self): + with self.handle_cache._lock: + for _, handle in self.handle_cache._cache.items(): + handle.destroy() + + if self.internode: + from pplx_kernels.nvshmem import nvshmem_finalize + logger.debug("PPLX NVSHMEM finalize") + nvshmem_finalize() diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py index ead79872bd499..52b970949144f 100644 --- a/vllm/distributed/device_communicators/base_device_communicator.py +++ b/vllm/distributed/device_communicators/base_device_communicator.py @@ -1,11 +1,76 @@ # SPDX-License-Identifier: Apache-2.0 +import threading from typing import Optional +from weakref import WeakValueDictionary import torch import torch.distributed as dist from torch.distributed import ProcessGroup +class Cache: + + def __init__(self): + self._cache: WeakValueDictionary = 
WeakValueDictionary() + self._lock = threading.RLock() # Reentrant lock for thread safety + + def get_or_create(self, kwargs, func): + # Create a hashable key from the kwargs + key = tuple(sorted((k, v) for k, v in kwargs.items())) + + with self._lock: + instance = self._cache.get(key) + if instance is None: + instance = func(**kwargs) + self._cache[key] = instance + return instance + + +class All2AllManagerBase: + + def __init__(self, cpu_group): + self.cpu_group = cpu_group + + # compute some common properties + from vllm.distributed.parallel_state import (get_dp_group, + get_tp_group, + in_the_same_node_as) + + # all2all lives in ep group, which is merged from dp and tp group + self.dp_group = get_dp_group() + self.tp_group = get_tp_group() + # no self.ep_group since self.ep_group is still in construction + # when we create this object + self.dp_rank = self.dp_group.rank_in_group + self.dp_world_size = self.dp_group.world_size + self.rank = dist.get_rank(cpu_group) + self.world_size = dist.get_world_size(cpu_group) + + # all2all communication often has separate implementations for + # intra-node and inter-node communication + self.intranode = in_the_same_node_as(cpu_group, source_rank=0) + self.internode = not self.intranode + + def get_handle(self, kwargs): + # get a handle for the all2all communication, + # based on the kwargs. + # different layers can have different configs, + # e.g. one layer has hidden size 1024, another has 2048. + # usually the underlying implementation caches the handle + # and reuse it for the same config. + raise NotImplementedError + + def dispatch(self, hidden_states: torch.Tensor, + router_logits: torch.Tensor): + raise NotImplementedError + + def combine(self, hidden_states: torch.Tensor) -> torch.Tensor: + raise NotImplementedError + + def destroy(self): + pass + + class DeviceCommunicatorBase: """ Base class for device-specific communicator. 
@@ -31,6 +96,18 @@ class DeviceCommunicatorBase: self.rank_in_group = dist.get_group_rank(self.cpu_group, self.global_rank) + use_ep = False + from vllm.config import get_current_vllm_config + config = get_current_vllm_config() + if config is not None: + # as long as we use data parallel (coupled data parallel + # where all data parallel ranks execute forward together), + # we initialize the all2all manager used in expert parallel. + use_ep = config.parallel_config.data_parallel_size > 1 + + self.use_all2all = "ep" in unique_name and use_ep + self.all2all_manager: Optional[All2AllManagerBase] = None + def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: dist.all_reduce(input_, group=self.device_group) return input_ @@ -154,9 +231,17 @@ class DeviceCommunicatorBase: model: torch.nn.Module) -> None: """ Prepare the communication buffer for the model. - This is a no-op in the base class. """ - pass + if not self.use_all2all: + return + + moe_modules = [ + module for module in model.modules() + if module.__class__.__name__ == "FusedMoE" + ] + for module in moe_modules: + module.quant_method.init_prepare_finalize(module.moe_config, + module.quant_config) def dispatch( self, hidden_states: torch.Tensor, diff --git a/vllm/distributed/device_communicators/cpu_communicator.py b/vllm/distributed/device_communicators/cpu_communicator.py index d4b34900b9515..c04218cb9f394 100644 --- a/vllm/distributed/device_communicators/cpu_communicator.py +++ b/vllm/distributed/device_communicators/cpu_communicator.py @@ -22,8 +22,10 @@ class CpuCommunicator(DeviceCommunicatorBase): super().__init__(cpu_group, device, device_group, unique_name) self.dist_module = torch.distributed - if (current_platform.get_cpu_architecture() == CpuArchEnum.X86) \ - and hasattr(torch.ops._C, "init_shm_manager"): + if (current_platform.get_cpu_architecture() + == CpuArchEnum.X86) and hasattr( + torch.ops._C, + "init_shm_manager") and unique_name.startswith("tp"): self.dist_module = 
_CPUSHMDistributed(self) def all_reduce(self, input_): @@ -96,6 +98,8 @@ class _CPUSHMDistributed: def __init__(self, communicator: CpuCommunicator): instance_identifier = os.environ["VLLM_DIST_IDENT"] + unique_name = communicator.unique_name + instance_identifier = f"{instance_identifier}-{unique_name}" self.communicator = communicator group_ranks = [str(rank) for rank in self.communicator.ranks] diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py index 13303f94b8ea8..a05a13f51d4bc 100644 --- a/vllm/distributed/device_communicators/cuda_communicator.py +++ b/vllm/distributed/device_communicators/cuda_communicator.py @@ -6,10 +6,12 @@ import torch from torch.distributed import ProcessGroup import vllm.envs as envs +from vllm.logger import init_logger -from .all2all import All2AllBase from .base_device_communicator import DeviceCommunicatorBase +logger = init_logger(__name__) + class CudaCommunicator(DeviceCommunicatorBase): @@ -31,8 +33,6 @@ class CudaCommunicator(DeviceCommunicatorBase): use_pynccl = "ep" not in unique_name self.use_pynccl = use_pynccl - self.use_all2all = "ep" in unique_name - self.all2all_impl: Optional[All2AllBase] = None self.use_custom_allreduce = use_custom_allreduce # lazy import to avoid documentation build error @@ -56,6 +56,19 @@ class CudaCommunicator(DeviceCommunicatorBase): device=self.device, ) + if self.use_all2all: + all2all_backend = envs.VLLM_ALL2ALL_BACKEND + if all2all_backend == "naive": + from .all2all import NaiveAll2AllManager + self.all2all_manager = NaiveAll2AllManager(self.cpu_group) + logger.info("Using naive all2all manager.") + elif all2all_backend == "pplx": + from .all2all import PPLXAll2AllManager + self.all2all_manager = PPLXAll2AllManager(self.cpu_group) + logger.info("Using PPLX all2all manager.") + else: + raise ValueError(f"Unknown all2all backend: {all2all_backend}") + def all_reduce(self, input_): # always try custom allreduce 
first, # and then pynccl. @@ -136,31 +149,19 @@ class CudaCommunicator(DeviceCommunicatorBase): self.pynccl_comm = None if self.ca_comm is not None: self.ca_comm = None - if self.all2all_impl is not None: - self.all2all_impl.destroy() - self.all2all_impl = None - - def prepare_communication_buffer_for_model(self, - model: torch.nn.Module) -> None: - """ - Prepare the communication buffer for the model. - """ - if not self.use_all2all: - return - all2all_backend = envs.VLLM_ALL2ALL_BACKEND - if all2all_backend == "naive": - from .all2all import NaiveAll2All - self.all2all_impl = NaiveAll2All(self.cpu_group, model) + if self.all2all_manager is not None: + self.all2all_manager.destroy() + self.all2all_manager = None def dispatch( self, hidden_states: torch.Tensor, router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - assert self.all2all_impl is not None - hidden_states, router_logits = self.all2all_impl.dispatch( + assert self.all2all_manager is not None + hidden_states, router_logits = self.all2all_manager.dispatch( hidden_states, router_logits) return hidden_states, router_logits def combine(self, hidden_states: torch.Tensor) -> torch.Tensor: - assert self.all2all_impl is not None - hidden_states = self.all2all_impl.combine(hidden_states) + assert self.all2all_manager is not None + hidden_states = self.all2all_manager.combine(hidden_states) return hidden_states diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index fa944407a703c..40e57e6624d1e 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -import os import pickle -import sys import time from contextlib import contextmanager from dataclasses import dataclass, field @@ -19,7 +17,7 @@ from zmq import IPV6 # type: ignore from zmq import SUB, SUBSCRIBE, XPUB, XPUB_VERBOSE, Context # type: ignore 
import vllm.envs as envs -from vllm.distributed.utils import StatelessProcessGroup +from vllm.distributed.utils import StatelessProcessGroup, sched_yield from vllm.logger import init_logger from vllm.utils import (get_ip, get_open_port, get_open_zmq_ipc_path, is_valid_ipv6_address) @@ -28,20 +26,6 @@ VLLM_RINGBUFFER_WARNING_INTERVAL = envs.VLLM_RINGBUFFER_WARNING_INTERVAL logger = init_logger(__name__) -# We prefer to use os.sched_yield as it results in tighter polling loops, -# measured to be around 3e-7 seconds. However on earlier versions of Python -# os.sched_yield() does not release the GIL, so we fall back to time.sleep(0) -USE_SCHED_YIELD = ((sys.version_info[:3] >= (3, 11, 1)) - or (sys.version_info[:2] == (3, 10) - and sys.version_info[2] >= 8)) - - -def sched_yield(): - if USE_SCHED_YIELD: - os.sched_yield() - else: - time.sleep(0) - class ShmRingBuffer: diff --git a/vllm/distributed/kv_transfer/__init__.py b/vllm/distributed/kv_transfer/__init__.py index a9f26607de49c..8b6abf5a80dd0 100644 --- a/vllm/distributed/kv_transfer/__init__.py +++ b/vllm/distributed/kv_transfer/__init__.py @@ -1,8 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType from vllm.distributed.kv_transfer.kv_transfer_state import ( - ensure_kv_transfer_initialized, get_kv_transfer_group, + KVConnectorBaseType, ensure_kv_transfer_initialized, get_kv_transfer_group, has_kv_transfer_group, is_v1_kv_transfer_group) __all__ = [ diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 06b3983ed68bd..dce0b545c188e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -70,7 +70,8 @@ class KVConnectorFactory: connector_module = importlib.import_module(connector_module_path) connector_cls = getattr(connector_module, connector_name) assert issubclass(connector_cls, 
KVConnectorBase_V1) - logger.info("Creating v1 connector with name: %s", connector_name) + logger.info("Creating v1 connector with name: %s and engine_id: %s", + connector_name, kv_transfer_config.engine_id) # NOTE(Kuntai): v1 connector is explicitly separated into two roles. # Scheduler connector: # - Co-locate with scheduler process diff --git a/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py b/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py index 56b55c2bb59d2..58eabd0a37ebb 100644 --- a/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py @@ -31,12 +31,12 @@ class MooncakeStoreConnector(KVConnectorBase): local_rank: int, config: VllmConfig, ): - self.config = config.kv_transfer_config + self.kv_transfer_config = config.kv_transfer_config self.kv_helper = kv_helper(config) self.local_tp_rank = local_rank # Init kv_store - if self.config.kv_connector == "MooncakeStoreConnector": + if self.kv_transfer_config.kv_connector == "MooncakeStoreConnector": # Check if MOONCAKE_CONFIG_PATH is set import os use_mooncake_store = os.getenv('MOONCAKE_CONFIG_PATH') is not None @@ -50,10 +50,11 @@ class MooncakeStoreConnector(KVConnectorBase): MooncakeStore) logger.info( "Initializing KVStoreConnector under kv_transfer_config %s", - self.config) + self.kv_transfer_config) self.kv_store = MooncakeStore(config) else: - logger.error("Can not find %s", self.config.kv_connector) + logger.error("Can not find %s", + self.kv_transfer_config.kv_connector) assert self.kv_store is not None diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py index 2e4bd20740e2e..ed8fe38161e97 100644 --- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py @@ -106,7 +106,7 @@ class 
SimpleConnector(KVConnectorBase): else: # the current vLLM instance is KV consumer, so it needs to connect - # its recv pipe to the send pipe of KV producder + # its recv pipe to the send pipe of KV producer if self.config.kv_connector == "PyNcclConnector": self.consumer_data_pipe = PyNcclPipe( local_rank=local_rank, diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index 0b0ce9828a74d..b1c9c9af6e235 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -44,8 +44,9 @@ class model_aware_kv_ops_helper: head_size = model_config.qk_nope_head_dim + \ model_config.qk_rope_head_dim else: - head_size = getattr(model_config, "head_dim", - int(hidden_size // num_attention_heads)) + head_size = getattr(model_config, "head_dim", None) + if head_size is None: + head_size = int(hidden_size // num_attention_heads) return num_heads, head_size diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index ef4460a592bd6..bc9258e9d07b6 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -210,10 +210,11 @@ class KVConnectorBase_V1(ABC): computed tokens for this request Returns: - * the number of tokens that can be loaded from the - external KV cache beyond what is already computed. - * true if external KV cache tokens will be loaded - asynchronously (between scheduler steps). + A tuple with the following elements: + - The number of tokens that can be loaded from the + external KV cache beyond what is already computed. + - `True` if external KV cache tokens will be loaded + asynchronously (between scheduler steps). 
""" pass diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index cea454a0b5977..0aabb260fd3dc 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -40,7 +40,7 @@ class MultiConnector(KVConnectorBase_V1): def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): super().__init__(vllm_config=vllm_config, role=role) - self._connectors = [] + self._connectors: list[KVConnectorBase_V1] = [] ktcs = vllm_config.kv_transfer_config.kv_connector_extra_config.get( "connectors") assert ktcs is not None diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 9c2e82b29c76c..4d228dbc9d492 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -172,6 +172,11 @@ class NixlConnectorScheduler: self.vllm_config = vllm_config self.block_size = vllm_config.cache_config.block_size self.engine_id = engine_id + self.side_channel_host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST + self.side_channel_port = ( + envs.VLLM_NIXL_SIDE_CHANNEL_PORT + + vllm_config.parallel_config.data_parallel_rank_local * + vllm_config.parallel_config.tensor_parallel_size) logger.info("Initializing NIXL Scheduler %s", engine_id) # Requests that need to start recv. @@ -259,6 +264,15 @@ class NixlConnectorScheduler: # Loop through scheduled reqs and convert to ReqMeta. for req_id, (req, block_ids) in self._reqs_need_recv.items(): assert req.kv_transfer_params is not None + # For the case where there are no remote blocks to pull + # (block_ids is empty), we don't need to schedule + # an async read on the worker side. 
+ if not block_ids: + logger.debug( + "Skipping adding request %s to NixlConnectorMetadata, " + "as there are no remote blocks to pull", req_id) + continue + meta.add_new_req( request_id=req_id, local_block_ids=block_ids, @@ -301,8 +315,8 @@ class NixlConnectorScheduler: do_remote_decode=False, remote_block_ids=computed_block_ids, remote_engine_id=self.engine_id, - remote_host=envs.VLLM_NIXL_SIDE_CHANNEL_HOST, - remote_port=envs.VLLM_NIXL_SIDE_CHANNEL_PORT, + remote_host=self.side_channel_host, + remote_port=self.side_channel_port, ) @@ -321,9 +335,18 @@ class NixlConnectorWorker: # Map of engine_id -> agent_name. self._remote_agents: dict[str, str] = {} + # NIXL handshake port. + # NOTE(rob): Within a DP group, each DP rank gets its own + # base port (which is sent in the KVTransferParams). + # Each TP rank listens/queries on the base_port + tp_rank. + self.side_channel_port = ( + envs.VLLM_NIXL_SIDE_CHANNEL_PORT + + vllm_config.parallel_config.data_parallel_rank_local * + vllm_config.parallel_config.tensor_parallel_size) + # Metadata. self.engine_id = engine_id - self.rank = get_tensor_model_parallel_rank() + self.tp_rank = get_tensor_model_parallel_rank() self.world_size = get_tensor_model_parallel_world_size() self.tp_group = get_tp_group() @@ -373,15 +396,11 @@ class NixlConnectorWorker: @staticmethod def _nixl_handshake_listener(metadata: NixlAgentMetadata, - ready_event: threading.Event, rank: int): + ready_event: threading.Event, base_port: int, + tp_rank: int): """Background thread for getting new NIXL handshakes.""" # NOTE(rob): this is a simple implementation. We will move - # to a better approach like an ETCD server in the future. - - # NOTE(rob): to support heterogeneous TP, we will have to - # move this into the scheduler rather than worker, since - # each rank needs the metadata of all other ranks (whereas - # in this setup, each rank only gets one other rank's meta. + # to a better approach via HTTP endpoint soon. 
encoder = msgspec.msgpack.Encoder() encoded_data = encoder.encode(metadata) @@ -391,11 +410,7 @@ class NixlConnectorWorker: # Listen for new requests for metadata. host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST - # NOTE(rob): we need each rank to have a unique port. This - # hack to keeps us moving. We will switch when moving to etcd - # or where we have a single ZMQ socket in the scheduler. - port = envs.VLLM_NIXL_SIDE_CHANNEL_PORT + rank - path = make_zmq_path("tcp", host, port) + path = make_zmq_path("tcp", host, base_port + tp_rank) logger.debug("Starting listening on path: %s", path) with zmq_ctx(zmq.ROUTER, path) as sock: ready_event.set() @@ -410,10 +425,10 @@ class NixlConnectorWorker: """Do a NIXL handshake with a remote instance.""" start_time = time.perf_counter() - # NOTE(rob): we need each rank to have a unique port. This is - # a hack to keep us moving. We will switch when moving to etcd - # or where we have a single ZMQ socket in the scheduler. - path = make_zmq_path("tcp", host, port + self.rank) + # NOTE(rob): we need each tp_rank to have a unique port. + # This is a hack to keep us moving. We will switch when + # we switch to HTTP-based NIXL metadata exchange. + path = make_zmq_path("tcp", host, port + self.tp_rank) logger.debug("Querying metadata on path: %s", path) with zmq_ctx(zmq.REQ, path) as sock: # Send query for the request. 
@@ -477,7 +492,8 @@ class NixlConnectorWorker: for cache in cache_list: base_addr = cache.data_ptr() region_len = self.num_blocks * self.block_len - caches_data.append((base_addr, region_len, self.rank, "")) + caches_data.append( + (base_addr, region_len, cache.device.index, "")) kv_caches_base_addr.append(base_addr) self.kv_caches_base_addr[self.engine_id] = kv_caches_base_addr self.num_regions = len(caches_data) @@ -520,7 +536,7 @@ class NixlConnectorWorker: ready_event = threading.Event() self._nixl_handshake_listener_t = threading.Thread( target=self._nixl_handshake_listener, - args=(metadata, ready_event, self.rank), + args=(metadata, ready_event, self.side_channel_port, self.tp_rank), daemon=True, name="nixl_handshake_listener") self._nixl_handshake_listener_t.start() @@ -528,6 +544,7 @@ class NixlConnectorWorker: def add_remote_agent(self, nixl_agent_meta: NixlAgentMetadata): engine_id = nixl_agent_meta.engine_id + assert engine_id != self.engine_id, "Conflict engine id found!" if engine_id in self._remote_agents: return @@ -543,9 +560,9 @@ class NixlConnectorWorker: block_offset = block_id * self.block_len # (addr, len, device id) blocks_data.append( - (base_addr + block_offset, self.block_len, self.rank)) - logger.debug("Created %s blocks for src engine %s and rank %s", - len(blocks_data), self.engine_id, self.rank) + (base_addr + block_offset, self.block_len, self.tp_rank)) + logger.debug("Created %s blocks for src engine %s and tp_rank %s", + len(blocks_data), self.engine_id, self.tp_rank) # Register with NIXL. 
descs = self.nixl_wrapper.get_xfer_descs(blocks_data, "VRAM") @@ -560,9 +577,9 @@ class NixlConnectorWorker: block_offset = block_id * self.block_len # (addr, len, device id) blocks_data.append( - (base_addr + block_offset, self.block_len, self.rank)) - logger.debug("Created %s blocks for dst engine %s and rank %s", - len(blocks_data), engine_id, self.rank) + (base_addr + block_offset, self.block_len, self.tp_rank)) + logger.debug("Created %s blocks for dst engine %s and tp_rank %s", + len(blocks_data), engine_id, self.tp_rank) # Register with NIXL. descs = self.nixl_wrapper.get_xfer_descs(blocks_data, "VRAM") @@ -587,14 +604,14 @@ class NixlConnectorWorker: if len(done_sending) > 0 or len(done_recving) > 0: logger.debug( "Rank %s, get_finished: %s requests done sending " - "and %s requests done recving", self.rank, len(done_sending), - len(done_recving)) + "and %s requests done recving", self.tp_rank, + len(done_sending), len(done_recving)) if self.world_size == 1: return done_sending, done_recving # Rank 0: get finished from all other ranks. - if self.rank == 0: + if self.tp_rank == 0: for req_id in done_sending: self._done_sending_count[req_id] += 1 for req_id in done_recving: diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py index fcc38d7fbd125..761c56f7e41f5 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py @@ -118,11 +118,11 @@ class PyNcclPipe(KVPipeBase): """ Create the metadata as a dictionary based on the input tensor. - Parameters: - - tensor: The input tensor or None if no tensor is provided. + Args: + tensor: The input tensor or None if no tensor is provided. Returns: - - metadata: A dictionary with the following keys: + metadata: A dictionary with the following keys: - "dtype": The data type of the tensor or None. - "shape": The shape of the tensor or None. 
""" @@ -135,13 +135,13 @@ class PyNcclPipe(KVPipeBase): """ Create a buffer to receive the tensor based on the provided metadata. - Parameters: - - metadata: A dictionary with keys "dtype" and "shape", describing - the tensor's data type and shape. + Args: + metadata: A dictionary with keys "dtype" and "shape", + describing the tensor's data type and shape. Returns: - - buffer: A tensor of the specified type and shape, allocated on - self.device. + buffer: A tensor of the specified type and shape, + allocated on `self.device`. """ return torch.empty(metadata["shape"], dtype=metadata["dtype"], @@ -151,8 +151,8 @@ class PyNcclPipe(KVPipeBase): """ Send the metadata dictionary to the target rank. - Parameters: - - metadata: A dictionary with keys "dtype" and "shape". + Args: + metadata: A dictionary with keys "dtype" and "shape". """ self.group.send_obj(metadata, self.target_rank_for_send) @@ -161,8 +161,8 @@ class PyNcclPipe(KVPipeBase): Receive the metadata dictionary from the target rank. Returns: - - metadata: A dictionary with keys "dtype" and "shape" describing - the tensor. + metadata: A dictionary with keys "dtype" and "shape" + describing the tensor. """ return self.group.recv_obj(self.target_rank_for_recv) @@ -171,9 +171,9 @@ class PyNcclPipe(KVPipeBase): The actual implementation of sending the tensor and its metadata to the target rank. - Parameters: - - tensor: The input tensor to be sent, or None if no tensor is - being sent. + Args: + tensor: The input tensor to be sent, or `None` if no tensor is + being sent. """ metadata = self._make_metadata(tensor) self._send_metadata(metadata) @@ -187,7 +187,7 @@ class PyNcclPipe(KVPipeBase): the target rank. Returns: - - buffer: The received tensor, or None if no tensor is received. + buffer: The received tensor, or `None` if no tensor is received. 
""" metadata = self._recv_metadata() if metadata["dtype"] is None: @@ -227,8 +227,8 @@ class PyNcclPipe(KVPipeBase): Sends a tensor and its metadata to the destination rank in a non-blocking way. - Parameters: - - tensor: The tensor to send, or None if no tensor is being sent. + Args: + tensor: The tensor to send, or `None` if no tensor is being sent. """ if self.transport_thread is None: self.transport_thread = ThreadPoolExecutor(max_workers=1) @@ -250,8 +250,8 @@ class PyNcclPipe(KVPipeBase): """ Receives a tensor and its metadata from the source rank. Blocking call. - Returns: - - tensor: The received tensor, or None if no tensor is received. + Returns: + tensor: The received tensor, or `None` if no tensor is received. """ if self.transport_thread is None: self.transport_thread = ThreadPoolExecutor(max_workers=1) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 51c519d8f8623..6e48c02da6692 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -23,7 +23,6 @@ If you only need to use the distributed environment without model/pipeline """ import contextlib import gc -import importlib.util import pickle import weakref from collections import namedtuple @@ -42,8 +41,8 @@ from vllm.distributed.device_communicators.base_device_communicator import ( DeviceCommunicatorBase) from vllm.distributed.utils import StatelessProcessGroup from vllm.logger import init_logger -from vllm.utils import (direct_register_custom_op, resolve_obj_by_qualname, - run_once, supports_custom_op) +from vllm.utils import (direct_register_custom_op, get_distributed_init_method, + resolve_obj_by_qualname, supports_custom_op) @dataclass @@ -120,7 +119,7 @@ def reduce_scatter(tensor: torch.Tensor, dim: int, world_size: int, group = _groups[group_name]() if group is None: raise ValueError(f"Group {group_name} is destroyed.") - return group.reduce_scatter(tensor, dim) + return group._reduce_scatter_out_place(tensor, dim) def
reduce_scatter_fake(tensor: torch.Tensor, dim: int, world_size: int, @@ -136,7 +135,7 @@ def all_gather(tensor: torch.Tensor, dim: int, world_size: int, group = _groups[group_name]() if group is None: raise ValueError(f"Group {group_name} is destroyed.") - return group.all_gather(tensor, dim) + return group._all_gather_out_place(tensor, dim) def all_gather_fake(tensor: torch.Tensor, dim: int, world_size: int, @@ -161,6 +160,7 @@ if supports_custom_op(): op_func=reduce_scatter, mutates_args=[], fake_impl=reduce_scatter_fake, + dispatch_key=current_platform.dispatch_key, ) direct_register_custom_op( @@ -168,6 +168,7 @@ if supports_custom_op(): op_func=all_gather, mutates_args=[], fake_impl=all_gather_fake, + dispatch_key=current_platform.dispatch_key, ) @@ -367,6 +368,16 @@ class GroupCoordinator: assert -input_.dim() <= dim < input_.dim(), ( f"Invalid dim ({dim}) for input tensor with shape {input_.size()}") + if self.use_custom_op_call: + return torch.ops.vllm.all_gather(input_, + dim, + world_size, + group_name=self.unique_name) + else: + return self._all_gather_out_place(input_, dim) + + def _all_gather_out_place(self, input_: torch.Tensor, + dim: int) -> torch.Tensor: return self.device_communicator.all_gather(input_, dim) def reduce_scatter(self, @@ -379,6 +390,16 @@ class GroupCoordinator: assert -input_.dim() <= dim < input_.dim(), ( f"Invalid dim ({dim}) for input tensor with shape {input_.size()}") + if self.use_custom_op_call: + return torch.ops.vllm.reduce_scatter(input_, + dim, + world_size, + group_name=self.unique_name) + else: + return self._reduce_scatter_out_place(input_, dim) + + def _reduce_scatter_out_place(self, input_: torch.Tensor, + dim: int) -> torch.Tensor: return self.device_communicator.reduce_scatter(input_, dim) def gather(self, @@ -769,10 +790,14 @@ class GroupCoordinator: if self.device_communicator is not None: return self.device_communicator.dispatch(hidden_states, router_logits) + else: + return hidden_states, router_logits def 
combine(self, hidden_states) -> torch.Tensor: if self.device_communicator is not None: return self.device_communicator.combine(hidden_states) + else: + return hidden_states _WORLD: Optional[GroupCoordinator] = None @@ -904,7 +929,7 @@ def init_distributed_environment( world_size = parallel_config.world_size_across_dp ip = parallel_config.data_parallel_master_ip port = parallel_config.get_next_dp_init_port() - distributed_init_method = f"tcp://{ip}:{port}" # noqa + distributed_init_method = get_distributed_init_method(ip, port) logger.info( "Adjusting world_size=%d rank=%d distributed_init_method=%s for DP", world_size, rank, distributed_init_method) @@ -937,49 +962,9 @@ def init_distributed_environment( "world group already initialized with a different world size") -PPLX_DID_INIT: bool = False - - -@run_once -def pplx_init(rank, world_size): - has_pplx = importlib.util.find_spec("pplx_kernels") is not None - - if has_pplx and world_size > 1: - from pplx_kernels.nvshmem import (nvshmem_alloc_empty_unique_id, - nvshmem_get_unique_id, nvshmem_init) - try: - global PPLX_DID_INIT - logger.debug( - "Initialize NVSHMEM for PPLX kernels: rank=%d, " - "world size=%d", rank, world_size) - uid = nvshmem_get_unique_id( - ) if rank == 0 else nvshmem_alloc_empty_unique_id() - uid_gpu = uid.cuda() - get_world_group().broadcast(uid_gpu, src=0) - uid = uid_gpu.to(device='cpu') - logger.debug("PPLX NVSHMEM UID = %s", uid) - nvshmem_init(uid, rank, world_size) - PPLX_DID_INIT = True - except Exception as ex: - logger.error("Failed to initialize NVSHMEM for PPLX: %s", ex) - - -@run_once -def pplx_finalize(): - global PPLX_DID_INIT - if PPLX_DID_INIT: - from pplx_kernels.nvshmem import nvshmem_finalize - logger.debug("PPLX NVSHMEM finalize") - from vllm.model_executor.layers.fused_moe.layer import ( - _all_to_all_cache) - _all_to_all_cache.destroy() - nvshmem_finalize() - - def initialize_model_parallel( tensor_model_parallel_size: int = 1, pipeline_model_parallel_size: int = 1, - 
enable_expert_parallel: bool = False, backend: Optional[str] = None, ) -> None: """ @@ -1082,14 +1067,10 @@ def initialize_model_parallel( _DP.rank_in_group, _PP.rank_in_group, _TP.rank_in_group, _EP.rank_in_group) - if enable_expert_parallel: - pplx_init(rank, world_size) - def ensure_model_parallel_initialized( tensor_model_parallel_size: int, pipeline_model_parallel_size: int, - enable_expert_parallel: bool = False, backend: Optional[str] = None, ) -> None: """Helper to initialize model parallel groups if they are not initialized, @@ -1100,8 +1081,7 @@ def ensure_model_parallel_initialized( get_world_group().device_group) if not model_parallel_is_initialized(): initialize_model_parallel(tensor_model_parallel_size, - pipeline_model_parallel_size, - enable_expert_parallel, backend) + pipeline_model_parallel_size, backend) return assert ( @@ -1180,8 +1160,6 @@ def destroy_model_parallel(): """Set the groups to none and destroy them.""" global _TP - pplx_finalize() - if _TP: _TP.destroy() _TP = None @@ -1221,8 +1199,9 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False): ray.shutdown() gc.collect() from vllm.platforms import current_platform - if not current_platform.is_cpu(): - torch.cuda.empty_cache() + empty_cache = current_platform.empty_cache + if empty_cache is not None: + empty_cache() try: torch._C._host_emptyCache() except AttributeError: diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index 6bb323d79d64e..96d08dc1a3c18 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -5,20 +5,22 @@ # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
import dataclasses -import datetime +import os import pickle import socket +import sys import time +import uuid from collections import deque from collections.abc import Sequence +from datetime import timedelta from typing import Any, Optional import torch from torch.distributed import ProcessGroup, TCPStore from torch.distributed.distributed_c10d import (Backend, PrefixStore, _get_default_timeout, - _unregister_process_group, - is_nccl_available) + _unregister_process_group) from torch.distributed.rendezvous import rendezvous import vllm.envs as envs @@ -27,6 +29,20 @@ from vllm.utils import get_tcp_uri, is_torch_equal_or_newer logger = init_logger(__name__) +# We prefer to use os.sched_yield as it results in tighter polling loops, +# measured to be around 3e-7 seconds. However on earlier versions of Python +# os.sched_yield() does not release the GIL, so we fall back to time.sleep(0) +USE_SCHED_YIELD = ((sys.version_info[:3] >= (3, 11, 1)) + or (sys.version_info[:2] == (3, 10) + and sys.version_info[2] >= 8)) + + +def sched_yield(): + if USE_SCHED_YIELD: + os.sched_yield() + else: + time.sleep(0) + def ensure_divisibility(numerator, denominator): """Ensure that numerator is divisible by the denominator.""" @@ -212,10 +228,141 @@ class StatelessProcessGroup: gathered_objs.append(recv_obj) return gathered_objs - def barrier(self): - """A barrier to synchronize all ranks.""" + def barrier(self, timeout: float = 30.0): + """A robust barrier to synchronize all ranks. + + + Uses a multi-phase approach to ensure all processes reach the barrier + before proceeding: + + 1. Each process signals it has reached the barrier + + 2. Each process signals that it has confirmed the arrival of all other + ranks. + + 3. Rank 0 waits for all other ranks to signal their departure to ensure + that all ranks have departed the barrier first. 
+ + Args: + timeout: Maximum time in seconds to wait for each phase (in seconds) + + + Raises: + RuntimeError: If coordination fails or times out + """ + # Generate a barrier ID that is globally unique + try: + if self.rank == 0: + barrier_id = f"barrier_{uuid.uuid4()}" + self.broadcast_obj(barrier_id, src=0) + else: + barrier_id = self.broadcast_obj(None, src=0) + except Exception as e: + raise RuntimeError("Failed to broadcast barrier_id") from e + + # Phase 1: Signal arrival at barrier + # Wait for all processes to arrive + # We need all ranks to confirm the arrival of all other ranks. + # This is the key synchronization point. + arrival_key = f"arrival_{barrier_id}_{self.rank}" + try: + self.store.set(arrival_key, b"1") + except Exception as e: + raise RuntimeError("Failed to signal barrier arrival") from e + + start_time = time.time() + processes_arrived: set[int] = set() + + while len(processes_arrived) < self.world_size: + # Check for timeout + cur_time = time.time() + if cur_time - start_time > timeout: + raise RuntimeError("Barrier timed out after %f seconds", + timeout) + + # Check for each process + for i in range(self.world_size): + if i in processes_arrived: + continue + + key = f"arrival_{barrier_id}_{i}" + try: + # Try to get the key - if it exists, we'll get a value + # If it doesn't exist, it will throw an exception + self.store.get(key) + processes_arrived.add(i) + except KeyError: + # Key doesn't exist yet + pass + except Exception as check_e: + logger.debug("Error checking key existence: %s", check_e) + sched_yield() + + # Short sleep to avoid tight polling + if len(processes_arrived) < self.world_size: + sched_yield() + + # Phase 2: Signal departure from barrier + # We only care to block at this stage in rank 0, which runs the + # server side of the TCPStore. We want to make sure that all + # clients have departed the barrier before rank 0 in case the + # next thing after the barrier is a shutdown, including tearing + # down the TCPStore. 
Other ranks can exit the barrier immediately + # after signaling their departure. + departure_key = f"departure_{barrier_id}_{self.rank}" + try: + self.store.set(departure_key, b"1") + except Exception as e: + raise RuntimeError("Failed to signal barrier departure") from e + + if self.rank != 0: + return + + # Make rank 0 wait for all processes to signal departure + start_time = time.time() + processes_departed: set[int] = set() + + while len(processes_departed) < self.world_size: + # Check for timeout + if time.time() - start_time > timeout: + raise RuntimeError("Barrier departure timed out after %f s", + timeout) + + # Check for each process + for i in range(self.world_size): + if i in processes_departed: + continue + + key = f"departure_{barrier_id}_{i}" + try: + # Try to get the key - if it exists, we'll get a value + # If it doesn't exist, it will throw an exception + self.store.get(key) + processes_departed.add(i) + except KeyError: + # Key doesn't exist yet + pass + except Exception as check_e: + logger.debug("Error checking key existence: %s", check_e) + sched_yield() + + # Short sleep to avoid tight polling + if len(processes_departed) < self.world_size: + sched_yield() + + # Clean up keys to avoid leaking memory in the store for i in range(self.world_size): - self.broadcast_obj(None, src=i) + try: + self.store.delete_key(f"arrival_{barrier_id}_{i}") + except Exception: + logger.debug("Error deleting key: %s", + f'arrival_{barrier_id}_{i}') + + try: + self.store.delete_key(f"departure_{barrier_id}_{i}") + except Exception: + logger.debug("Error deleting key: %s", + f'departure_{barrier_id}_{i}') @staticmethod def create( @@ -258,7 +405,7 @@ class StatelessProcessGroup: port=port, world_size=world_size, is_master=launch_server, - timeout=datetime.timedelta(seconds=store_timeout), + timeout=timedelta(seconds=store_timeout), use_libuv=False, # for now: github.com/pytorch/pytorch/pull/150215 master_listen_fd=listen_fd, ) @@ -271,6 +418,43 @@ class 
StatelessProcessGroup: data_expiration_seconds=data_expiration_seconds) +def init_gloo_process_group(backend: Backend, prefix_store: PrefixStore, + group_rank: int, group_size: int, + timeout: timedelta) -> ProcessGroup: + """ + Stateless init ProcessGroup with gloo backend compatible with + different torch versions. + """ + if is_torch_equal_or_newer("2.6"): + pg = ProcessGroup( + prefix_store, + group_rank, + group_size, + ) + else: + options = ProcessGroup.Options(backend=backend) + pg = ProcessGroup( + prefix_store, + group_rank, + group_size, + options, + ) + from torch.distributed.distributed_c10d import ProcessGroupGloo + backend_class = ProcessGroupGloo(prefix_store, + group_rank, + group_size, + timeout=timeout) + backend_type = ProcessGroup.BackendType.GLOO + device = torch.device("cpu") + if is_torch_equal_or_newer("2.6"): + # _set_default_backend is supported in torch >= 2.6 + pg._set_default_backend(backend_type) + backend_class._set_sequence_number_for_group() + + pg._register_backend(device, backend_type, backend_class) + return pg + + def stateless_init_torch_distributed_process_group( host: str, port: int, rank: int, world_size: int, backend: str) -> ProcessGroup: @@ -320,40 +504,19 @@ def stateless_init_torch_distributed_process_group( # different systems (e.g. RPC) in case the store is multi-tenant. 
prefix_store = PrefixStore(init_method, store) - pg: ProcessGroup = ProcessGroup( - prefix_store, - group_rank, - group_size, - ) - if backend == "gloo": - from torch.distributed.distributed_c10d import ProcessGroupGloo - backend_class = ProcessGroupGloo(prefix_store, - group_rank, - group_size, - timeout=timeout) - backend_type = ProcessGroup.BackendType.GLOO - device = torch.device("cpu") - elif backend == "nccl": - assert is_nccl_available() - from torch.distributed.distributed_c10d import ProcessGroupNCCL - - backend_options = ProcessGroupNCCL.Options() - backend_options._timeout = timeout - - backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size, - backend_options) - backend_type = ProcessGroup.BackendType.NCCL - device = torch.device("cuda") - else: - raise RuntimeError(f"Unsupported torch distributed backend: {backend}") - - pg._set_default_backend(backend_type) - backend_class._set_sequence_number_for_group() - - pg._register_backend(device, backend_type, backend_class) - - return pg + return init_gloo_process_group(backend=backend, + prefix_store=prefix_store, + group_rank=group_rank, + group_size=group_size, + timeout=timeout) + from vllm.platforms import current_platform + return current_platform.stateless_init_device_torch_dist_pg( + backend=backend, + prefix_store=prefix_store, + group_rank=group_rank, + group_size=group_size, + timeout=timeout) def stateless_destroy_torch_distributed_process_group( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 1c6df960ebe36..492c44f42e7a8 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -4,7 +4,6 @@ import argparse import dataclasses import json -import re import sys import threading import warnings @@ -13,7 +12,9 @@ from itertools import permutations from typing import (Annotated, Any, Callable, Dict, List, Literal, Optional, Type, TypeVar, Union, cast, get_args, get_origin) +import regex as re import torch +from pydantic import SkipValidation, 
TypeAdapter, ValidationError from typing_extensions import TypeIs, deprecated import vllm.envs as envs @@ -38,7 +39,7 @@ from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 from vllm.transformers_utils.utils import check_gguf_file from vllm.usage.usage_lib import UsageContext from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser, - GiB_bytes, is_in_doc_build, is_in_ray_actor) + GiB_bytes, is_in_ray_actor) # yapf: enable @@ -156,7 +157,8 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: # Get the set of possible types for the field type_hints: set[TypeHint] = set() if get_origin(field.type) in {Union, Annotated}: - type_hints.update(get_args(field.type)) + predicate = lambda arg: not isinstance(arg, SkipValidation) + type_hints.update(filter(predicate, get_args(field.type))) else: type_hints.add(field.type) @@ -168,10 +170,7 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: if field.default is not MISSING: default = field.default elif field.default_factory is not MISSING: - if is_dataclass(field.default_factory) and is_in_doc_build(): - default = {} - else: - default = field.default_factory() + default = field.default_factory() # Get the help text for the field name = field.name @@ -189,12 +188,16 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: - `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`\n - `--json-arg.key1 value1 --json-arg.key2.key3 value2`\n\n""" if dataclass_cls is not None: - dataclass_init = lambda x, f=dataclass_cls: f(**json.loads(x)) - # Special case for configs with a from_cli method - if hasattr(dataclass_cls, "from_cli"): - from_cli = dataclass_cls.from_cli - dataclass_init = lambda x, f=from_cli: f(x) - kwargs[name]["type"] = dataclass_init + + def parse_dataclass(val: str, cls=dataclass_cls) -> Any: + try: + if hasattr(cls, "from_cli"): + return cls.from_cli(val) + return TypeAdapter(cls).validate_json(val) + except ValidationError as e: + raise argparse.ArgumentTypeError(repr(e)) 
from e + + kwargs[name]["type"] = parse_dataclass kwargs[name]["help"] += json_tip elif contains_type(type_hints, bool): # Creates --no-<name> and --<name> flags @@ -221,16 +224,15 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: elif contains_type(type_hints, int): kwargs[name]["type"] = int # Special case for large integers - if name in {"max_model_len"}: + if name in {"max_model_len", "max_num_batched_tokens"}: kwargs[name]["type"] = human_readable_int elif contains_type(type_hints, float): kwargs[name]["type"] = float - elif contains_type(type_hints, - dict) and (contains_type(type_hints, str) or any( - is_not_builtin(th) for th in type_hints)): + elif (contains_type(type_hints, dict) + and (contains_type(type_hints, str) + or any(is_not_builtin(th) for th in type_hints))): kwargs[name]["type"] = union_dict_and_str elif contains_type(type_hints, dict): - # Dict arguments will always be optional kwargs[name]["type"] = parse_type(json.loads) kwargs[name]["help"] += json_tip elif (contains_type(type_hints, str) @@ -318,8 +320,7 @@ class EngineArgs: rope_scaling: dict[str, Any] = get_field(ModelConfig, "rope_scaling") rope_theta: Optional[float] = ModelConfig.rope_theta hf_token: Optional[Union[bool, str]] = ModelConfig.hf_token - hf_overrides: Optional[HfOverrides] = \ - get_field(ModelConfig, "hf_overrides") + hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides") tokenizer_revision: Optional[str] = ModelConfig.tokenizer_revision quantization: Optional[QuantizationMethods] = ModelConfig.quantization enforce_eager: bool = ModelConfig.enforce_eager @@ -399,7 +400,8 @@ class EngineArgs: get_field(ModelConfig, "override_neuron_config") override_pooler_config: Optional[Union[dict, PoolerConfig]] = \ ModelConfig.override_pooler_config - compilation_config: Optional[CompilationConfig] = None + compilation_config: CompilationConfig = \ + get_field(VllmConfig, "compilation_config") worker_cls: str = ParallelConfig.worker_cls worker_extension_cls: str = 
ParallelConfig.worker_extension_cls @@ -414,13 +416,17 @@ class EngineArgs: calculate_kv_scales: bool = CacheConfig.calculate_kv_scales - additional_config: Optional[Dict[str, Any]] = None + additional_config: dict[str, Any] = \ + get_field(VllmConfig, "additional_config") enable_reasoning: Optional[bool] = None # DEPRECATED reasoning_parser: str = DecodingConfig.reasoning_backend use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load pt_load_map_location: str = LoadConfig.pt_load_map_location + enable_multimodal_encoder_data_parallel: bool = \ + ParallelConfig.enable_multimodal_encoder_data_parallel + def __post_init__(self): # support `EngineArgs(compilation_config={...})` # without having to manually construct a @@ -578,7 +584,7 @@ class EngineArgs: action=argparse.BooleanOptionalAction, deprecated=True, help="[DEPRECATED] The `--enable-reasoning` flag is deprecated as " - "of v0.8.6. Use `--reasoning-parser` to specify the reasoning " + "of v0.9.0. Use `--reasoning-parser` to specify the reasoning " "parser backend instead. This flag (`--enable-reasoning`) will be " "removed in v0.10.0. 
When `--reasoning-parser` is specified, " "reasoning mode is automatically enabled.") @@ -637,6 +643,9 @@ class EngineArgs: **parallel_kwargs["worker_cls"]) parallel_group.add_argument("--worker-extension-cls", **parallel_kwargs["worker_extension_cls"]) + parallel_group.add_argument( + "--enable-multimodal-encoder-data-parallel", + **parallel_kwargs["enable_multimodal_encoder_data_parallel"]) # KV cache arguments cache_kwargs = get_kwargs(CacheConfig) @@ -740,7 +749,9 @@ class EngineArgs: title="DeviceConfig", description=DeviceConfig.__doc__, ) - device_group.add_argument("--device", **device_kwargs["device"]) + device_group.add_argument("--device", + **device_kwargs["device"], + deprecated=True) # Speculative arguments speculative_group = parser.add_argument_group( @@ -980,7 +991,7 @@ class EngineArgs: from vllm.platforms import current_platform current_platform.pre_register_and_update() - device_config = DeviceConfig(device=self.device) + device_config = DeviceConfig(device=current_platform.device_type) model_config = self.create_model_config() # * If VLLM_USE_V1 is unset, we enable V1 for "supported features" @@ -1077,6 +1088,8 @@ class EngineArgs: distributed_executor_backend=self.distributed_executor_backend, worker_cls=self.worker_cls, worker_extension_cls=self.worker_extension_cls, + enable_multimodal_encoder_data_parallel=self. + enable_multimodal_encoder_data_parallel, ) speculative_config = self.create_speculative_config( @@ -1086,7 +1099,7 @@ class EngineArgs: disable_log_stats=self.disable_log_stats, ) - # Reminder: Please update docs/source/features/compatibility_matrix.md + # Reminder: Please update docs/features/compatibility_matrix.md # If the feature combo become valid if self.num_scheduler_steps > 1: if speculative_config is not None: @@ -1197,8 +1210,7 @@ class EngineArgs: ############################################################# # Unsupported Feature Flags on V1. 
- if (self.load_format == LoadFormat.TENSORIZER.value - or self.load_format == LoadFormat.SHARDED_STATE.value): + if self.load_format == LoadFormat.SHARDED_STATE.value: _raise_or_fallback( feature_name=f"--load_format {self.load_format}", recommend_to_remove=False) @@ -1294,14 +1306,6 @@ class EngineArgs: recommend_to_remove=False) return False - # Some quantization is not compatible with torch.compile. - V1_UNSUPPORTED_QUANT = ["gguf"] - if model_config.quantization in V1_UNSUPPORTED_QUANT: - _raise_or_fallback( - feature_name=f"--quantization {model_config.quantization}", - recommend_to_remove=False) - return False - # No Embedding Models so far. if model_config.task not in ["generate"]: _raise_or_fallback(feature_name=f"--task {model_config.task}", @@ -1341,7 +1345,7 @@ class EngineArgs: is_ngram_enabled = True elif speculative_method == "medusa": is_medusa_enabled = True - elif speculative_method in ("eagle", "eagle3"): + elif speculative_method in ("eagle", "eagle3", "deepseek_mtp"): is_eagle_enabled = True else: speculative_model = self.speculative_config.get("model") @@ -1388,7 +1392,8 @@ class EngineArgs: if (self.pipeline_parallel_size > 1 and self.distributed_executor_backend - not in ("ray", "mp", "external_launcher")): + not in (ParallelConfig.distributed_executor_backend, "ray", + "mp", "external_launcher")): name = "Pipeline Parallelism without Ray distributed executor " \ "or multiprocessing executor or external launcher" _raise_or_fallback(feature_name=name, recommend_to_remove=False) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 56b9e49d24d97..19b219b674f38 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -475,7 +475,8 @@ class _AsyncLLMEngine(LLMEngine): *, inputs: Optional[PromptType] = None, # DEPRECATED ) -> None: - """Async version of {meth}`add_request`.""" + """Async version of + [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request].""" if inputs is not None: 
prompt = inputs assert prompt is not None and params is not None @@ -582,20 +583,21 @@ async def build_guided_decoding_logits_processor_async( class AsyncLLMEngine(EngineClient): - """An asynchronous wrapper for {class}`LLMEngine`. + """An asynchronous wrapper for [`LLMEngine`][vllm.LLMEngine]. - This class is used to wrap the {class}`LLMEngine` class to make it - asynchronous. It uses asyncio to create a background loop that keeps - processing incoming requests. The {class}`LLMEngine` is kicked by the - generate method when there are requests in the waiting queue. The generate - method yields the outputs from the {class}`LLMEngine` to the caller. + This class is used to wrap the [`LLMEngine`][vllm.LLMEngine] class to + make it asynchronous. It uses asyncio to create a background loop that keeps + processing incoming requests. The [`LLMEngine`][vllm.LLMEngine] is kicked + by the generate method when there are requests in the waiting queue. The + generate method yields the outputs from the [`LLMEngine`][vllm.LLMEngine] + to the caller. Args: log_requests: Whether to log the requests. start_engine_loop: If True, the background task to run the engine will be automatically started in the generate call. - *args: Arguments for {class}`LLMEngine`. - **kwargs: Arguments for {class}`LLMEngine`. + *args: Arguments for [`LLMEngine`][vllm.LLMEngine]. + **kwargs: Arguments for [`LLMEngine`][vllm.LLMEngine]. """ _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine @@ -985,8 +987,9 @@ class AsyncLLMEngine(EngineClient): from the LLMEngine to the caller. Args: - prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` - for more details about the format of each input. + prompt: The prompt to the LLM. See + [`PromptType`][vllm.inputs.PromptType] for more details about + the format of each input. sampling_params: The sampling parameters of the request. request_id: The unique id of the request. lora_request: LoRA request to use for generation, if any. 
@@ -1003,7 +1006,7 @@ class AsyncLLMEngine(EngineClient): Details: - If the engine is not running, start the background loop, which iteratively invokes - {meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step` + [`engine_step`][vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step] to process the waiting requests. - Add the request to the engine's `RequestTracker`. On the next background loop, this request will be sent to @@ -1075,8 +1078,9 @@ class AsyncLLMEngine(EngineClient): from the LLMEngine to the caller. Args: - prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` - for more details about the format of each input. + prompt: The prompt to the LLM. See + [`PromptType`][vllm.inputs.PromptType] for more details about + the format of each input. pooling_params: The pooling parameters of the request. request_id: The unique id of the request. lora_request: LoRA request to use for generation, if any. @@ -1089,15 +1093,15 @@ class AsyncLLMEngine(EngineClient): for the request. Details: - - If the engine is not running, start the background loop, - which iteratively invokes - {meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step` - to process the waiting requests. - - Add the request to the engine's `RequestTracker`. - On the next background loop, this request will be sent to - the underlying engine. - Also, a corresponding `AsyncStream` will be created. - - Wait for the request outputs from `AsyncStream` and yield them. + - If the engine is not running, start the background loop, + which iteratively invokes + [`vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`][] + to process the waiting requests. + - Add the request to the engine's `RequestTracker`. + On the next background loop, this request will be sent to + the underlying engine. + Also, a corresponding `AsyncStream` will be created. + - Wait for the request outputs from `AsyncStream` and yield them. 
Example: ``` diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py index aa54c0693941f..94674262bcfe3 100644 --- a/vllm/engine/async_timeout.py +++ b/vllm/engine/async_timeout.py @@ -8,7 +8,6 @@ import asyncio import enum import sys -import warnings from types import TracebackType from typing import Any, Optional, Type @@ -66,24 +65,6 @@ else: else: self.update(deadline) - def __enter__(self) -> "Timeout": - warnings.warn( - "with timeout() is deprecated, use async with timeout()", - DeprecationWarning, - stacklevel=2, - ) - self._do_enter() - return self - - def __exit__( - self, - exc_type: Optional[Type[BaseException]], - exc_val: Optional[BaseException], - exc_tb: Optional[TracebackType], - ) -> Optional[bool]: - self._do_exit(exc_type) - return None - async def __aenter__(self) -> "Timeout": self._do_enter() return self diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 2a27afe9757e1..a9600a2c8aa3d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -130,26 +130,16 @@ class LLMEngine: iteration-level scheduling and efficient memory management to maximize the serving throughput. - The {class}`~vllm.LLM` class wraps this class for offline batched inference - and the {class}`AsyncLLMEngine` class wraps this class for online serving. + The [`LLM`][vllm.LLM] class wraps this class for offline batched inference + and the [`AsyncLLMEngine`][vllm.engine.async_llm_engine.AsyncLLMEngine] + class wraps this class for online serving. - The config arguments are derived from {class}`~vllm.EngineArgs`. (See - {ref}`engine-args`) + The config arguments are derived from [`EngineArgs`][vllm.EngineArgs]. Args: - model_config: The configuration related to the LLM model. - cache_config: The configuration related to the KV cache memory - management. - parallel_config: The configuration related to distributed execution. - scheduler_config: The configuration related to the request scheduler. 
- device_config: The configuration related to the device. - lora_config (Optional): The configuration related to serving multi-LoRA. - speculative_config (Optional): The configuration related to speculative - decoding. + vllm_config: The configuration for initializing and running vLLM. executor_class: The model executor class for managing distributed execution. - prompt_adapter_config (Optional): The configuration related to serving - prompt adapters. log_stats: Whether to log statistics. usage_context: Specified entry point, used for usage info collection. """ @@ -695,11 +685,12 @@ class LLMEngine: Args: request_id: The unique ID of the request. - prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` + prompt: The prompt to the LLM. See + [PromptType][vllm.inputs.PromptType] for more details about the format of each input. params: Parameters for sampling or pooling. - {class}`~vllm.SamplingParams` for text generation. - {class}`~vllm.PoolingParams` for pooling. + [SamplingParams][vllm.SamplingParams] for text generation. + [PoolingParams][vllm.PoolingParams] for pooling. arrival_time: The arrival time of the request. If None, we use the current monotonic time. lora_request: The LoRA request to add. @@ -711,10 +702,11 @@ class LLMEngine: Details: - Set arrival_time to the current time if it is None. - Set prompt_token_ids to the encoded prompt if it is None. - - Create `n` number of {class}`~vllm.Sequence` objects. - - Create a {class}`~vllm.SequenceGroup` object - from the list of {class}`~vllm.Sequence`. - - Add the {class}`~vllm.SequenceGroup` object to the scheduler. + - Create `n` number of [Sequence][vllm.Sequence] objects. + - Create a [SequenceGroup][vllm.SequenceGroup] object + from the list of [Sequence][vllm.Sequence]. + - Add the [SequenceGroup][vllm.SequenceGroup] object to the + scheduler. Example: >>> # initialize engine @@ -861,9 +853,7 @@ class LLMEngine: request_id: The ID(s) of the request to abort. 
Details: - - Refer to the - {meth}`~vllm.core.scheduler.Scheduler.abort_seq_group` - from class {class}`~vllm.core.scheduler.Scheduler`. + - Refer to [vllm.core.scheduler.Scheduler.abort_seq_group][]. Example: >>> # initialize engine and add a request with request_id @@ -1263,12 +1253,10 @@ class LLMEngine: def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. - :::{figure} https://i.imgur.com/sv2HssD.png - :alt: Overview of the step function - :align: center - - Overview of the step function. - ::: + <figure markdown="span"> + ![Overview of the step function](https://i.imgur.com/sv2HssD.png) + <figcaption>Overview of the step function</figcaption> + </figure> Details: - Step 1: Schedules the sequences to be executed in the next @@ -1662,6 +1650,20 @@ class LLMEngine: gpu_prefix_cache_hit_rate = self.scheduler[ 0].get_prefix_cache_hit_rate(Device.GPU) + # Exchange the uasge and cache hit stats between gpu and cpu when + # running on cpu because the cpu_worker.py intentionally reports the + # number of cpu blocks as gpu blocks in favor of cache management. 
+ if self.device_config.device_type == "cpu": + num_total_gpu, num_total_cpu = num_total_cpu, num_total_gpu + gpu_cache_usage_sys, cpu_cache_usage_sys = ( + cpu_cache_usage_sys, + gpu_cache_usage_sys, + ) + gpu_prefix_cache_hit_rate, cpu_prefix_cache_hit_rate = ( + cpu_prefix_cache_hit_rate, + gpu_prefix_cache_hit_rate, + ) + # Iteration stats num_prompt_tokens_iter = 0 num_generation_tokens_iter = 0 @@ -1678,9 +1680,6 @@ class LLMEngine: time_inference_requests: List[float] = [] time_prefill_requests: List[float] = [] time_decode_requests: List[float] = [] - time_in_queue_requests: List[float] = [] - model_forward_time_requests: List[float] = [] - model_execute_time_requests: List[float] = [] # Metadata num_prompt_tokens_requests: List[int] = [] num_generation_tokens_requests: List[int] = [] @@ -1788,15 +1787,6 @@ class LLMEngine: now - seq_group.metrics.first_token_time) time_inference_requests.append( now - seq_group.metrics.first_scheduled_time) - if seq_group.metrics.time_in_queue is not None: - time_in_queue_requests.append( - seq_group.metrics.time_in_queue) - if seq_group.metrics.model_forward_time is not None: - model_forward_time_requests.append( - seq_group.metrics.model_forward_time) - if seq_group.metrics.model_execute_time is not None: - model_execute_time_requests.append( - seq_group.metrics.model_execute_time * 1000) # Metadata num_prompt_tokens_requests.append( len(seq_group.prompt_token_ids)) @@ -1865,9 +1855,6 @@ class LLMEngine: time_inference_requests=time_inference_requests, time_prefill_requests=time_prefill_requests, time_decode_requests=time_decode_requests, - time_in_queue_requests=time_in_queue_requests, - model_forward_time_requests=model_forward_time_requests, - model_execute_time_requests=model_execute_time_requests, # Metadata num_prompt_tokens_requests=num_prompt_tokens_requests, num_generation_tokens_requests=num_generation_tokens_requests, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 
033551d07c39f..916afe0c8e5f7 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -29,7 +29,7 @@ prometheus_client.disable_created_metrics() # to extract the metrics definitions. -# begin-metrics-definitions +# --8<-- [start:metrics-definitions] class Metrics: """ vLLM uses a multiprocessing-based frontend for the OpenAI server. @@ -80,17 +80,6 @@ class Metrics: multiprocess_mode="livemostrecent", ) - # Deprecated in 0.8 - KV cache offloading is not used in V1 - # Hidden in 0.9, due to be removed in 0.10 - if self.show_hidden_metrics: - self.gauge_scheduler_swapped = self._gauge_cls( - name="vllm:num_requests_swapped", - documentation=( - "Number of requests swapped to CPU. " - "DEPRECATED: KV cache offloading is not used in V1"), - labelnames=labelnames, - multiprocess_mode="sum") - # KV Cache Usage in % self.gauge_gpu_cache_usage = self._gauge_cls( name="vllm:gpu_cache_usage_perc", @@ -98,35 +87,6 @@ class Metrics: labelnames=labelnames, multiprocess_mode="sum") - # Deprecated in 0.8 - KV cache offloading is not used in V1 - # Hidden in 0.9, due to be removed in 0.10 - if self.show_hidden_metrics: - self.gauge_cpu_cache_usage = self._gauge_cls( - name="vllm:cpu_cache_usage_perc", - documentation=( - "CPU KV-cache usage. 1 means 100 percent usage. " - "DEPRECATED: KV cache offloading is not used in V1"), - labelnames=labelnames, - multiprocess_mode="sum") - self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls( - name="vllm:cpu_prefix_cache_hit_rate", - documentation=( - "CPU prefix cache block hit rate. " - "DEPRECATED: KV cache offloading is not used in V1"), - labelnames=labelnames, - multiprocess_mode="sum") - - # Deprecated in 0.8 - replaced by queries+hits counters in V1 - # Hidden in 0.9, due to be removed in 0.10 - if self.show_hidden_metrics: - self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls( - name="vllm:gpu_prefix_cache_hit_rate", - documentation=("GPU prefix cache block hit rate. 
" - "DEPRECATED: use vllm:gpu_prefix_cache_queries " - "and vllm:gpu_prefix_cache_queries in V1"), - labelnames=labelnames, - multiprocess_mode="sum") - # Iteration stats self.counter_num_preemption = self._counter_cls( name="vllm:num_preemptions_total", @@ -200,36 +160,6 @@ class Metrics: "Histogram of time spent in DECODE phase for request.", labelnames=labelnames, buckets=request_latency_buckets) - # Deprecated in 0.8 - duplicates vllm:request_queue_time_seconds: - # Hidden in 0.9, due to be removed in 0.10 - if self.show_hidden_metrics: - self.histogram_time_in_queue_request = self._histogram_cls( - name="vllm:time_in_queue_requests", - documentation= - ("Histogram of time the request spent in the queue in seconds. " - "DEPRECATED: use vllm:request_queue_time_seconds instead."), - labelnames=labelnames, - buckets=request_latency_buckets) - - # Deprecated in 0.8 - use prefill/decode/inference time metrics - # Hidden in 0.9, due to be removed in 0.10 - if self.show_hidden_metrics: - self.histogram_model_forward_time_request = self._histogram_cls( - name="vllm:model_forward_time_milliseconds", - documentation= - ("Histogram of time spent in the model forward pass in ms. " - "DEPRECATED: use prefill/decode/inference time metrics instead" - ), - labelnames=labelnames, - buckets=build_1_2_3_5_8_buckets(3000)) - self.histogram_model_execute_time_request = self._histogram_cls( - name="vllm:model_execute_time_milliseconds", - documentation= - ("Histogram of time spent in the model execute function in ms." 
- "DEPRECATED: use prefill/decode/inference time metrics instead" - ), - labelnames=labelnames, - buckets=build_1_2_3_5_8_buckets(3000)) # Metadata self.histogram_num_prompt_tokens_request = self._histogram_cls( @@ -293,7 +223,7 @@ class Metrics: labelnames=labelnames)) -# end-metrics-definitions +# --8<-- [end:metrics-definitions] def _unregister_vllm_metrics(self) -> None: for collector in list(prometheus_client.REGISTRY._collector_to_names): @@ -580,20 +510,10 @@ class PrometheusStatLogger(StatLoggerBase): # System state data self._log_gauge(self.metrics.gauge_scheduler_running, stats.num_running_sys) - if self.metrics.show_hidden_metrics: - self._log_gauge(self.metrics.gauge_scheduler_swapped, - stats.num_swapped_sys) self._log_gauge(self.metrics.gauge_scheduler_waiting, stats.num_waiting_sys) self._log_gauge(self.metrics.gauge_gpu_cache_usage, stats.gpu_cache_usage_sys) - if self.metrics.show_hidden_metrics: - self._log_gauge(self.metrics.gauge_cpu_cache_usage, - stats.cpu_cache_usage_sys) - self._log_gauge(self.metrics.gauge_cpu_prefix_cache_hit_rate, - stats.cpu_prefix_cache_hit_rate) - self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate, - stats.gpu_prefix_cache_hit_rate) # Including max-lora in metric, in future this property of lora # config maybe extended to be dynamic. 
lora_info = { @@ -631,15 +551,6 @@ class PrometheusStatLogger(StatLoggerBase): stats.time_prefill_requests) self._log_histogram(self.metrics.histogram_decode_time_request, stats.time_decode_requests) - if self.metrics.show_hidden_metrics: - self._log_histogram(self.metrics.histogram_time_in_queue_request, - stats.time_in_queue_requests) - self._log_histogram( - self.metrics.histogram_model_forward_time_request, - stats.model_forward_time_requests) - self._log_histogram( - self.metrics.histogram_model_execute_time_request, - stats.model_execute_time_requests) # Metadata finished_reason_counter = CollectionsCounter( stats.finished_reason_requests) diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index 9e6d5ef29bedb..acc83011d6c8e 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -53,9 +53,6 @@ class Stats: time_inference_requests: List[float] time_prefill_requests: List[float] time_decode_requests: List[float] - time_in_queue_requests: List[float] - model_forward_time_requests: List[float] - model_execute_time_requests: List[float] # Metadata num_prompt_tokens_requests: List[int] num_generation_tokens_requests: List[int] diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index eea89a9a055f1..18b7c187bdffe 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -492,8 +492,9 @@ class MQLLMEngineClient(EngineClient): from the LLMEngine to the caller. Args: - prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` - for more details about the format of each input. + prompt: The prompt to the LLM. See + [`PromptType`][vllm.inputs.PromptType] for more details about + the format of each input. sampling_params: The sampling parameters of the request. request_id: The unique id of the request. lora_request: LoRA request to use for generation, if any. 
@@ -561,8 +562,9 @@ class MQLLMEngineClient(EngineClient): from the LLMEngine to the caller. Args: - prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` - for more details about the format of each input. + prompt: The prompt to the LLM. See + [`PromptType`][vllm.inputs.PromptType] for more details about + the format of each input. pooling_params: The pooling parameters of the request. request_id: The unique id of the request. lora_request: LoRA request to use for generation, if any. diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index ac234d25373dc..434cb49855621 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -42,19 +42,22 @@ HEALTHY_RESPONSE = (pickle.dumps(VLLM_RPC_SUCCESS_STR), ) class MQLLMEngine: - """A multiprocessing wrapper for {class}`LLMEngine`. + """A multiprocessing wrapper for + [`LLMEngine`][vllm.engine.llm_engine.LLMEngine]. - This class is used to wrap the {class}`LLMEngine` class to enable use + This class is used to wrap the + [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] class to enable use in concurrnet manner. It runs a background loop and uses zeromq to receive new requests and stream outputs incrementally via ipc. - The {class}`LLMEngine` generate or encode process is kicked off when a new - RPCProcessRequest is received by the input_socket. + The [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] generate or encode + process is kicked off when a new RPCProcessRequest is received by the + input_socket. The self.engine_loop checks the input_socket for new requests, adds them to the LLMEngine if there are any, calls the internal - {class}`LLMEngine.step()`, and sends the RequestOutputs back over - the output_socket. + [`LLMEngine.step()`][vllm.engine.llm_engine.LLMEngine.step], and sends + the RequestOutputs back over the output_socket. 
If use_async_sockets is set, the logic associated with reading new requests from the socket and sending data to the socket is passed @@ -65,8 +68,8 @@ class MQLLMEngine: ipc_path: Base path for zeromq interprocess messaging use_async_sockets: Whether to make send/recv async with GPU log_requests: Whether to log the requests. - *args: Arguments for {class}`LLMEngine`. - **kwargs: Arguments for {class}`LLMEngine`. + *args: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine]. + **kwargs: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine]. """ def __init__(self, diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 4cfb22c5a7501..110f84a65efc9 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -56,8 +56,11 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor): scheduled computation. Args: - seq_group: the outputs are associated with this {class}`SequenceGroup` - outputs: the {class}`SequenceGroupOutput`s for all scheduler steps + seq_group: the outputs are associated with this + [`SequenceGroup`][vllm.sequence.SequenceGroup] + outputs: the + [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]s + for all scheduler steps """ for output in outputs: # Concatenate single-step prompt logprob processing results. @@ -67,7 +70,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor): @staticmethod @functools.lru_cache def _log_prompt_logprob_unsupported_warning_once(): - # Reminder: Please update docs/source/features/compatibility_matrix.md + # Reminder: Please update docs/features/compatibility_matrix.md # If the feature combo become valid logger.warning( "Prompt logprob is not supported by multi step workers. 
" diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index ea4b71a5b9cd2..e88f119c87426 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -19,17 +19,21 @@ logger = init_logger(__name__) def single_step_process_prompt_logprob( sg_output_proc: SequenceGroupOutputProcessor, seq_group: SequenceGroup, output: CompletionSequenceGroupOutput) -> None: - """Process prompt logprobs associated with the {class}`SequenceGroupOutput` - for a given step. + """Process prompt logprobs associated with the + [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] for a given step. Do nothing if the output has no prompt logprobs. Account for the fact that transformers do not compute first-token logprobs. Args: - sg_output_proc: {class}`SequenceGroupOutputProcessor` instance - seq_group: the output is associated with this {class}`SequenceGroup` - output: the {class}`SequenceGroupOutput` for a single scheduler step + sg_output_proc: + [`SequenceGroupOutputProcessor`][vllm.engine.output_processor.interfaces.SequenceGroupOutputProcessor] + instance + seq_group: the output is associated with this + [`SequenceGroup`][vllm.sequence.SequenceGroup] + output: the [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] + for a single scheduler step """ prompt_logprobs = output.prompt_logprobs @@ -103,8 +107,11 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor): scheduled computation. Args: - seq_group: the output is associated with this {class}`SequenceGroup` - outputs: the {class}`SequenceGroupOutput` for a single scheduler step + seq_group: the output is associated with this + [`SequenceGroup`][vllm.sequence.SequenceGroup] + outputs: the + [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] + for a single scheduler step """ assert len(outputs) == 1, "Single step should only have 1 output." 
output = outputs[0] diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index a837a2d288a9c..28341c2c633e8 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -65,6 +65,7 @@ class EngineClient(ABC): prompt: PromptType, request_id: str, params: BeamSearchParams, + lora_request: Optional[LoRARequest] = None, ) -> AsyncGenerator[RequestOutput, None]: beam_width = params.beam_width @@ -106,27 +107,31 @@ class EngineClient(ABC): cum_logprob=0, logprobs=[], multi_modal_data=multi_modal_data, - mm_processor_kwargs=mm_processor_kwargs) + mm_processor_kwargs=mm_processor_kwargs, + lora_request=lora_request) ] completed = [] for _ in range(max_tokens): - prompts_batch = [ + prompts_batch, lora_req_batch = zip(*[( TokensPrompt(prompt_token_ids=beam.tokens, multi_modal_data=beam.multi_modal_data, - mm_processor_kwargs=beam.mm_processor_kwargs) - for beam in all_beams - ] + mm_processor_kwargs=beam.mm_processor_kwargs), + beam.lora_request, + ) for beam in all_beams]) tasks = [] request_id = f"beam_search-{random_uuid()}" - for i, individual_prompt in enumerate(prompts_batch): + for i, (individual_prompt, + lora_req) in enumerate(zip(prompts_batch, lora_req_batch)): request_id_item = f"{request_id}-{i}" task = asyncio.create_task( collect_from_async_generator( - self.generate(individual_prompt, beam_search_params, - request_id_item))) + self.generate(individual_prompt, + beam_search_params, + request_id_item, + lora_request=lora_req))) tasks.append(task) output = await asyncio.gather(*tasks) @@ -159,6 +164,7 @@ class EngineClient(ABC): tokens=current_beam.tokens + [token_id], logprobs=current_beam.logprobs + [logprobs], + lora_request=current_beam.lora_request, cum_logprob=current_beam.cum_logprob + logprob_obj.logprob, multi_modal_data=current_beam. 
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index e8d10017a1e9f..b051cd3338a4c 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -556,6 +556,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): return "(<audio>./</audio>)" raise TypeError(f"Unknown model type: {model_type}") elif modality == "video": + if model_type == "internvl_chat": + return "<video>" if model_type in ("qwen2_vl", "qwen2_5_vl"): return "<|vision_start|><|video_pad|><|vision_end|>" if model_type == "qwen2_5_omni": @@ -1250,7 +1252,7 @@ def apply_hf_chat_template( # investigation. logger.exception( "An error occurred in `transformers` while applying chat template") - raise ValueError from e + raise ValueError(str(e)) from e def apply_mistral_chat_template( tokenizer: MistralTokenizer, @@ -1279,7 +1281,7 @@ def apply_mistral_chat_template( # We convert those assertion errors to ValueErrors so they can be # are properly caught in the preprocessing_input step except (AssertionError, MistralCommonException) as e: - raise ValueError from e + raise ValueError(str(e)) from e # External library exceptions can sometimes occur despite the framework's # internal exception management capabilities. 
@@ -1290,7 +1292,7 @@ def apply_mistral_chat_template( logger.exception( "An error occurred in `mistral_common` while applying chat " "template") - raise ValueError from e + raise ValueError(str(e)) from e def random_tool_call_id() -> str: return f"chatcmpl-tool-{random_uuid()}" diff --git a/vllm/entrypoints/cli/main.py b/vllm/entrypoints/cli/main.py index b7c1afce71181..5eba72fec13cc 100644 --- a/vllm/entrypoints/cli/main.py +++ b/vllm/entrypoints/cli/main.py @@ -7,9 +7,10 @@ import sys import vllm.entrypoints.cli.benchmark.main import vllm.entrypoints.cli.collect_env import vllm.entrypoints.cli.openai +import vllm.entrypoints.cli.run_batch import vllm.entrypoints.cli.serve import vllm.version -from vllm.entrypoints.utils import cli_env_setup +from vllm.entrypoints.utils import VLLM_SERVE_PARSER_EPILOG, cli_env_setup from vllm.utils import FlexibleArgumentParser CMD_MODULES = [ @@ -17,6 +18,7 @@ CMD_MODULES = [ vllm.entrypoints.cli.serve, vllm.entrypoints.cli.benchmark.main, vllm.entrypoints.cli.collect_env, + vllm.entrypoints.cli.run_batch, ] @@ -32,7 +34,10 @@ def register_signal_handlers(): def main(): cli_env_setup() - parser = FlexibleArgumentParser(description="vLLM CLI") + parser = FlexibleArgumentParser( + description="vLLM CLI", + epilog=VLLM_SERVE_PARSER_EPILOG, + ) parser.add_argument('-v', '--version', action='version', diff --git a/vllm/entrypoints/cli/run_batch.py b/vllm/entrypoints/cli/run_batch.py new file mode 100644 index 0000000000000..f74c8da9b9b86 --- /dev/null +++ b/vllm/entrypoints/cli/run_batch.py @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import asyncio + +from prometheus_client import start_http_server + +from vllm.entrypoints.cli.types import CLISubcommand +from vllm.entrypoints.logger import logger +from vllm.entrypoints.openai.run_batch import main as run_batch_main +from vllm.entrypoints.openai.run_batch import make_arg_parser +from vllm.utils import FlexibleArgumentParser +from vllm.version import 
__version__ as VLLM_VERSION + + +class RunBatchSubcommand(CLISubcommand): + """The `run-batch` subcommand for vLLM CLI.""" + + def __init__(self): + self.name = "run-batch" + super().__init__() + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + logger.info("vLLM batch processing API version %s", VLLM_VERSION) + logger.info("args: %s", args) + + # Start the Prometheus metrics server. + # LLMEngine uses the Prometheus client + # to publish metrics at the /metrics endpoint. + if args.enable_metrics: + logger.info("Prometheus metrics enabled") + start_http_server(port=args.port, addr=args.url) + else: + logger.info("Prometheus metrics disabled") + + asyncio.run(run_batch_main(args)) + + def subparser_init( + self, + subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: + run_batch_parser = subparsers.add_parser( + "run-batch", + help="Run batch prompts and write results to file.", + description=( + "Run batch prompts using vLLM's OpenAI-compatible API.\n" + "Supports local or HTTP input/output files."), + usage= + "vllm run-batch -i INPUT.jsonl -o OUTPUT.jsonl --model <model>", + ) + return make_arg_parser(run_batch_parser) + + +def cmd_init() -> list[CLISubcommand]: + return [RunBatchSubcommand()] diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 04be7c0339988..e65c97073218b 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -1,22 +1,35 @@ # SPDX-License-Identifier: Apache-2.0 import argparse +import os import signal +import sys import uvloop +import zmq import vllm.envs as envs from vllm import AsyncEngineArgs from vllm.entrypoints.cli.types import CLISubcommand -from vllm.entrypoints.openai.api_server import run_server +from vllm.entrypoints.openai.api_server import (run_server, run_server_worker, + setup_server) from vllm.entrypoints.openai.cli_args import (make_arg_parser, validate_parsed_serve_args) +from vllm.entrypoints.utils import (VLLM_SERVE_PARSER_EPILOG, + 
show_filtered_argument_or_group_from_help) +from vllm.executor.multiproc_worker_utils import _add_prefix from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser, get_tcp_uri +from vllm.utils import FlexibleArgumentParser, get_tcp_uri, zmq_socket_ctx +from vllm.v1.engine.coordinator import DPCoordinator from vllm.v1.engine.core import EngineCoreProc from vllm.v1.engine.core_client import CoreEngineProcManager from vllm.v1.executor.abstract import Executor +from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus +from vllm.v1.utils import (APIServerProcessManager, CoreEngine, + EngineZmqAddresses, get_engine_client_zmq_addr, + wait_for_completion_or_failure, + wait_for_engine_startup) logger = init_logger(__name__) @@ -34,9 +47,12 @@ class ServeSubcommand(CLISubcommand): if hasattr(args, 'model_tag') and args.model_tag is not None: args.model = args.model_tag - if args.headless: + if args.headless or args.api_server_count < 1: run_headless(args) + elif args.api_server_count > 1: + run_multi_api_server(args) else: + # Single API server (this process). 
uvloop.run(run_server(args)) def validate(self, args: argparse.Namespace) -> None: @@ -67,6 +83,11 @@ class ServeSubcommand(CLISubcommand): type=int, default=0, help='Starting data parallel rank for secondary nodes.') + serve_parser.add_argument('--api-server-count', + '-asc', + type=int, + default=1, + help='How many API server processes to run.') serve_parser.add_argument( "--config", type=str, @@ -77,7 +98,10 @@ class ServeSubcommand(CLISubcommand): "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#cli-reference" ) - return make_arg_parser(serve_parser) + serve_parser = make_arg_parser(serve_parser) + show_filtered_argument_or_group_from_help(serve_parser) + serve_parser.epilog = VLLM_SERVE_PARSER_EPILOG + return serve_parser def cmd_init() -> list[CLISubcommand]: @@ -86,23 +110,26 @@ def cmd_init() -> list[CLISubcommand]: def run_headless(args: argparse.Namespace): + if args.api_server_count > 1: + raise ValueError("api_server_count can't be set in headless mode") + # Create the EngineConfig. engine_args = AsyncEngineArgs.from_cli_args(args) usage_context = UsageContext.OPENAI_API_SERVER vllm_config = engine_args.create_engine_config(usage_context=usage_context) if not envs.VLLM_USE_V1: - raise RuntimeError("Headless mode is only supported for V1") + raise ValueError("Headless mode is only supported for V1") parallel_config = vllm_config.parallel_config local_engine_count = parallel_config.data_parallel_size_local host = parallel_config.data_parallel_master_ip port = engine_args.data_parallel_rpc_port # add to config too - input_address = get_tcp_uri(host, port) + handshake_address = get_tcp_uri(host, port) if local_engine_count <= 0: - raise RuntimeError("data_parallel_size_local must be > 0 in " - "headless mode") + raise ValueError("data_parallel_size_local must be > 0 in " + "headless mode") # Catch SIGTERM and SIGINT to allow graceful shutdown. 
def signal_handler(signum, frame): @@ -114,7 +141,7 @@ def run_headless(args: argparse.Namespace): logger.info( "Launching %d data parallel engine(s) in headless mode, " - "with head node address %s.", local_engine_count, input_address) + "with head node address %s.", local_engine_count, handshake_address) # Create the engines. engine_manager = CoreEngineProcManager( @@ -124,7 +151,7 @@ def run_headless(args: argparse.Namespace): local_start_index=0, vllm_config=vllm_config, on_head_node=False, - input_address=input_address, + handshake_address=handshake_address, executor_class=Executor.get_class(vllm_config), log_stats=not engine_args.disable_log_stats, ) @@ -134,3 +161,142 @@ def run_headless(args: argparse.Namespace): finally: logger.info("Shutting down.") engine_manager.close() + + +def run_multi_api_server(args: argparse.Namespace): + + assert not args.headless + num_api_servers = args.api_server_count + assert num_api_servers > 0 + + if num_api_servers > 1: + setup_multiprocess_prometheus() + + listen_address, sock = setup_server(args) + + engine_args = AsyncEngineArgs.from_cli_args(args) + usage_context = UsageContext.OPENAI_API_SERVER + vllm_config = engine_args.create_engine_config(usage_context=usage_context) + model_config = vllm_config.model_config + + if num_api_servers > 1: + if not envs.VLLM_USE_V1: + raise ValueError("api_server_count > 1 is only supported for V1") + + if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: + raise ValueError("VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used " + "with api_server_count > 1") + + if model_config.is_multimodal_model and not ( + model_config.disable_mm_preprocessor_cache): + logger.warning( + "Multi-model preprocessor cache will be disabled for" + " api_server_count > 1") + model_config.disable_mm_preprocessor_cache = True + + parallel_config = vllm_config.parallel_config + + assert parallel_config.data_parallel_rank == 0 + + dp_size = parallel_config.data_parallel_size + local_engine_count = 
parallel_config.data_parallel_size_local + host = parallel_config.data_parallel_master_ip + local_only = local_engine_count == dp_size + + # Set up input and output addresses. + input_addresses = [ + get_engine_client_zmq_addr(local_only, host) + for _ in range(num_api_servers) + ] + output_addresses = [ + get_engine_client_zmq_addr(local_only, host) + for _ in range(num_api_servers) + ] + + addresses = EngineZmqAddresses( + inputs=input_addresses, + outputs=output_addresses, + ) + + # Set up coordinator for dp > 1. + coordinator = None + stats_update_address = None + if dp_size > 1: + coordinator = DPCoordinator(parallel_config) + addresses.coordinator_input, addresses.coordinator_output = ( + coordinator.get_engine_socket_addresses()) + stats_update_address = coordinator.get_stats_publish_address() + logger.info("Started DP Coordinator process (PID: %d)", + coordinator.proc.pid) + + handshake_address = get_engine_client_zmq_addr( + local_only, host, parallel_config.data_parallel_rpc_port) + + with zmq_socket_ctx(handshake_address, zmq.ROUTER, + bind=True) as handshake_socket: + + # Start local engines. + if not local_engine_count: + local_engine_manager = None + else: + local_engine_manager = CoreEngineProcManager( + EngineCoreProc.run_engine_core, + vllm_config=vllm_config, + executor_class=Executor.get_class(vllm_config), + log_stats=not engine_args.disable_log_stats, + handshake_address=handshake_address, + on_head_node=True, + local_engine_count=local_engine_count, + start_index=0, + local_start_index=0) + + # Start API servers using the manager + api_server_manager = APIServerProcessManager( + target_server_fn=run_api_server_worker_proc, + listen_address=listen_address, + sock=sock, + args=args, + num_servers=num_api_servers, + input_addresses=input_addresses, + output_addresses=output_addresses, + stats_update_address=stats_update_address) + + # Wait for engine handshakes to complete. 
+ core_engines = [ + CoreEngine(index=i, local=(i < local_engine_count)) + for i in range(dp_size) + ] + wait_for_engine_startup( + handshake_socket, + addresses, + core_engines, + parallel_config, + vllm_config.cache_config, + local_engine_manager, + coordinator.proc if coordinator else None, + ) + + # Wait for API servers + wait_for_completion_or_failure( + api_server_manager=api_server_manager, + local_engine_manager=local_engine_manager, + coordinator=coordinator) + + +def run_api_server_worker_proc(listen_address, + sock, + args, + client_config=None, + **uvicorn_kwargs) -> None: + """Entrypoint for individual API server worker processes.""" + + # Add process-specific prefix to stdout and stderr. + from multiprocessing import current_process + process_name = current_process().name + pid = os.getpid() + _add_prefix(sys.stdout, process_name, pid) + _add_prefix(sys.stderr, process_name, pid) + + uvloop.run( + run_server_worker(listen_address, sock, args, client_config, + **uvicorn_kwargs)) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 053ee55bb6a8a..e05189ef49611 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -4,7 +4,8 @@ import itertools import warnings from collections.abc import Sequence from contextlib import contextmanager -from typing import Any, Callable, ClassVar, Optional, Union, cast, overload +from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union, + cast, overload) import cloudpickle import torch.nn as nn @@ -44,8 +45,10 @@ from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams, from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer, get_cached_tokenizer) from vllm.usage.usage_lib import UsageContext -from vllm.utils import (Counter, Device, deprecate_args, deprecate_kwargs, - is_list_of) +from vllm.utils import Counter, Device, deprecate_kwargs, is_list_of + +if TYPE_CHECKING: + from vllm.v1.metrics.reader import Metric logger = 
init_logger(__name__) @@ -116,7 +119,8 @@ class LLM: to eager mode. Additionally for encoder-decoder models, if the sequence length of the encoder input is larger than this, we fall back to the eager mode. - disable_custom_all_reduce: See {class}`~vllm.config.ParallelConfig` + disable_custom_all_reduce: See + [ParallelConfig][vllm.config.ParallelConfig]. disable_async_output_proc: Disable async output processing. This may result in lower performance. hf_token: The token to use as HTTP bearer authorization for remote files @@ -128,24 +132,16 @@ class LLM: compilation_config: Either an integer or a dictionary. If it is an integer, it is used as the level of compilation optimization. If it is a dictionary, it can specify the full compilation configuration. - **kwargs: Arguments for {class}`~vllm.EngineArgs`. (See - {ref}`engine-args`) + **kwargs: Arguments for [`EngineArgs`][vllm.EngineArgs]. - :::{note} - This class is intended to be used for offline inference. For online - serving, use the {class}`~vllm.AsyncLLMEngine` class instead. - ::: + Note: + This class is intended to be used for offline inference. For online + serving, use the [AsyncLLMEngine][vllm.AsyncLLMEngine] class instead. """ DEPRECATE_LEGACY: ClassVar[bool] = True """A flag to toggle whether to deprecate the legacy generate/encode API.""" - DEPRECATE_INIT_POSARGS: ClassVar[bool] = True - """ - A flag to toggle whether to deprecate positional arguments in - {meth}`LLM.__init__`. 
- """ - @classmethod @contextmanager def deprecate_legacy_api(cls): @@ -155,16 +151,11 @@ class LLM: cls.DEPRECATE_LEGACY = False - @deprecate_args( - start_index=2, # Ignore self and model - is_deprecated=lambda: LLM.DEPRECATE_INIT_POSARGS, - additional_message=( - "All positional arguments other than `model` will be " - "replaced with keyword arguments in an upcoming version."), - ) def __init__( self, model: str, + *, + task: TaskOption = "auto", tokenizer: Optional[str] = None, tokenizer_mode: TokenizerMode = "auto", skip_tokenizer_init: bool = False, @@ -186,8 +177,6 @@ class LLM: hf_token: Optional[Union[bool, str]] = None, hf_overrides: Optional[HfOverrides] = None, mm_processor_kwargs: Optional[dict[str, Any]] = None, - # After positional args are removed, move this right below `model` - task: TaskOption = "auto", override_pooler_config: Optional[PoolerConfig] = None, compilation_config: Optional[Union[int, dict[str, Any]]] = None, **kwargs, @@ -204,6 +193,9 @@ class LLM: if isinstance(worker_cls, type): kwargs["worker_cls"] = cloudpickle.dumps(worker_cls) + if hf_overrides is None: + hf_overrides = {} + if compilation_config is not None: if isinstance(compilation_config, int): compilation_config_instance = CompilationConfig( @@ -215,7 +207,7 @@ class LLM: else: compilation_config_instance = compilation_config else: - compilation_config_instance = None + compilation_config_instance = CompilationConfig() engine_args = EngineArgs( model=model, @@ -404,7 +396,7 @@ class LLM: Args: prompts: The prompts to the LLM. You may pass a sequence of prompts - for batch inference. See {class}`~vllm.inputs.PromptType` + for batch inference. See [PromptType][vllm.inputs.PromptType] for more details about the format of each prompts. sampling_params: The sampling parameters for text generation. If None, we use the default sampling parameters. 
@@ -422,11 +414,10 @@ class LLM: A list of `RequestOutput` objects containing the generated completions in the same order as the input prompts. - :::{note} - Using `prompts` and `prompt_token_ids` as keyword parameters is - considered legacy and may be deprecated in the future. You should - instead pass them via the `inputs` parameter. - ::: + Note: + Using `prompts` and `prompt_token_ids` as keyword parameters is + considered legacy and may be deprecated in the future. You should + instead pass them via the `inputs` parameter. """ runner_type = self.llm_engine.model_config.runner_type if runner_type not in ["generate", "transcription"]: @@ -495,17 +486,16 @@ class LLM: `self` argument, in addition to the arguments passed in `args` and `kwargs`. The `self` argument will be the worker object. timeout: Maximum time in seconds to wait for execution. Raises a - {exc}`TimeoutError` on timeout. `None` means wait indefinitely. + [`TimeoutError`][] on timeout. `None` means wait indefinitely. args: Positional arguments to pass to the worker method. kwargs: Keyword arguments to pass to the worker method. Returns: A list containing the results from each worker. - :::{note} - It is recommended to use this API to only pass control messages, - and set up data-plane communication to pass data. - ::: + Note: + It is recommended to use this API to only pass control messages, + and set up data-plane communication to pass data. 
""" return self.llm_engine.collective_rpc(method, timeout, args, kwargs) @@ -518,10 +508,28 @@ class LLM: executor = self.llm_engine.model_executor return executor.apply_model(func) + def _get_beam_search_lora_requests( + self, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]], + prompts: list[Union[TokensPrompt, TextPrompt]], + ) -> list[Optional[LoRARequest]]: + """Get the optional lora request corresponding to each prompt.""" + if isinstance(lora_request, + Sequence) and len(lora_request) != len(prompts): + raise ValueError( + "Lora request list should be the same length as the prompts") + return lora_request + + if lora_request is None or isinstance(lora_request, LoRARequest): + return [lora_request] * len(prompts) + + raise TypeError(f"Invalid lora_request type {type(lora_request)}") + def beam_search( self, prompts: list[Union[TokensPrompt, TextPrompt]], params: BeamSearchParams, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, ) -> list[BeamSearchOutput]: """ Generate sequences using beam search. @@ -530,6 +538,7 @@ class LLM: prompts: A list of prompts. Each prompt can be a string or a list of token IDs. params: The beam search parameters. + lora_request: LoRA request to use for generation, if any. """ # TODO: how does beam search work together with length penalty, # frequency, penalty, and stopping criteria, etc.? 
@@ -539,6 +548,9 @@ class LLM: ignore_eos = params.ignore_eos length_penalty = params.length_penalty + lora_requests = self._get_beam_search_lora_requests( + lora_request, prompts) + def sort_beams_key(x: BeamSearchSequence) -> float: return get_beam_search_score(x.tokens, x.cum_logprob, tokenizer.eos_token_id, @@ -566,7 +578,7 @@ class LLM: temperature=temperature) instances: list[BeamSearchInstance] = [] - for prompt in prompts: + for lora_req, prompt in zip(lora_requests, prompts): # Add multimodal processor kwargs & data mm_kwargs = {} if "multi_modal_data" in prompt: @@ -582,7 +594,12 @@ class LLM: prompt_tokens = tokenizer.encode(prompt["prompt"]) instances.append( - BeamSearchInstance(prompt_tokens, logprobs=None, **mm_kwargs)) + BeamSearchInstance( + prompt_tokens, + lora_request=lora_req, + logprobs=None, + **mm_kwargs, + ), ) for _ in range(max_tokens): all_beams: list[BeamSearchSequence] = list( @@ -596,15 +613,17 @@ class LLM: if len(all_beams) == 0: break - prompts_batch = [ - create_tokens_prompt_from_beam(beam) for beam in all_beams - ] + # create the corresponding batch entries for prompt & optional lora + prompts_batch, lora_req_batch = zip( + *[(create_tokens_prompt_from_beam(beam), beam.lora_request) + for beam in all_beams]) # only runs for one step # we don't need to use tqdm here output = self.generate(prompts_batch, sampling_params=beam_search_params, - use_tqdm=False) + use_tqdm=False, + lora_request=lora_req_batch) for (start, end), instance in zip(instance_start_and_end, instances): @@ -622,6 +641,7 @@ class LLM: new_beam = BeamSearchSequence( tokens=current_beam.tokens + [token_id], logprobs=current_beam.logprobs + [logprobs], + lora_request=current_beam.lora_request, cum_logprob=current_beam.cum_logprob + logprob_obj.logprob, multi_modal_data=current_beam.multi_modal_data, @@ -672,7 +692,7 @@ class LLM: Generate responses for a chat conversation. 
The chat conversation is converted into a text prompt using the - tokenizer and calls the {meth}`generate` method to generate the + tokenizer and calls the [generate][] method to generate the responses. Multi-modal inputs can be passed in the same way you would pass them @@ -681,8 +701,8 @@ class LLM: Args: messages: A list of conversations or a single conversation. - - Each conversation is represented as a list of messages. - - Each message is a dictionary with 'role' and 'content' keys. + - Each conversation is represented as a list of messages. + - Each message is a dictionary with 'role' and 'content' keys. sampling_params: The sampling parameters for text generation. If None, we use the default sampling parameters. When it @@ -692,27 +712,27 @@ class LLM: use_tqdm: Whether to use tqdm to display the progress bar. lora_request: LoRA request to use for generation, if any. chat_template: The template to use for structuring the chat. - If not provided, the model's default chat template will be used. + If not provided, the model's default chat template will be used. chat_template_content_format: The format to render message content. - - "string" will render the content as a string. - Example: ``"Who are you?"`` - - "openai" will render the content as a list of dictionaries, - similar to OpenAI schema. - Example: ``[{"type": "text", "text": "Who are you?"}]`` + - "string" will render the content as a string. + Example: `"Who are you?"` + - "openai" will render the content as a list of dictionaries, + similar to OpenAI schema. + Example: `[{"type": "text", "text": "Who are you?"}]` add_generation_prompt: If True, adds a generation template to each message. continue_final_message: If True, continues the final message in the conversation instead of starting a new one. Cannot be - ``True`` if ``add_generation_prompt`` is also ``True``. + `True` if `add_generation_prompt` is also `True`. chat_template_kwargs: Additional kwargs to pass to the chat template. 
mm_processor_kwargs: Multimodal processor kwarg overrides for this chat request. Only used for offline requests. Returns: - A list of ``RequestOutput`` objects containing the generated + A list of `RequestOutput` objects containing the generated responses in the same order as the input messages. """ list_of_messages: list[list[ChatCompletionMessageParam]] @@ -911,7 +931,7 @@ class LLM: Args: prompts: The prompts to the LLM. You may pass a sequence of prompts - for batch inference. See {class}`~vllm.inputs.PromptType` + for batch inference. See [PromptType][vllm.inputs.PromptType] for more details about the format of each prompts. pooling_params: The pooling parameters for pooling. If None, we use the default pooling parameters. @@ -924,11 +944,10 @@ class LLM: A list of `PoolingRequestOutput` objects containing the pooled hidden states in the same order as the input prompts. - :::{note} - Using `prompts` and `prompt_token_ids` as keyword parameters is - considered legacy and may be deprecated in the future. You should - instead pass them via the `inputs` parameter. - ::: + Note: + Using `prompts` and `prompt_token_ids` as keyword parameters is + considered legacy and may be deprecated in the future. You should + instead pass them via the `inputs` parameter. """ runner_type = self.llm_engine.model_config.runner_type if runner_type != "pooling": @@ -1001,7 +1020,7 @@ class LLM: Args: prompts: The prompts to the LLM. You may pass a sequence of prompts - for batch inference. See {class}`~vllm.inputs.PromptType` + for batch inference. See [PromptType][vllm.inputs.PromptType] for more details about the format of each prompts. pooling_params: The pooling parameters for pooling. If None, we use the default pooling parameters. @@ -1011,7 +1030,7 @@ class LLM: generation, if any. Returns: - A list of ``EmbeddingRequestOutput`` objects containing the + A list of `EmbeddingRequestOutput` objects containing the embedding vectors in the same order as the input prompts. 
""" if self.llm_engine.model_config.task != "embed": @@ -1045,7 +1064,7 @@ class LLM: Args: prompts: The prompts to the LLM. You may pass a sequence of prompts - for batch inference. See {class}`~vllm.inputs.PromptType` + for batch inference. See [PromptType][vllm.inputs.PromptType] for more details about the format of each prompts. use_tqdm: Whether to use tqdm to display the progress bar. lora_request: LoRA request to use for generation, if any. @@ -1053,7 +1072,7 @@ class LLM: generation, if any. Returns: - A list of ``ClassificationRequestOutput`` objects containing the + A list of `ClassificationRequestOutput` objects containing the embedding vectors in the same order as the input prompts. """ if self.llm_engine.model_config.task != "classify": @@ -1163,11 +1182,11 @@ class LLM: lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[ScoringRequestOutput]: - """Generate similarity scores for all pairs ``<text,text_pair>``. + """Generate similarity scores for all pairs `<text,text_pair>`. - The inputs can be ``1 -> 1``, ``1 -> N`` or ``N -> N``. - In the ``1 - N`` case the ``text_1`` sentence will be replicated ``N`` - times to pair with the ``text_2`` sentences. + The inputs can be `1 -> 1`, `1 -> N` or `N -> N`. + In the `1 - N` case the `text_1` sentence will be replicated `N` + times to pair with the `text_2` sentences. The input pairs are used to build a list of prompts for the cross encoder model. This class automatically batches the prompts, considering the memory constraint. For the best performance, put all @@ -1175,9 +1194,9 @@ class LLM: Args: text_1: can be a single prompt or a list of prompts, in which - case it has to have the same length as the ``text_2`` list + case it has to have the same length as the `text_2` list text_2: The texts to pair with the query to form the input - to the LLM. See {class}`~vllm.inputs.PromptType` for + to the LLM. 
See [PromptType][vllm.inputs.PromptType] for more details about the format of each prompts. use_tqdm: Whether to use tqdm to display the progress bar. lora_request: LoRA request to use for generation, if any. @@ -1185,7 +1204,7 @@ class LLM: generation, if any. Returns: - A list of ``ScoringRequestOutput`` objects containing the + A list of `ScoringRequestOutput` objects containing the generated scores in the same order as the input prompts. """ runner_type = self.llm_engine.model_config.runner_type @@ -1286,18 +1305,32 @@ class LLM: def wake_up(self, tags: Optional[list[str]] = None): """ - Wake up the engine from sleep mode. See the {meth}`sleep` method + Wake up the engine from sleep mode. See the [sleep][] method for more details. Args: tags: An optional list of tags to reallocate the engine memory for specific memory allocations. Values must be in - ("weights", "kv_cache",). If None, all memory is reallocated. + `("weights", "kv_cache")`. If None, all memory is reallocated. wake_up should be called with all tags (or None) before the engine is used again. """ self.llm_engine.wake_up(tags) + def get_metrics(self) -> list["Metric"]: + """Return a snapshot of aggregated metrics from Prometheus. + + Returns: + A ``MetricSnapshot`` instance capturing the current state + of all aggregated metrics from Prometheus. + + Note: + This method is only available with the V1 LLM engine. 
+ """ + from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine + assert isinstance(self.llm_engine, V1LLMEngine) + return self.llm_engine.get_metrics() + # LEGACY def _convert_v1_inputs( self, @@ -1306,27 +1339,25 @@ class LLM: ): # skip_tokenizer_init is now checked in engine + if prompts is None and prompt_token_ids is None: + raise ValueError( + "Either prompts or prompt_token_ids must be provided.") + if prompts is not None and prompt_token_ids is not None \ + and len(prompts) != len(prompt_token_ids): + raise ValueError( + "The lengths of prompts and prompt_token_ids must be the same." + ) + if prompts is not None: prompts = [p["content"] for p in parse_and_batch_prompt(prompts)] if prompt_token_ids is not None: prompt_token_ids = [ p["content"] for p in parse_and_batch_prompt(prompt_token_ids) ] - - num_requests = None if prompts is not None: num_requests = len(prompts) - if prompt_token_ids is not None: - if (num_requests is not None - and num_requests != len(prompt_token_ids)): - raise ValueError("The lengths of prompts and prompt_token_ids " - "must be the same.") - + elif prompt_token_ids is not None: num_requests = len(prompt_token_ids) - if num_requests is None: - raise ValueError("Either prompts or prompt_token_ids must be " - "provided.") - parsed_prompts: list[PromptType] = [] for i in range(num_requests): item: PromptType diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 0ab6fcdca1a41..5a4295ff716db 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -5,9 +5,9 @@ import atexit import gc import importlib import inspect +import json import multiprocessing import os -import re import signal import socket import tempfile @@ -17,15 +17,17 @@ from collections.abc import AsyncIterator from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus -from json import JSONDecodeError -from typing import Annotated, Optional, 
Union +from typing import Annotated, Any, Optional import prometheus_client +import regex as re import uvloop from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, Response, StreamingResponse +from prometheus_client import make_asgi_app +from prometheus_fastapi_instrumentator import Instrumentator from starlette.concurrency import iterate_in_threadpool from starlette.datastructures import State from starlette.routing import Mount @@ -59,9 +61,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, EmbeddingChatRequest, EmbeddingCompletionRequest, EmbeddingRequest, - EmbeddingResponse, - EmbeddingResponseData, - ErrorResponse, + EmbeddingResponse, ErrorResponse, LoadLoRAAdapterRequest, PoolingChatRequest, PoolingCompletionRequest, @@ -99,6 +99,7 @@ from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.usage.usage_lib import UsageContext from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path, is_valid_ipv6_address, set_ulimit) +from vllm.v1.metrics.prometheus import get_prometheus_registry from vllm.version import __version__ as VLLM_VERSION TIMEOUT_KEEP_ALIVE = 5 # seconds @@ -144,14 +145,17 @@ async def lifespan(app: FastAPI): @asynccontextmanager async def build_async_engine_client( - args: Namespace) -> AsyncIterator[EngineClient]: + args: Namespace, + client_config: Optional[dict[str, Any]] = None, +) -> AsyncIterator[EngineClient]: # Context manager to handle engine_client lifecycle # Ensures everything is shutdown and cleaned up on error/exit engine_args = AsyncEngineArgs.from_cli_args(args) async with build_async_engine_client_from_engine_args( - engine_args, args.disable_frontend_multiprocessing) as engine: + engine_args, args.disable_frontend_multiprocessing, + client_config) as engine: yield engine @@ -159,6 +163,7 @@ async def 
build_async_engine_client( async def build_async_engine_client_from_engine_args( engine_args: AsyncEngineArgs, disable_frontend_multiprocessing: bool = False, + client_config: Optional[dict[str, Any]] = None, ) -> AsyncIterator[EngineClient]: """ Create EngineClient, either: @@ -181,12 +186,16 @@ async def build_async_engine_client_from_engine_args( from vllm.v1.engine.async_llm import AsyncLLM async_llm: Optional[AsyncLLM] = None + client_index = client_config.pop( + "client_index") if client_config else 0 try: async_llm = AsyncLLM.from_vllm_config( vllm_config=vllm_config, usage_context=usage_context, disable_log_requests=engine_args.disable_log_requests, - disable_log_stats=engine_args.disable_log_stats) + disable_log_stats=engine_args.disable_log_stats, + client_addresses=client_config, + client_index=client_index) # Don't keep the dummy data in memory await async_llm.reset_mm_cache() @@ -320,22 +329,9 @@ class PrometheusResponse(Response): def mount_metrics(app: FastAPI): - # Lazy import for prometheus multiprocessing. - # We need to set PROMETHEUS_MULTIPROC_DIR environment variable - # before prometheus_client is imported. 
- # See https://prometheus.github.io/client_python/multiprocess/ - from prometheus_client import (REGISTRY, CollectorRegistry, make_asgi_app, - multiprocess) - from prometheus_fastapi_instrumentator import Instrumentator + """Mount prometheus metrics to a FastAPI app.""" - registry = REGISTRY - - prometheus_multiproc_dir_path = os.getenv("PROMETHEUS_MULTIPROC_DIR", None) - if prometheus_multiproc_dir_path is not None: - logger.debug("vLLM to use %s as PROMETHEUS_MULTIPROC_DIR", - prometheus_multiproc_dir_path) - registry = CollectorRegistry() - multiprocess.MultiProcessCollector(registry) + registry = get_prometheus_registry() # `response_class=PrometheusResponse` is needed to return an HTTP response # with header "Content-Type: text/plain; version=0.0.4; charset=utf-8" @@ -627,37 +623,10 @@ async def create_completion(request: CompletionRequest, raw_request: Request): async def create_embedding(request: EmbeddingRequest, raw_request: Request): handler = embedding(raw_request) if handler is None: - fallback_handler = pooling(raw_request) - if fallback_handler is None: - return base(raw_request).create_error_response( - message="The model does not support Embeddings API") + return base(raw_request).create_error_response( + message="The model does not support Embeddings API") - logger.warning( - "Embeddings API will become exclusive to embedding models " - "in a future release. 
To return the hidden states directly, " - "use the Pooling API (`/pooling`) instead.") - - res = await fallback_handler.create_pooling(request, raw_request) - - generator: Union[ErrorResponse, EmbeddingResponse] - if isinstance(res, PoolingResponse): - generator = EmbeddingResponse( - id=res.id, - object=res.object, - created=res.created, - model=res.model, - data=[ - EmbeddingResponseData( - index=d.index, - embedding=d.data, # type: ignore - ) for d in res.data - ], - usage=res.usage, - ) - else: - generator = res - else: - generator = await handler.create_embedding(request, raw_request) + generator = await handler.create_embedding(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), @@ -961,7 +930,7 @@ async def invocations(raw_request: Request): """ try: body = await raw_request.json() - except JSONDecodeError as e: + except json.JSONDecodeError as e: raise HTTPException(status_code=HTTPStatus.BAD_REQUEST.value, detail=f"JSON decode error: {e}") from e @@ -1034,6 +1003,18 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: return Response(status_code=200, content=response) +def load_log_config(log_config_file: Optional[str]) -> Optional[dict]: + if not log_config_file: + return None + try: + with open(log_config_file) as f: + return json.load(f) + except Exception as e: + logger.warning("Failed to load log config from file %s: error %s", + log_config_file, e) + return None + + def build_app(args: Namespace) -> FastAPI: if args.disable_fastapi_docs: app = FastAPI(openapi_url=None, @@ -1285,16 +1266,10 @@ def create_server_socket(addr: tuple[str, int]) -> socket.socket: return sock -async def run_server(args, **uvicorn_kwargs) -> None: - logger.info("vLLM API server version %s", VLLM_VERSION) - log_non_default_args(args) - - if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3: - ToolParserManager.import_tool_parser(args.tool_parser_plugin) - +def validate_api_server_args(args): valid_tool_parses = 
ToolParserManager.tool_parsers.keys() if args.enable_auto_tool_choice \ - and args.tool_call_parser not in valid_tool_parses: + and args.tool_call_parser not in valid_tool_parses: raise KeyError(f"invalid tool call parser: {args.tool_call_parser} " f"(chose from {{ {','.join(valid_tool_parses)} }})") @@ -1305,6 +1280,19 @@ async def run_server(args, **uvicorn_kwargs) -> None: f"invalid reasoning parser: {args.reasoning_parser} " f"(chose from {{ {','.join(valid_reasoning_parses)} }})") + +def setup_server(args): + """Validate API server args, set up signal handler, create socket + ready to serve.""" + + logger.info("vLLM API server version %s", VLLM_VERSION) + log_non_default_args(args) + + if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3: + ToolParserManager.import_tool_parser(args.tool_parser_plugin) + + validate_api_server_args(args) + # workaround to make sure that we bind the port before the engine is set up. # This avoids race conditions with ray. # see https://github.com/vllm-project/vllm/issues/8204 @@ -1321,22 +1309,46 @@ async def run_server(args, **uvicorn_kwargs) -> None: signal.signal(signal.SIGTERM, signal_handler) - async with build_async_engine_client(args) as engine_client: + addr, port = sock_addr + is_ssl = args.ssl_keyfile and args.ssl_certfile + host_part = f"[{addr}]" if is_valid_ipv6_address( + addr) else addr or "0.0.0.0" + listen_address = f"http{'s' if is_ssl else ''}://{host_part}:{port}" + + return listen_address, sock + + +async def run_server(args, **uvicorn_kwargs) -> None: + """Run a single-worker API server.""" + listen_address, sock = setup_server(args) + await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) + + +async def run_server_worker(listen_address, + sock, + args, + client_config=None, + **uvicorn_kwargs) -> None: + """Run a single API server worker.""" + + if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3: + ToolParserManager.import_tool_parser(args.tool_parser_plugin) + + 
server_index = client_config.get("client_index", 0) if client_config else 0 + + # Load logging config for uvicorn if specified + log_config = load_log_config(args.log_config_file) + if log_config is not None: + uvicorn_kwargs['log_config'] = log_config + + async with build_async_engine_client(args, client_config) as engine_client: app = build_app(args) vllm_config = await engine_client.get_vllm_config() await init_app_state(engine_client, vllm_config, app.state, args) - def _listen_addr(a: str) -> str: - if is_valid_ipv6_address(a): - return '[' + a + ']' - return a or "0.0.0.0" - - is_ssl = args.ssl_keyfile and args.ssl_certfile - logger.info("Starting vLLM API server on http%s://%s:%d", - "s" if is_ssl else "", _listen_addr(sock_addr[0]), - sock_addr[1]) - + logger.info("Starting vLLM API server %d on %s", server_index, + listen_address) shutdown_task = await serve_http( app, sock=sock, diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index d01af5e422666..f196ff6ed3021 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -11,6 +11,7 @@ import ssl from collections.abc import Sequence from typing import Optional, Union, get_args +import vllm.envs as envs from vllm.engine.arg_utils import AsyncEngineArgs, optional_type from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, validate_chat_template) @@ -243,6 +244,13 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: " into OpenAI API format, the name register in this plugin can be used " "in ``--tool-call-parser``.") + parser.add_argument( + "--log-config-file", + type=str, + default=envs.VLLM_LOGGING_CONFIG_PATH, + help="Path to logging config JSON file for both vllm and uvicorn", + ) + parser = AsyncEngineArgs.add_cli_args(parser) parser.add_argument('--max-log-len', diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 5ab2356a0898a..e72c23993ac8c 
100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -3,11 +3,11 @@ # Adapted from # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py import json -import re import time from http import HTTPStatus from typing import Annotated, Any, ClassVar, Literal, Optional, Union +import regex as re import torch from fastapi import HTTPException, UploadFile from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter, @@ -175,11 +175,15 @@ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel): type: Literal["function"] = "function" +# extra="forbid" is a workaround to have kwargs as a field, +# see https://github.com/pydantic/pydantic/issues/3125 class LogitsProcessorConstructor(BaseModel): qualname: str args: Optional[list[Any]] = None kwargs: Optional[dict[str, Any]] = None + model_config = ConfigDict(extra="forbid") + LogitsProcessors = list[Union[str, LogitsProcessorConstructor]] @@ -234,7 +238,7 @@ class ChatCompletionRequest(OpenAIBaseModel): presence_penalty: Optional[float] = 0.0 response_format: Optional[AnyResponseFormat] = None seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) - stop: Optional[Union[str, list[str]]] = Field(default_factory=list) + stop: Optional[Union[str, list[str]]] = [] stream: Optional[bool] = False stream_options: Optional[StreamOptions] = None temperature: Optional[float] = None @@ -251,14 +255,14 @@ class ChatCompletionRequest(OpenAIBaseModel): parallel_tool_calls: Optional[bool] = False user: Optional[str] = None - # doc: begin-chat-completion-sampling-params + # --8<-- [start:chat-completion-sampling-params] best_of: Optional[int] = None use_beam_search: bool = False top_k: Optional[int] = None min_p: Optional[float] = None repetition_penalty: Optional[float] = None length_penalty: float = 1.0 - stop_token_ids: Optional[list[int]] = Field(default_factory=list) + stop_token_ids: Optional[list[int]] = [] 
include_stop_str_in_output: bool = False ignore_eos: bool = False min_tokens: int = 0 @@ -266,9 +270,9 @@ class ChatCompletionRequest(OpenAIBaseModel): spaces_between_special_tokens: bool = True truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None prompt_logprobs: Optional[int] = None - # doc: end-chat-completion-sampling-params + # --8<-- [end:chat-completion-sampling-params] - # doc: begin-chat-completion-extra-params + # --8<-- [start:chat-completion-extra-params] echo: bool = Field( default=False, description=( @@ -407,7 +411,7 @@ class ChatCompletionRequest(OpenAIBaseModel): default=None, description="KVTransfer parameters used for disaggregated serving.") - # doc: end-chat-completion-extra-params + # --8<-- [end:chat-completion-extra-params] # Default sampling parameters for chat completion requests _DEFAULT_SAMPLING_PARAMS: dict = { @@ -756,7 +760,7 @@ class CompletionRequest(OpenAIBaseModel): n: int = 1 presence_penalty: Optional[float] = 0.0 seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) - stop: Optional[Union[str, list[str]]] = Field(default_factory=list) + stop: Optional[Union[str, list[str]]] = [] stream: Optional[bool] = False stream_options: Optional[StreamOptions] = None suffix: Optional[str] = None @@ -764,13 +768,13 @@ class CompletionRequest(OpenAIBaseModel): top_p: Optional[float] = None user: Optional[str] = None - # doc: begin-completion-sampling-params + # --8<-- [start:completion-sampling-params] use_beam_search: bool = False top_k: Optional[int] = None min_p: Optional[float] = None repetition_penalty: Optional[float] = None length_penalty: float = 1.0 - stop_token_ids: Optional[list[int]] = Field(default_factory=list) + stop_token_ids: Optional[list[int]] = [] include_stop_str_in_output: bool = False ignore_eos: bool = False min_tokens: int = 0 @@ -779,9 +783,9 @@ class CompletionRequest(OpenAIBaseModel): truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None allowed_token_ids: 
Optional[list[int]] = None prompt_logprobs: Optional[int] = None - # doc: end-completion-sampling-params + # --8<-- [end:completion-sampling-params] - # doc: begin-completion-extra-params + # --8<-- [start:completion-extra-params] add_special_tokens: bool = Field( default=True, description=( @@ -858,7 +862,7 @@ class CompletionRequest(OpenAIBaseModel): default=None, description="KVTransfer parameters used for disaggregated serving.") - # doc: end-completion-extra-params + # --8<-- [end:completion-extra-params] # Default sampling parameters for completion requests _DEFAULT_SAMPLING_PARAMS: dict = { @@ -1045,11 +1049,11 @@ class EmbeddingCompletionRequest(OpenAIBaseModel): user: Optional[str] = None truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None - # doc: begin-embedding-pooling-params + # --8<-- [start:embedding-pooling-params] additional_data: Optional[Any] = None - # doc: end-embedding-pooling-params + # --8<-- [end:embedding-pooling-params] - # doc: begin-embedding-extra-params + # --8<-- [start:embedding-extra-params] add_special_tokens: bool = Field( default=True, description=( @@ -1064,7 +1068,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel): "if the served model does not use priority scheduling."), ) - # doc: end-embedding-extra-params + # --8<-- [end:embedding-extra-params] def to_pooling_params(self): return PoolingParams(dimensions=self.dimensions, @@ -1080,11 +1084,11 @@ class EmbeddingChatRequest(OpenAIBaseModel): user: Optional[str] = None truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None - # doc: begin-chat-embedding-pooling-params + # --8<-- [start:chat-embedding-pooling-params] additional_data: Optional[Any] = None - # doc: end-chat-embedding-pooling-params + # --8<-- [end:chat-embedding-pooling-params] - # doc: begin-chat-embedding-extra-params + # --8<-- [start:chat-embedding-extra-params] add_special_tokens: bool = Field( default=False, description=( @@ -1118,7 +1122,7 @@ class 
EmbeddingChatRequest(OpenAIBaseModel): "default: 0). Any priority other than 0 will raise an error " "if the served model does not use priority scheduling."), ) - # doc: end-chat-embedding-extra-params + # --8<-- [end:chat-embedding-extra-params] @model_validator(mode="before") @classmethod @@ -1147,11 +1151,11 @@ class ScoreRequest(OpenAIBaseModel): text_2: Union[list[str], str] truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None - # doc: begin-score-pooling-params + # --8<-- [start:score-pooling-params] additional_data: Optional[Any] = None - # doc: end-score-pooling-params + # --8<-- [end:score-pooling-params] - # doc: begin-score-extra-params + # --8<-- [start:score-extra-params] priority: int = Field( default=0, description=( @@ -1160,7 +1164,7 @@ class ScoreRequest(OpenAIBaseModel): "if the served model does not use priority scheduling."), ) - # doc: end-score-extra-params + # --8<-- [end:score-extra-params] def to_pooling_params(self): return PoolingParams(additional_data=self.additional_data) @@ -1173,11 +1177,11 @@ class RerankRequest(OpenAIBaseModel): top_n: int = Field(default_factory=lambda: 0) truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None - # doc: begin-rerank-pooling-params + # --8<-- [start:rerank-pooling-params] additional_data: Optional[Any] = None - # doc: end-rerank-pooling-params + # --8<-- [end:rerank-pooling-params] - # doc: begin-rerank-extra-params + # --8<-- [start:rerank-extra-params] priority: int = Field( default=0, description=( @@ -1186,7 +1190,7 @@ class RerankRequest(OpenAIBaseModel): "if the served model does not use priority scheduling."), ) - # doc: end-rerank-extra-params + # --8<-- [end:rerank-extra-params] def to_pooling_params(self): return PoolingParams(additional_data=self.additional_data) @@ -1321,11 +1325,11 @@ class ClassificationRequest(OpenAIBaseModel): truncate_prompt_tokens: Optional[int] = None user: Optional[str] = None - # doc: begin-classification-pooling-params + # --8<-- 
[start:classification-pooling-params] additional_data: Optional[Any] = None - # doc: end-classification-pooling-params + # --8<-- [end:classification-pooling-params] - # doc: begin-classification-extra-params + # --8<-- [start:classification-extra-params] priority: int = Field( default=0, description=( @@ -1334,7 +1338,7 @@ class ClassificationRequest(OpenAIBaseModel): "if the served model does not use priority scheduling."), ) - # doc: end-classification-extra-params + # --8<-- [end:classification-extra-params] def to_pooling_params(self): return PoolingParams(additional_data=self.additional_data) @@ -1477,6 +1481,10 @@ class TranscriptionStreamResponse(OpenAIBaseModel): usage: Optional[UsageInfo] = Field(default=None) +BatchRequestInputBody = Union[ChatCompletionRequest, EmbeddingRequest, + ScoreRequest, RerankRequest] + + class BatchRequestInput(OpenAIBaseModel): """ The per-line object of the batch input file. @@ -1497,21 +1505,22 @@ class BatchRequestInput(OpenAIBaseModel): url: str # The parameters of the request. - body: Union[ChatCompletionRequest, EmbeddingRequest, ScoreRequest] + body: BatchRequestInputBody @field_validator('body', mode='plain') @classmethod def check_type_for_url(cls, value: Any, info: ValidationInfo): # Use url to disambiguate models - url = info.data['url'] + url: str = info.data["url"] if url == "/v1/chat/completions": return ChatCompletionRequest.model_validate(value) if url == "/v1/embeddings": return TypeAdapter(EmbeddingRequest).validate_python(value) - if url == "/v1/score": + if url.endswith("/score"): return ScoreRequest.model_validate(value) - return TypeAdapter(Union[ChatCompletionRequest, EmbeddingRequest, - ScoreRequest]).validate_python(value) + if url.endswith("/rerank"): + return RerankRequest.model_validate(value) + return TypeAdapter(BatchRequestInputBody).validate_python(value) class BatchResponseData(OpenAIBaseModel): @@ -1523,7 +1532,7 @@ class BatchResponseData(OpenAIBaseModel): # The body of the response. 
body: Optional[Union[ChatCompletionResponse, EmbeddingResponse, - ScoreResponse]] = None + ScoreResponse, RerankResponse]] = None class BatchRequestOutput(OpenAIBaseModel): @@ -1554,6 +1563,11 @@ class TokenizeCompletionRequest(OpenAIBaseModel): "If true (the default), special tokens (e.g. BOS) will be added to " "the prompt."), ) + return_token_strs: Optional[bool] = Field( + default=False, + description=("If true, also return the token strings " + "corresponding to the token ids."), + ) class TokenizeChatRequest(OpenAIBaseModel): @@ -1567,6 +1581,11 @@ class TokenizeChatRequest(OpenAIBaseModel): "This is a parameter used by chat template in tokenizer config of the " "model."), ) + return_token_strs: Optional[bool] = Field( + default=False, + description=("If true, also return the token strings " + "corresponding to the token ids."), + ) continue_final_message: bool = Field( default=False, description= @@ -1624,6 +1643,7 @@ class TokenizeResponse(OpenAIBaseModel): count: int max_model_len: int tokens: list[int] + token_strs: Optional[list[str]] = None class DetokenizeRequest(OpenAIBaseModel): @@ -1698,7 +1718,7 @@ class TranscriptionRequest(OpenAIBaseModel): timestamps incurs additional latency. """ - # doc: begin-transcription-extra-params + # --8<-- [start:transcription-extra-params] stream: Optional[bool] = False """Custom field not present in the original OpenAI definition. When set, it will enable output to be streamed in a similar fashion as the Chat @@ -1707,9 +1727,9 @@ class TranscriptionRequest(OpenAIBaseModel): # Flattened stream option to simplify form data. stream_include_usage: Optional[bool] = False stream_continuous_usage_stats: Optional[bool] = False - # doc: end-transcription-extra-params + # --8<-- [end:transcription-extra-params] - # doc: begin-transcription-sampling-params + # --8<-- [start:transcription-sampling-params] temperature: float = Field(default=0.0) """The sampling temperature, between 0 and 1. 
@@ -1743,7 +1763,7 @@ class TranscriptionRequest(OpenAIBaseModel): presence_penalty: Optional[float] = 0.0 """The presence penalty to use for sampling.""" - # doc: end-transcription-sampling-params + # --8<-- [end:transcription-sampling-params] # Default sampling parameters for transcription requests. _DEFAULT_SAMPLING_PARAMS: dict = { diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index fccf459f17dc6..ac250b3cb4fbf 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -21,7 +21,7 @@ from vllm.entrypoints.openai.protocol import (BatchRequestInput, BatchResponseData, ChatCompletionResponse, EmbeddingResponse, ErrorResponse, - ScoreResponse) + RerankResponse, ScoreResponse) # yapf: enable from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding @@ -33,9 +33,7 @@ from vllm.utils import FlexibleArgumentParser, random_uuid from vllm.version import __version__ as VLLM_VERSION -def parse_args(): - parser = FlexibleArgumentParser( - description="vLLM OpenAI-Compatible batch runner.") +def make_arg_parser(parser: FlexibleArgumentParser): parser.add_argument( "-i", "--input-file", @@ -98,7 +96,13 @@ def parse_args(): default=False, help="If set to True, enable prompt_tokens_details in usage.") - return parser.parse_args() + return parser + + +def parse_args(): + parser = FlexibleArgumentParser( + description="vLLM OpenAI-Compatible batch runner.") + return make_arg_parser(parser).parse_args() # explicitly use pure text format, with a newline at the end @@ -270,8 +274,11 @@ async def run_request(serving_engine_func: Callable, tracker: BatchProgressTracker) -> BatchRequestOutput: response = await serving_engine_func(request.body) - if isinstance(response, - (ChatCompletionResponse, EmbeddingResponse, ScoreResponse)): + if isinstance( + response, + (ChatCompletionResponse, EmbeddingResponse, ScoreResponse, + 
RerankResponse), + ): batch_output = BatchRequestOutput( id=f"vllm-{random_uuid()}", custom_id=request.custom_id, @@ -365,8 +372,8 @@ async def main(args): # Determine the type of request and run it. if request.url == "/v1/chat/completions": - chat_handler_fn = (None if openai_serving_chat is None else - openai_serving_chat.create_chat_completion) + chat_handler_fn = openai_serving_chat.create_chat_completion if \ + openai_serving_chat is not None else None if chat_handler_fn is None: response_futures.append( make_async_error_request_output( @@ -380,8 +387,8 @@ async def main(args): run_request(chat_handler_fn, request, tracker)) tracker.submitted() elif request.url == "/v1/embeddings": - embed_handler_fn = (None if openai_serving_embedding is None else - openai_serving_embedding.create_embedding) + embed_handler_fn = openai_serving_embedding.create_embedding if \ + openai_serving_embedding is not None else None if embed_handler_fn is None: response_futures.append( make_async_error_request_output( @@ -393,9 +400,9 @@ async def main(args): response_futures.append( run_request(embed_handler_fn, request, tracker)) tracker.submitted() - elif request.url == "/v1/score": - score_handler_fn = (None if openai_serving_scores is None else - openai_serving_scores.create_score) + elif request.url.endswith("/score"): + score_handler_fn = openai_serving_scores.create_score if \ + openai_serving_scores is not None else None if score_handler_fn is None: response_futures.append( make_async_error_request_output( @@ -407,13 +414,29 @@ async def main(args): response_futures.append( run_request(score_handler_fn, request, tracker)) tracker.submitted() + elif request.url.endswith("/rerank"): + rerank_handler_fn = openai_serving_scores.do_rerank if \ + openai_serving_scores is not None else None + if rerank_handler_fn is None: + response_futures.append( + make_async_error_request_output( + request, + error_msg="The model does not support Rerank API", + )) + continue + + 
response_futures.append( + run_request(rerank_handler_fn, request, tracker)) + tracker.submitted() else: response_futures.append( make_async_error_request_output( request, - error_msg= - "Only /v1/chat/completions, /v1/embeddings, and /v1/score " - "are supported in the batch endpoint.", + error_msg=f"URL {request.url} was used. " + "Supported endpoints: /v1/chat/completions, /v1/embeddings," + " /score, /rerank ." + "See vllm/entrypoints/openai/api_server.py for supported " + "score/rerank versions.", )) with tracker.pbar(): diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index ee18e0b0a454f..ea8e187dc6b7f 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -2,7 +2,6 @@ import asyncio import json -import re import time from collections.abc import AsyncGenerator, AsyncIterator from collections.abc import Sequence as GenericSequence @@ -10,6 +9,7 @@ from typing import Callable, Final, Optional, Union import jinja2 import partial_json_parser +import regex as re from fastapi import Request from pydantic import TypeAdapter @@ -236,6 +236,7 @@ class OpenAIServingChat(OpenAIServing): prompt=engine_prompt, request_id=request_id, params=sampling_params, + lora_request=lora_request, ) else: generator = self.engine_client.generate( @@ -987,7 +988,8 @@ class OpenAIServingChat(OpenAIServing): tool_calls=[ tool_call_class(function=FunctionCall( name=tool_call.name, - arguments=json.dumps(tool_call.parameters))) + arguments=json.dumps(tool_call.parameters, + ensure_ascii=False))) for tool_call in tool_calls ]) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 7beaae287de99..1c06070cb3154 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -186,6 +186,7 @@ class OpenAIServingCompletion(OpenAIServing): prompt=engine_prompt, request_id=request_id, 
params=sampling_params, + lora_request=lora_request, ) else: generator = self.engine_client.generate( diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 93de9f3a5c05c..f96a4ac8b3a51 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -134,11 +134,9 @@ class RequestProcessingMixin(BaseModel): Mixin for request processing, handling prompt preparation and engine input. """ - request_prompts: Optional[Sequence[RequestPrompt]] = \ - Field(default_factory=list) + request_prompts: Optional[Sequence[RequestPrompt]] = [] engine_prompts: Optional[Union[list[EngineTokensPrompt], - list[EngineEmbedsPrompt]]] = Field( - default_factory=list) + list[EngineEmbedsPrompt]]] = [] model_config = ConfigDict(arbitrary_types_allowed=True) @@ -528,12 +526,14 @@ class OpenAIServing: if isinstance(request, (EmbeddingChatRequest, EmbeddingCompletionRequest, ScoreRequest, RerankRequest, ClassificationRequest)): - operation = { - ScoreRequest: "score", - ClassificationRequest: "classification" - }.get(type(request), "embedding generation") if token_num > self.max_model_len: + operations: dict[type[AnyRequest], str] = { + ScoreRequest: "score", + ClassificationRequest: "classification" + } + operation = operations.get(type(request), + "embedding generation") raise ValueError( f"This model's maximum context length is " f"{self.max_model_len} tokens. However, you requested " @@ -582,7 +582,8 @@ class OpenAIServing: add_special_tokens: bool = True, ) -> TextTokensPrompt: """ - A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs` + A simpler implementation of + [`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs] that assumes single input. 
""" return next( @@ -603,7 +604,8 @@ class OpenAIServing: add_special_tokens: bool = True, ) -> Iterator[TextTokensPrompt]: """ - A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs` + A simpler implementation of + [`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs] that assumes multiple inputs. """ for text in prompt_inputs: diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 5ef1a486d86c8..0d739bbf9bf22 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -110,7 +110,12 @@ class OpenAIServingTokenization(OpenAIServing): dict) and "prompt_token_ids" in engine_prompt: input_ids.extend(engine_prompt["prompt_token_ids"]) + token_strs = None + if request.return_token_strs: + token_strs = tokenizer.convert_ids_to_tokens(input_ids) + return TokenizeResponse(tokens=input_ids, + token_strs=token_strs, count=len(input_ids), max_model_len=self.max_model_len) diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py index 13565d0ef8dd7..9fc5b562e7d5c 100644 --- a/vllm/entrypoints/openai/serving_transcription.py +++ b/vllm/entrypoints/openai/serving_transcription.py @@ -278,7 +278,9 @@ class OpenAIServingTranscription(OpenAIServing): result_generator: Optional[AsyncGenerator[RequestOutput, None]] = None try: - # TODO(rob): subtract len of tokenized prompt. + # Unlike most decoder-only models, whisper generation length is not + # constrained by the size of the input audio, which is mapped to a + # fixed-size log-mel-spectrogram.
default_max_tokens = self.model_config.max_model_len sampling_params = request.to_sampling_params( default_max_tokens, self.default_sampling_params) diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index f7c7112b124fd..054c0b006b2fc 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -7,6 +7,7 @@ from .granite_tool_parser import GraniteToolParser from .hermes_tool_parser import Hermes2ProToolParser from .internlm2_tool_parser import Internlm2ToolParser from .jamba_tool_parser import JambaToolParser +from .llama4_pythonic_tool_parser import Llama4PythonicToolParser from .llama_tool_parser import Llama3JsonToolParser from .mistral_tool_parser import MistralToolParser from .phi4mini_tool_parser import Phi4MiniJsonToolParser @@ -16,5 +17,6 @@ __all__ = [ "ToolParser", "ToolParserManager", "Granite20bFCToolParser", "GraniteToolParser", "Hermes2ProToolParser", "MistralToolParser", "Internlm2ToolParser", "Llama3JsonToolParser", "JambaToolParser", - "PythonicToolParser", "Phi4MiniJsonToolParser", "DeepSeekV3ToolParser" + "Llama4PythonicToolParser", "PythonicToolParser", "Phi4MiniJsonToolParser", + "DeepSeekV3ToolParser" ] diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py index bd8e87e4cee84..14e743e13a727 100644 --- a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py @@ -1,9 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 -import re from collections.abc import Sequence from typing import Union +import regex as re + from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py 
b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index b93de6b418172..383e0d44de99f 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -1,12 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 import json -import re from collections.abc import Sequence from json import JSONDecoder from typing import Union import partial_json_parser +import regex as re from partial_json_parser.core.options import Allow from vllm.entrypoints.chat_utils import random_tool_call_id @@ -80,7 +80,8 @@ class Granite20bFCToolParser(ToolParser): function=FunctionCall( name=function_call["name"], # function call args are JSON but as a string - arguments=json.dumps(function_call["arguments"]), + arguments=json.dumps(function_call["arguments"], + ensure_ascii=False), ), ) for function_call in raw_function_calls ] @@ -166,7 +167,8 @@ class Granite20bFCToolParser(ToolParser): if self.current_tool_id >= 0: cur_arguments = current_tool_call.get("arguments") if cur_arguments: - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) sent = len( self.streamed_args_for_tool[self.current_tool_id]) argument_diff = cur_args_json[sent:] @@ -218,7 +220,8 @@ class Granite20bFCToolParser(ToolParser): if cur_arguments: sent = len( self.streamed_args_for_tool[self.current_tool_id]) - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) prev_arguments = self.prev_tool_call_arr[ self.current_tool_id].get("arguments") @@ -226,7 +229,8 @@ class Granite20bFCToolParser(ToolParser): if is_complete[self.current_tool_id]: argument_diff = cur_args_json[sent:] elif prev_arguments: - prev_args_json = json.dumps(prev_arguments) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) if cur_args_json != prev_args_json: prefix = find_common_prefix( diff --git 
a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py index 6710e7938c43d..b8bf142530ee3 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -67,7 +67,8 @@ class GraniteToolParser(ToolParser): function=FunctionCall( name=function_call["name"], # function call args are JSON but as a string - arguments=json.dumps(function_call["arguments"]), + arguments=json.dumps(function_call["arguments"], + ensure_ascii=False), ), ) for function_call in raw_function_calls ] @@ -151,7 +152,8 @@ class GraniteToolParser(ToolParser): if self.current_tool_id >= 0: cur_arguments = current_tool_call.get("arguments") if cur_arguments: - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) sent = len( self.streamed_args_for_tool[self.current_tool_id]) argument_diff = cur_args_json[sent:] @@ -197,7 +199,8 @@ class GraniteToolParser(ToolParser): if cur_arguments: sent = len( self.streamed_args_for_tool[self.current_tool_id]) - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) prev_arguments = self.prev_tool_call_arr[ self.current_tool_id].get("arguments") @@ -205,7 +208,8 @@ class GraniteToolParser(ToolParser): if is_complete[self.current_tool_id]: argument_diff = cur_args_json[sent:] elif prev_arguments: - prev_args_json = json.dumps(prev_arguments) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) if cur_args_json != prev_args_json: prefix = find_common_prefix( prev_args_json, cur_args_json) diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index e56a8ef7193c1..2b9f9852bcb32 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -1,11 
+1,11 @@ # SPDX-License-Identifier: Apache-2.0 import json -import re from collections.abc import Sequence from typing import Union import partial_json_parser +import regex as re from partial_json_parser.core.options import Allow from vllm.entrypoints.chat_utils import random_tool_call_id diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index 5abd553d884d0..3f2799f8010a5 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -133,7 +133,8 @@ class Internlm2ToolParser(ToolParser): delta = None # first time to get parameters elif cur_arguments and not prev_arguments: - cur_arguments_json = json.dumps(cur_arguments) + cur_arguments_json = json.dumps(cur_arguments, + ensure_ascii=False) arguments_delta = cur_arguments_json[:cur_arguments_json. index(delta_text) + @@ -148,8 +149,10 @@ class Internlm2ToolParser(ToolParser): self.current_tool_id] += arguments_delta # both prev and cur parameters, send the increase parameters elif cur_arguments and prev_arguments: - cur_args_json = json.dumps(cur_arguments) - prev_args_json = json.dumps(prev_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) argument_diff = extract_intermediate_diff( cur_args_json, prev_args_json) @@ -190,7 +193,8 @@ class Internlm2ToolParser(ToolParser): action_dict = json.loads(action) name, parameters = action_dict['name'], json.dumps( action_dict.get('parameters', action_dict.get('arguments', - {}))) + {})), + ensure_ascii=False) if not tools or name not in [t.function.name for t in tools]: ExtractedToolCallInformation(tools_called=False, diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py index 6cac6f8163bfe..2714a545f997f 100644 --- 
a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -1,11 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 import json -import re from collections.abc import Sequence from typing import Union import partial_json_parser +import regex as re from partial_json_parser.core.options import Allow from vllm.entrypoints.chat_utils import random_tool_call_id @@ -96,8 +96,9 @@ class JambaToolParser(ToolParser): function=FunctionCall( name=function_call["name"], # function call args are JSON but as a string - arguments=json.dumps(function_call["arguments"]))) - for function_call in raw_function_calls + arguments=json.dumps(function_call["arguments"], + ensure_ascii=False), + )) for function_call in raw_function_calls ] content = model_output[:model_output. @@ -187,7 +188,7 @@ class JambaToolParser(ToolParser): diff: Union[str, None] = current_tool_call.get("arguments") if diff: - diff = json.dumps(diff).replace( + diff = json.dumps(diff, ensure_ascii=False).replace( self.streamed_args_for_tool[self.current_tool_id], "") delta = DeltaMessage(tool_calls=[ @@ -248,7 +249,8 @@ class JambaToolParser(ToolParser): "mid-arguments") delta = None elif cur_arguments and not prev_arguments: - cur_arguments_json = json.dumps(cur_arguments) + cur_arguments_json = json.dumps(cur_arguments, + ensure_ascii=False) logger.debug("finding %s in %s", new_text, cur_arguments_json) @@ -267,8 +269,10 @@ class JambaToolParser(ToolParser): self.current_tool_id] += arguments_delta elif cur_arguments and prev_arguments: - cur_args_json = json.dumps(cur_arguments) - prev_args_json = json.dumps(prev_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) logger.debug("Searching for diff between \n%s\n%s", cur_args_json, prev_args_json) diff --git a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py 
b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py new file mode 100644 index 0000000000000..323fb144181ea --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py @@ -0,0 +1,315 @@ +# SPDX-License-Identifier: Apache-2.0 +import ast +import json +from collections.abc import Sequence +from typing import Any, Union + +import regex as re +from transformers import PreTrainedTokenizerBase + +import vllm.envs as envs +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaFunctionCall, DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser, ToolParserManager) +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class _UnexpectedAstError(Exception): + pass + + +@ToolParserManager.register_module("llama4_pythonic") +class Llama4PythonicToolParser(ToolParser): + """ + Toolcall parser for Llama4 that produce tool calls in a pythonic style + Use --enable-auto-tool-choice --tool-call-parser llama4_pythonic + """ + # TODO(mdepinet): Possible future improvements: + # 1. Support text + tools separated by either <|python_tag|> or \n\n + # 2. Support tools outside of a list (or separated by a semicolon). + # This depends on item 1 for consistent streaming. + # Neither of these are necessary for e.g. ToolACE, but both would help make + # Llama3.2 models more reliable. + + TOOL_CALL_REGEX = re.compile( + r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]", + re.DOTALL) + + def __init__(self, tokenizer: PreTrainedTokenizerBase): + super().__init__(tokenizer) + + # Rename for readability. This is NOT a tool id. 
+ @property + def current_tool_index(self) -> int: + return self.current_tool_id + + @current_tool_index.setter + def current_tool_index(self, value: int) -> None: + self.current_tool_id = value + + def extract_tool_calls( + self, model_output: str, + request: ChatCompletionRequest) -> ExtractedToolCallInformation: + """ + Extract the tool calls from a complete model response. + """ + + # remove <|python_start|> and <|python_end|> + # as Llama 4 model sometime will output those tokens + if model_output.startswith("<|python_start|>"): + model_output = model_output[len("<|python_start|>"):] + model_output = model_output.replace("<|python_end|>", "") + + is_tool_call_pattern = False + try: + is_tool_call_pattern = self.TOOL_CALL_REGEX.match( + model_output, + timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS) is not None + except TimeoutError: + logger.warning( + "Regex timeout occurred when matching tool call pattern.") + logger.debug("Regex timeout occurred when matching user input: %s", + model_output) + + if not is_tool_call_pattern: + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + try: + module = ast.parse(model_output) + parsed = getattr(module.body[0], "value", None) + if isinstance(parsed, ast.List) and all( + isinstance(e, ast.Call) for e in parsed.elts): + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=[ + _handle_single_tool(e) # type: ignore + for e in parsed.elts + ], + content=None) + else: + raise _UnexpectedAstError( + "Tool output must be a list of function calls") + except Exception: + logger.exception("Error in extracting tool call from response.") + # Treat as regular text + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: 
Sequence[int], + request: ChatCompletionRequest, + ) -> Union[DeltaMessage, None]: + + if not current_text.startswith("[") and not current_text.startswith( + "<|python_start|>"): + return DeltaMessage(content=delta_text) + + try: + # remove <|python_start|> and <|python_end|> + if current_text.startswith("<|python_start|>"): + current_text = current_text[len("<|python_start|>"):] + if current_text.endswith("<|python_end|>"): + current_text = current_text[:current_text. + rfind("<|python_end|>")] + valid_and_added_text = _make_valid_python(current_text) + if valid_and_added_text is None: + return None + valid_text, added_text = valid_and_added_text + + module = ast.parse(valid_text) + parsed = getattr(module.body[0], "value", None) + if not isinstance(parsed, ast.List) or not all( + isinstance(e, ast.Call) for e in parsed.elts): + raise _UnexpectedAstError( + "Tool output must be a list of function calls") + tool_calls = [ + _handle_single_tool(e) # type: ignore + for e in parsed.elts + ] + + tool_deltas = [] + for index, new_call in enumerate(tool_calls): + if index < self.current_tool_index: + continue + + self.current_tool_index = index + if len(self.streamed_args_for_tool) == index: + self.streamed_args_for_tool.append("") + + new_call_complete = index < len( + tool_calls) - 1 or ")]" not in added_text + if new_call_complete: + self.current_tool_index += 1 + + withheld_suffix = (added_text[:-2] + if not new_call_complete else "") + if not new_call_complete and added_text[-2] == ")": + # Function call is incomplete. Withhold the closing bracket. + withheld_suffix = withheld_suffix + "}" + # Strings get single quotes in the model-produced string. + # JSON requires double quotes. 
+ withheld_suffix = withheld_suffix.replace("'", '"') + delta = _compute_tool_delta(self.streamed_args_for_tool[index], + new_call, index, withheld_suffix) + + if delta is not None: + tool_deltas.append(delta) + if (delta.function is not None + and delta.function.arguments is not None): + self.streamed_args_for_tool[ + index] += delta.function.arguments + + # HACK: serving_chat.py inspects the internal state of tool parsers + # when determining its final streaming delta, automatically + # adding autocompleted JSON. + # These two lines avoid that nonsense while ensuring finish_reason + # is set to tool_calls when at least one tool is called. + if tool_deltas and not self.prev_tool_call_arr: + self.prev_tool_call_arr = [{"arguments": {}}] + + if tool_deltas: + return DeltaMessage(tool_calls=tool_deltas) + elif not added_text and self.current_tool_id > 0: + # Return an empty DeltaMessage once the tool calls are all done + # so that finish_reason gets set. + return DeltaMessage(content='') + else: + return None + except Exception: + logger.exception("Error trying to handle streaming tool call.") + logger.debug( + "Skipping chunk as a result of tool streaming extraction " + "error") + return None + + +def _get_parameter_value(val: ast.expr) -> Any: + if isinstance(val, ast.Constant): + return val.value + elif isinstance(val, ast.Dict): + if not all(isinstance(k, ast.Constant) for k in val.keys): + raise _UnexpectedAstError( + "Dict tool call arguments must have literal keys") + return { + k.value: _get_parameter_value(v) # type: ignore + for k, v in zip(val.keys, val.values) + } + elif isinstance(val, ast.List): + return [_get_parameter_value(v) for v in val.elts] + else: + raise _UnexpectedAstError("Tool call arguments must be literals") + + +def _handle_single_tool(call: ast.Call) -> ToolCall: + if not isinstance(call.func, ast.Name): + raise _UnexpectedAstError("Invalid tool call name") + function_name = call.func.id + arguments = {} + for keyword in call.keywords:
+ arguments[keyword.arg] = _get_parameter_value(keyword.value) + return ToolCall(type="function", + function=FunctionCall(name=function_name, + arguments=json.dumps(arguments))) + + +def _make_valid_python(text: str) -> Union[tuple[str, str], None]: + bracket_stack = [] + for index, char in enumerate(text): + if char in {"[", "(", "{"}: + bracket_stack.append(char) + elif char == "]": + if not bracket_stack or bracket_stack.pop() != "[": + raise _UnexpectedAstError("Mismatched square brackets") + elif char == ")": + if not bracket_stack or bracket_stack.pop() != "(": + raise _UnexpectedAstError("Mismatched parentheses") + elif char == "}": + if not bracket_stack or bracket_stack.pop() != "{": + raise _UnexpectedAstError("Mismatched curly braces") + elif char in {"'", '"'}: + if bracket_stack and bracket_stack[-1] == char: + if index > 0 and text[index - 1] == "\\": + # Treat an escaped quote as a regular character + pass + else: + bracket_stack.pop() + elif bracket_stack and bracket_stack[-1] in {"'", '"'}: + # Double quote within a single quote string or vice versa. + pass + else: + bracket_stack.append(char) + + text = text.rstrip() + if text.endswith("=") or text.endswith(":"): + # Since we have no type information for this property/parameter value, + # we can't fill in a valid value. 
+ return None + if bracket_stack and bracket_stack[-1] == "{": + trailing_dict_text = text[:text.rfind("{")] + num_keys = trailing_dict_text.count(":") + num_values = trailing_dict_text.count(",") + if num_keys <= num_values: + return None # Incomplete property name within parameter value + if bracket_stack and bracket_stack[-1] == "(": + trailing_params_text = text[:text.rfind("(")] + num_full_param_names = trailing_params_text.count("=") + num_full_param_values = trailing_params_text.count(",") + if num_full_param_names <= num_full_param_values: + return None # Incomplete parameter name + if text.endswith(","): + text = text[:-1] + if bracket_stack and bracket_stack[-1] == "[" and not text.endswith( + "[") and not text.endswith(")"): + return None # Incomplete function name + + added_text = "" + for char in reversed(bracket_stack): + if char == "[": + added_text += "]" + elif char == "(": + added_text += ")" + elif char == "{": + added_text += "}" + elif char == "'": + added_text += "'" + elif char == '"': + added_text += '"' + + return text + added_text, added_text + + +def _compute_tool_delta(previously_sent_args: str, new_call: ToolCall, + index: int, + withheld_suffix: str) -> Union[DeltaToolCall, None]: + new_call_args = new_call.function.arguments + if withheld_suffix: + assert new_call_args.endswith(withheld_suffix) + new_call_args = new_call_args[:-len(withheld_suffix)] + if not previously_sent_args: + return DeltaToolCall(id=new_call.id, + type="function", + index=index, + function=DeltaFunctionCall( + name=new_call.function.name, + arguments=new_call_args, + )) + + arg_diff = new_call_args[len(previously_sent_args):] + return DeltaToolCall( + id=None, index=index, function=DeltaFunctionCall( + arguments=arg_diff)) if arg_diff else None diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 9307034f40d6e..4eda7044cbbaf 100644 --- 
a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -1,12 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 import json -import re from collections.abc import Sequence from json import JSONDecoder from typing import Union import partial_json_parser +import regex as re from partial_json_parser.core.options import Allow from transformers import PreTrainedTokenizerBase @@ -88,7 +88,8 @@ class Llama3JsonToolParser(ToolParser): # function call args are JSON but as a string arguments=json.dumps(raw_function_call["arguments"] \ if "arguments" in raw_function_call \ - else raw_function_call["parameters"]))) + else raw_function_call["parameters"], + ensure_ascii=False))) for raw_function_call in function_call_arr ] @@ -174,7 +175,8 @@ class Llama3JsonToolParser(ToolParser): if self.current_tool_id >= 0: cur_arguments = current_tool_call.get("arguments") if cur_arguments: - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) sent = len( self.streamed_args_for_tool[self.current_tool_id]) argument_diff = cur_args_json[sent:] @@ -226,7 +228,8 @@ class Llama3JsonToolParser(ToolParser): if cur_arguments: sent = len( self.streamed_args_for_tool[self.current_tool_id]) - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) prev_arguments = self.prev_tool_call_arr[ self.current_tool_id].get("arguments") @@ -234,7 +237,8 @@ class Llama3JsonToolParser(ToolParser): if is_complete[self.current_tool_id]: argument_diff = cur_args_json[sent:] elif prev_arguments: - prev_args_json = json.dumps(prev_arguments) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) if cur_args_json != prev_args_json: prefix = find_common_prefix( diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index 9dbfe85ecc686..fecad7e653abc 
100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -1,13 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 import json -import re from collections.abc import Sequence from random import choices from string import ascii_letters, digits from typing import Union import partial_json_parser +import regex as re from partial_json_parser.core.options import Allow from pydantic import Field diff --git a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py index abf70a5e85c45..00690ad79a7ac 100644 --- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 import json -import re from collections.abc import Sequence from typing import Any, Optional +import regex as re from transformers import PreTrainedTokenizerBase from vllm.entrypoints.chat_utils import random_tool_call_id @@ -68,8 +68,8 @@ class Phi4MiniJsonToolParser(ToolParser): len(function_call_arr)) except json.JSONDecodeError as e: logger.error( - "Failed to parse function calls from model output: %s. " - "Error: %s", model_output, str(e)) + "Failed to parse function calls from model output. 
" + "Error: %s", str(e)) tool_calls: list[ToolCall] = [ ToolCall( @@ -79,10 +79,11 @@ class Phi4MiniJsonToolParser(ToolParser): name=raw_function_call["name"], # function call args are JSON but as a string arguments=json.dumps( - raw_function_call["arguments"] if "arguments" in - raw_function_call else - raw_function_call["parameters"]))) - for raw_function_call in function_call_arr + raw_function_call["arguments"] + if "arguments" in raw_function_call else + raw_function_call["parameters"], + ensure_ascii=False), + )) for raw_function_call in function_call_arr ] # get any content before the tool call diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py index bb91a35af3be1..bc5d15dcb82f4 100644 --- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py @@ -2,12 +2,13 @@ import ast import json -import re from collections.abc import Sequence from typing import Any, Union +import regex as re from transformers import PreTrainedTokenizerBase +import vllm.envs as envs from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -61,8 +62,18 @@ class PythonicToolParser(ToolParser): """ Extract the tool calls from a complete model response. 
""" + is_tool_call_pattern = False + try: + is_tool_call_pattern = self.TOOL_CALL_REGEX.match( + model_output, + timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS) is not None + except TimeoutError: + logger.warning( + "Regex timeout occurred when matching tool call pattern.") + logger.debug("Regex timeout occurred when matching user input: %s", + model_output) - if not (self.TOOL_CALL_REGEX.match(model_output)): + if not is_tool_call_pattern: return ExtractedToolCallInformation(tools_called=False, tool_calls=[], content=model_output) @@ -200,9 +211,12 @@ def _handle_single_tool(call: ast.Call) -> ToolCall: arguments = {} for keyword in call.keywords: arguments[keyword.arg] = _get_parameter_value(keyword.value) - return ToolCall(type="function", - function=FunctionCall(name=function_name, - arguments=json.dumps(arguments))) + return ToolCall( + type="function", + function=FunctionCall(name=function_name, + arguments=json.dumps(arguments, + ensure_ascii=False)), + ) def _make_valid_python(text: str) -> Union[tuple[str, str], None]: diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index 2fe6e1a9e9c40..1b0ea69096cc6 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -13,12 +13,24 @@ from vllm.logger import init_logger logger = init_logger(__name__) +VLLM_SERVE_PARSER_EPILOG = ( + "Tip: Use `vllm serve --help=<keyword>` to explore arguments from help.\n" + " - To view a argument group: --help=ModelConfig\n" + " - To view a single argument: --help=max-num-seqs\n" + " - To search by keyword: --help=max\n" + " - To list all groups: --help=listgroup") + async def listen_for_disconnect(request: Request) -> None: """Returns if a disconnect message is received""" while True: message = await request.receive() if message["type"] == "http.disconnect": + if request.app.state.enable_server_load_tracking: + # on timeout/cancellation the BackgroundTask in load_aware_call + # cannot decrement the server load metrics. 
+ # Must be decremented by with_cancellation instead. + request.app.state.server_load_metrics -= 1 break @@ -158,3 +170,55 @@ def _validate_truncation_size( tokenization_kwargs["max_length"] = truncate_prompt_tokens return truncate_prompt_tokens + + +def show_filtered_argument_or_group_from_help(parser): + import sys + for arg in sys.argv: + if arg.startswith('--help='): + search_keyword = arg.split('=', 1)[1] + + # List available groups + if search_keyword == 'listgroup': + print("\nAvailable argument groups:") + for group in parser._action_groups: + if group.title and not group.title.startswith( + "positional arguments"): + print(f" - {group.title}") + if group.description: + print(" " + group.description.strip()) + print() + sys.exit(0) + + # For group search + formatter = parser._get_formatter() + for group in parser._action_groups: + if group.title and group.title.lower() == search_keyword.lower( + ): + formatter.start_section(group.title) + formatter.add_text(group.description) + formatter.add_arguments(group._group_actions) + formatter.end_section() + print(formatter.format_help()) + sys.exit(0) + + # For single arg + matched_actions = [] + + for group in parser._action_groups: + for action in group._group_actions: + # search option name + if any(search_keyword.lower() in opt.lower() + for opt in action.option_strings): + matched_actions.append(action) + + if matched_actions: + print(f"\nParameters matching '{search_keyword}':\n") + formatter = parser._get_formatter() + formatter.add_arguments(matched_actions) + print(formatter.format_help()) + sys.exit(0) + + print(f"\nNo group or parameter matching '{search_keyword}'") + print("Tip: use `--help=listgroup` to view all groups.") + sys.exit(1) diff --git a/vllm/envs.py b/vllm/envs.py index dc23c8ea5314d..44baf5a189b43 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -15,6 +15,7 @@ if TYPE_CHECKING: VLLM_NCCL_SO_PATH: Optional[str] = None LD_LIBRARY_PATH: Optional[str] = None VLLM_USE_TRITON_FLASH_ATTN: bool = 
False + VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False VLLM_FLASH_ATTN_VERSION: Optional[int] = None LOCAL_RANK: int = 0 CUDA_VISIBLE_DEVICES: Optional[str] = None @@ -117,6 +118,8 @@ if TYPE_CHECKING: VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost" VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557 VLLM_ALL2ALL_BACKEND: str = "naive" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840 + VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1 def get_default_cache_root(): @@ -141,10 +144,10 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: def get_vllm_port() -> Optional[int]: """Get the port from VLLM_PORT environment variable. - + Returns: The port number as an integer if VLLM_PORT is set, None otherwise. - + Raises: ValueError: If VLLM_PORT is a URI, suggest k8s service discovery issue. """ @@ -157,17 +160,13 @@ def get_vllm_port() -> Optional[int]: return int(port) except ValueError as err: from urllib.parse import urlparse - try: - parsed = urlparse(port) - if parsed.scheme: - raise ValueError( - f"VLLM_PORT '{port}' appears to be a URI. " - "This may be caused by a Kubernetes service discovery issue" - "check the warning in: https://docs.vllm.ai/en/stable/serving/env_vars.html" - ) - except Exception: - pass - + parsed = urlparse(port) + if parsed.scheme: + raise ValueError( + f"VLLM_PORT '{port}' appears to be a URI. " + "This may be caused by a Kubernetes service discovery issue," + "check the warning in: https://docs.vllm.ai/en/stable/serving/env_vars.html" + ) from None raise ValueError( f"VLLM_PORT '{port}' must be a valid integer") from err @@ -175,7 +174,7 @@ def get_vllm_port() -> Optional[int]: # The begin-* and end* here are used by the documentation generator # to extract the used env vars. 
-# begin-env-vars-definition +# --8<-- [start:env-vars-definition] environment_variables: dict[str, Callable[[], Any]] = { @@ -289,6 +288,13 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1")), + # Use separate prefill and decode kernels for V1 attention instead of + # the unified triton kernel. + "VLLM_V1_USE_PREFILL_DECODE_ATTENTION": + lambda: + (os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False").lower() in + ("true", "1")), + # Force vllm to use a specific flash-attention version (2 or 3), only valid # when using the flash-attention backend. "VLLM_FLASH_ATTN_VERSION": @@ -299,9 +305,11 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: bool( os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"), - # Internal flag to enable/disable Inductor standalone compile - "VLLM_TEST_STANDALONE_COMPILE": - lambda: os.environ.get("VLLM_TEST_STANDALONE_COMPILE", "0") != "0", + # Feature flag to enable/disable Inductor standalone compile. + # In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is + # enabled by default. + "VLLM_USE_STANDALONE_COMPILE": + lambda: os.environ.get("VLLM_USE_STANDALONE_COMPILE", "1") == "1", # local rank of the process in the distributed setting, used to determine # the GPU device id @@ -322,8 +330,8 @@ environment_variables: dict[str, Callable[[], Any]] = { # Whether to log responses from API Server for debugging "VLLM_DEBUG_LOG_API_SERVER_RESPONSE": - lambda: os.environ.get("VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "False"). 
- lower() == "true", + lambda: os.environ.get("VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "False" + ).lower() == "true", # S3 access information, used for tensorizer to load model from S3 "S3_ACCESS_KEY_ID": @@ -809,11 +817,25 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: int(os.getenv("VLLM_NIXL_SIDE_CHANNEL_PORT", "5557")), # all2all backend for vllm's expert parallel communication + # Available options: + # - "naive": naive all2all implementation using all-reduce + # - "pplx": use pplx kernels "VLLM_ALL2ALL_BACKEND": lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"), + + # Control the maximum number of tokens per expert supported by the + # NVFP4 MoE CUTLASS Kernel. This value is used to create a buffer for + # the blockscale tensor of activations NVFP4 Quantization. + # This is used to prevent the kernel from running out of memory. + "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE": + lambda: int(os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")), + + # Regex timeout for use by the vLLM tool parsing plugins. + "VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS": + lambda: int(os.getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")), } -# end-env-vars-definition +# --8<-- [end:env-vars-definition] def __getattr__(name: str): @@ -873,7 +895,7 @@ def compute_hash() -> str: "VLLM_USE_TRITON_AWQ", "VLLM_DP_RANK", "VLLM_DP_SIZE", - "VLLM_TEST_STANDALONE_COMPILE", + "VLLM_USE_STANDALONE_COMPILE", ] for key in environment_variables_to_hash: if key in environment_variables: diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 522bd940211f8..40ca1d29939af 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -74,7 +74,7 @@ class ExecutorBase(ABC): `self` argument, in addition to the arguments passed in `args` and `kwargs`. The `self` argument will be the worker object. timeout: Maximum time in seconds to wait for execution. Raises a - {exc}`TimeoutError` on timeout. `None` means wait indefinitely. 
+ [`TimeoutError`][] on timeout. `None` means wait indefinitely. args: Positional arguments to pass to the worker method. kwargs: Keyword arguments to pass to the worker method. diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index 9b0b98731e033..8e67c7a41bb19 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -528,12 +528,12 @@ class RayDistributedExecutor(DistributedExecutorBase): ray.get(parallel_worker_tasks) def _check_ray_cgraph_installation(self): - import pkg_resources + import importlib.metadata + from packaging import version required_version = version.parse("2.43.0") - current_version = version.parse( - pkg_resources.get_distribution("ray").version) + current_version = version.parse(importlib.metadata.version("ray")) if current_version < required_version: raise ValueError(f"Ray version {required_version} is " f"required, but found {current_version}") diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 37cc07bfbb36a..7bc98a16f041d 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -87,9 +87,8 @@ try: # TODO(swang): This is needed right now because Ray Compiled Graph # executes on a background thread, so we need to reset torch's # current device. 
- import torch if not self.compiled_dag_cuda_device_set: - torch.cuda.set_device(self.worker.device) + current_platform.set_device(self.worker.device) self.compiled_dag_cuda_device_set = True output = self.worker._execute_model_spmd(execute_model_req, @@ -113,8 +112,7 @@ try: # Not needed pass else: - import torch - torch.cuda.set_device(self.worker.device) + current_platform.set_device(self.worker.device) self.compiled_dag_cuda_device_set = True diff --git a/vllm/forward_context.py b/vllm/forward_context.py index bb43302c323b3..6ed6015ab2f7f 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -10,7 +10,7 @@ import torch import torch.distributed as dist import vllm.envs as envs -from vllm.config import VllmConfig +from vllm.config import ParallelConfig, VllmConfig from vllm.logger import init_logger if TYPE_CHECKING: @@ -30,6 +30,53 @@ class DPMetadata: max_tokens_across_dp_cpu: torch.Tensor cu_tokens_across_dp_cpu: torch.Tensor + @staticmethod + def num_tokens_across_dp(num_tokens: int, dp_size: int, + dp_rank: int) -> torch.Tensor: + """ + Gather the num_tokens across all DP ranks and return results in a + CPU tensor of size dp_size. 
+ """ + num_tokens_across_dp = [0] * dp_size + num_tokens_across_dp[dp_rank] = num_tokens + num_tokens_tensor = torch.tensor(num_tokens_across_dp, + device="cpu", + dtype=torch.int32) + from vllm.distributed.parallel_state import get_dp_group + dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group) + return num_tokens_tensor + + @staticmethod + def make( + parallel_config: ParallelConfig, + attn_metadata: Any, + num_tokens: int, + num_tokens_across_dp: Optional[torch.Tensor] = None + ) -> "DPMetadata": + + assert parallel_config.data_parallel_size > 1 + dp_size = parallel_config.data_parallel_size + dp_rank = parallel_config.data_parallel_rank + if attn_metadata is not None and hasattr(attn_metadata, + "num_prefill_tokens"): + # for v0 attention backends + batchsize = attn_metadata.num_prefill_tokens + \ + attn_metadata.num_decode_tokens + else: + # for v1 attention backends or no attn_metadata + batchsize = num_tokens + + # If num_tokens_across_dp is None, it will be computed by all_reduce + # Otherwise, num_tokens_across_dp[dp_rank] should be equal to batchsize + assert (num_tokens_across_dp is None + or num_tokens_across_dp[dp_rank] == batchsize) + if num_tokens_across_dp is None: + num_tokens_across_dp = DPMetadata.num_tokens_across_dp( + batchsize, dp_size, dp_rank) + max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp) + cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_across_dp, dim=0) + return DPMetadata(max_tokens_across_dp_cpu, cu_tokens_across_dp_cpu) + @dataclass class ForwardContext: @@ -64,28 +111,11 @@ def create_forward_context(attn_metadata: Any, virtual_engine: int = 0, num_tokens: int = 0): dp_metadata: Optional[DPMetadata] = None - if vllm_config.parallel_config.data_parallel_size > 1: - dp_size = vllm_config.parallel_config.data_parallel_size - dp_rank = vllm_config.parallel_config.data_parallel_rank - if attn_metadata is not None and hasattr(attn_metadata, - "num_prefill_tokens"): - # for v0 attention backends - batchsize = 
attn_metadata.num_prefill_tokens + \ - attn_metadata.num_decode_tokens - else: - # for v1 attention backends or no attn_metadata - batchsize = num_tokens - num_tokens_across_dp = [0] * dp_size - num_tokens_across_dp[dp_rank] = batchsize - num_tokens_tensor = torch.tensor(num_tokens_across_dp, - device="cpu", - dtype=torch.int32) - from vllm.distributed.parallel_state import get_dp_group - dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group) - max_tokens_across_dp_cpu = torch.max(num_tokens_tensor) - cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_tensor, dim=0) - dp_metadata = DPMetadata(max_tokens_across_dp_cpu, - cu_tokens_across_dp_cpu) + if vllm_config.parallel_config.data_parallel_size > 1 and ( + attn_metadata is not None or num_tokens is not None): + dp_metadata = DPMetadata.make(vllm_config.parallel_config, + attn_metadata, num_tokens or 0, + num_tokens_across_dp) return ForwardContext(no_compile_layers=vllm_config.compilation_config. static_forward_context, @@ -142,7 +172,10 @@ def set_forward_context(attn_metadata: Any, # we use synchronous scheduling right now, # adding a sync point here should not affect # scheduling of the next batch - torch.cuda.synchronize() + from vllm.platforms import current_platform + synchronize = current_platform.synchronize + if synchronize is not None: + synchronize() now = time.perf_counter() # time measurement is in milliseconds batchsize_forward_time[batchsize].append( diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index 0673aece91087..df4f844cd815e 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -10,8 +10,9 @@ from .registry import (DummyData, InputContext, InputProcessingContext, INPUT_REGISTRY = InputRegistry() """ -The global {class}`~InputRegistry` which is used by {class}`~vllm.LLMEngine` -to dispatch data processing according to the target model. 
+The global [`InputRegistry`][vllm.inputs.registry.InputRegistry] which is used +by [`LLMEngine`][vllm.LLMEngine] to dispatch data processing according to the +target model. """ __all__ = [ diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 3b58ec47d5bff..843c45bd6163e 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -80,22 +80,24 @@ SingletonPrompt = Union[str, TextPrompt, TokensPrompt, EmbedsPrompt] """ Set of possible schemas for a single prompt: -- A text prompt ({class}`str` or {class}`TextPrompt`) -- A tokenized prompt ({class}`TokensPrompt`) -- An embeddings prompt ({class}`EmbedsPrompt`) +- A text prompt ([`str`][] or [`TextPrompt`][vllm.inputs.data.TextPrompt]) +- A tokenized prompt ([`TokensPrompt`][vllm.inputs.data.TokensPrompt]) +- An embeddings prompt ([`EmbedsPrompt`][vllm.inputs.data.EmbedsPrompt]) Note that "singleton" is as opposed to a data structure which encapsulates multiple prompts, i.e. of the sort which may be utilized for encoder/decoder models when the user desires to express both the encoder & decoder -prompts explicitly, i.e. {class}`ExplicitEncoderDecoderPrompt` +prompts explicitly, i.e. +[`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt] -A prompt of type {class}`SingletonPrompt` may be employed -as (1) input to a decoder-only model, (2) input to +A prompt of type [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] may be +employed as (1) input to a decoder-only model, (2) input to the encoder of an encoder/decoder model, in the scenario where the decoder-prompt is not specified explicitly, or (3) as a member of a larger data structure encapsulating -more than one prompt, i.e. {class}`ExplicitEncoderDecoderPrompt` +more than one prompt, i.e. +[`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt] """ @@ -126,18 +128,20 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]): comprising an explicit encoder prompt and a decoder prompt. 
The encoder and decoder prompts, respectively, may be formatted - according to any of the {class}`SingletonPrompt` schemas, + according to any of the + [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] schemas, and are not required to have the same schema. Only the encoder prompt may have multi-modal data. mm_processor_kwargs should be at the top-level, and should not be set in the encoder/decoder prompts, since they are agnostic to the encoder/decoder. - Note that an {class}`ExplicitEncoderDecoderPrompt` may not - be used as an input to a decoder-only model, + Note that an + [`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt] + may not be used as an input to a decoder-only model, and that the `encoder_prompt` and `decoder_prompt` fields of this data structure themselves must be - {class}`SingletonPrompt` instances. + [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] instances. """ encoder_prompt: _T1_co @@ -152,11 +156,11 @@ PromptType = Union[SingletonPrompt, ExplicitEncoderDecoderPrompt] Set of possible schemas for an LLM input, including both decoder-only and encoder/decoder input types: -- A text prompt ({class}`str` or {class}`TextPrompt`) -- A tokenized prompt ({class}`TokensPrompt`) -- An embeddings prompt ({class}`EmbedsPrompt`) +- A text prompt ([`str`][] or [`TextPrompt`][vllm.inputs.data.TextPrompt]) +- A tokenized prompt ([`TokensPrompt`][vllm.inputs.data.TokensPrompt]) +- An embeddings prompt ([`EmbedsPrompt`][vllm.inputs.data.EmbedsPrompt]) - A single data structure containing both an encoder and a decoder prompt - ({class}`ExplicitEncoderDecoderPrompt`) + ([`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]) """ @@ -189,7 +193,8 @@ def token_inputs( prompt: Optional[str] = None, cache_salt: Optional[str] = None, ) -> TokenInputs: - """Construct {class}`TokenInputs` from optional values.""" + """Construct [`TokenInputs`][vllm.inputs.data.TokenInputs] from optional + values.""" inputs = 
TokenInputs(type="token", prompt_token_ids=prompt_token_ids) if prompt is not None: @@ -221,7 +226,8 @@ def embeds_inputs( prompt_embeds: torch.Tensor, cache_salt: Optional[str] = None, ) -> EmbedsInputs: - """Construct :class:`EmbedsInputs` from optional values.""" + """Construct [`EmbedsInputs`][vllm.inputs.data.EmbedsInputs] from optional + values.""" inputs = EmbedsInputs(type="embeds", prompt_embeds=prompt_embeds) if cache_salt is not None: @@ -232,7 +238,7 @@ def embeds_inputs( DecoderOnlyInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"] """ -The inputs in {class}`~vllm.LLMEngine` before they are +The inputs in [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] before they are passed to the model executor. This specifies the data required for decoder-only models. """ @@ -240,11 +246,12 @@ This specifies the data required for decoder-only models. class EncoderDecoderInputs(TypedDict): """ - The inputs in {class}`~vllm.LLMEngine` before they are - passed to the model executor. + The inputs in [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] before they + are passed to the model executor. This specifies the required data for encoder-decoder models. """ + encoder: Union[TokenInputs, "MultiModalInputs"] """The inputs for the encoder portion.""" @@ -254,13 +261,13 @@ class EncoderDecoderInputs(TypedDict): SingletonInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"] """ -A processed {class}`SingletonPrompt` which can be passed to -{class}`vllm.sequence.Sequence`. +A processed [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] which can be +passed to [`vllm.sequence.Sequence`][]. """ ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs] """ -The inputs to {data}`vllm.inputs.InputProcessor`. +The outputs from [`vllm.inputs.preprocess.InputPreprocessor`][]. 
""" _T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt) @@ -277,7 +284,8 @@ def build_explicit_enc_dec_prompt( return ExplicitEncoderDecoderPrompt( encoder_prompt=encoder_prompt, decoder_prompt=decoder_prompt, - mm_processor_kwargs=mm_processor_kwargs) + mm_processor_kwargs=mm_processor_kwargs, + ) def zip_enc_dec_prompts( @@ -288,7 +296,8 @@ def zip_enc_dec_prompts( ) -> list[ExplicitEncoderDecoderPrompt[_T1, _T2]]: """ Zip encoder and decoder prompts together into a list of - {class}`ExplicitEncoderDecoderPrompt` instances. + [`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt] + instances. ``mm_processor_kwargs`` may also be provided; if a dict is passed, the same dictionary will be used for every encoder/decoder prompt. If an iterable is @@ -299,10 +308,11 @@ def zip_enc_dec_prompts( if isinstance(mm_processor_kwargs, dict): return [ build_explicit_enc_dec_prompt( - encoder_prompt, decoder_prompt, - cast(dict[str, Any], mm_processor_kwargs)) - for (encoder_prompt, - decoder_prompt) in zip(enc_prompts, dec_prompts) + encoder_prompt, + decoder_prompt, + cast(dict[str, Any], mm_processor_kwargs), + ) for (encoder_prompt, + decoder_prompt) in zip(enc_prompts, dec_prompts) ] return [ build_explicit_enc_dec_prompt(encoder_prompt, decoder_prompt, diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py index d17122b483446..4c64a41ace310 100644 --- a/vllm/inputs/parse.py +++ b/vllm/inputs/parse.py @@ -23,13 +23,13 @@ class ParsedTokens(TypedDict): @overload def parse_and_batch_prompt( - prompt: Union[str, list[str]]) -> Sequence[ParsedText]: + prompt: Union[str, list[str]], ) -> Sequence[ParsedText]: ... @overload def parse_and_batch_prompt( - prompt: Union[list[int], list[list[int]]]) -> Sequence[ParsedTokens]: + prompt: Union[list[int], list[list[int]]], ) -> Sequence[ParsedTokens]: ... 
@@ -86,7 +86,7 @@ class ParsedTokensPrompt(TypedDict): class ParsedEmbedsPrompt(TypedDict): - type: Literal['embeds'] + type: Literal["embeds"] content: EmbedsPrompt @@ -133,7 +133,7 @@ def parse_singleton_prompt(prompt: SingletonPrompt) -> ParsedSingletonPrompt: def is_explicit_encoder_decoder_prompt( - prompt: PromptType) -> TypeIs[ExplicitEncoderDecoderPrompt]: + prompt: PromptType, ) -> TypeIs[ExplicitEncoderDecoderPrompt]: return isinstance(prompt, dict) and "encoder_prompt" in prompt diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 6e8effd60274f..b9acabeabd8df 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -67,11 +67,11 @@ class InputPreprocessor: return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id def get_decoder_start_token_id(self) -> Optional[int]: - ''' + """ Obtain the decoder start token id employed by an encoder/decoder model. Returns None for non-encoder/decoder models or if the model config is unavailable. - ''' + """ if not self.model_config.is_encoder_decoder: logger.warning_once( @@ -79,14 +79,14 @@ class InputPreprocessor: "this is not an encoder/decoder model.") return None - if (self.model_config is None or self.model_config.hf_config is None): + if self.model_config is None or self.model_config.hf_config is None: logger.warning_once( "Using None for decoder start token id because " "model config is not available.") return None dec_start_token_id = getattr(self.model_config.hf_config, - 'decoder_start_token_id', None) + "decoder_start_token_id", None) if dec_start_token_id is None: logger.warning_once( "Falling back on <BOS> for decoder start token " @@ -97,7 +97,7 @@ class InputPreprocessor: return dec_start_token_id def _get_default_enc_dec_decoder_prompt(self) -> list[int]: - ''' + """ Specifically for encoder/decoder models: generate a default decoder prompt for when the user specifies only the encoder prompt. 
@@ -126,7 +126,7 @@ class InputPreprocessor: Returns: * prompt_token_ids - ''' + """ bos_token_id = self.get_bos_token_id() assert bos_token_id is not None @@ -224,7 +224,10 @@ class InputPreprocessor: lora_request: Optional[LoRARequest], tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> list[int]: - """Async version of {meth}`_tokenize_prompt`.""" + """ + Async version of + [`_tokenize_prompt`][vllm.inputs.preprocess.InputPreprocessor._tokenize_prompt]. + """ tokenizer = self.get_tokenizer_group() tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs) @@ -287,7 +290,10 @@ class InputPreprocessor: lora_request: Optional[LoRARequest], return_mm_hashes: bool = False, ) -> MultiModalInputs: - """Async version of {meth}`_process_multimodal`.""" + """ + Async version of + [`_process_multimodal`][vllm.inputs.preprocess.InputPreprocessor._process_multimodal]. + """ tokenizer = await self._get_mm_tokenizer_async(lora_request) mm_processor = self.mm_registry.create_processor(self.model_config, @@ -472,7 +478,7 @@ class InputPreprocessor: Returns: - * {class}`SingletonInputs` instance + * [`SingletonInputs`][vllm.inputs.data.SingletonInputs] instance """ parsed = parse_singleton_prompt(prompt) @@ -508,7 +514,10 @@ class InputPreprocessor: lora_request: Optional[LoRARequest] = None, return_mm_hashes: bool = False, ) -> SingletonInputs: - """Async version of {meth}`_prompt_to_llm_inputs`.""" + """ + Async version of + [`_prompt_to_llm_inputs`][vllm.inputs.preprocess.InputPreprocessor._prompt_to_llm_inputs]. + """ parsed = parse_singleton_prompt(prompt) if parsed["type"] == "embeds": @@ -644,7 +653,9 @@ class InputPreprocessor: ) -> EncoderDecoderInputs: """ For encoder/decoder models only: - Process an input prompt into an {class}`EncoderDecoderInputs` instance. + Process an input prompt into an + [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs] + instance. 
There are two types of input prompts: singleton prompts which carry only the @@ -670,7 +681,8 @@ class InputPreprocessor: Returns: - * {class}`EncoderDecoderInputs` instance + * [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs] + instance """ encoder_inputs: SingletonInputs decoder_inputs: Optional[SingletonInputs] @@ -710,7 +722,10 @@ class InputPreprocessor: prompt: PromptType, tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> EncoderDecoderInputs: - """Async version of {meth}`_process_encoder_decoder_prompt`.""" + """ + Async version of + [`_process_encoder_decoder_prompt`][vllm.inputs.preprocess.InputPreprocessor._process_encoder_decoder_prompt]. + """ encoder_inputs: SingletonInputs decoder_inputs: Optional[SingletonInputs] @@ -778,7 +793,8 @@ class InputPreprocessor: ) -> DecoderOnlyInputs: """ For decoder-only models: - Process an input prompt into an {class}`DecoderOnlyInputs` instance. + Process an input prompt into a + [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] instance. Arguments: @@ -789,7 +805,7 @@ class InputPreprocessor: Returns: - * {class}`DecoderOnlyInputs` instance + * [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] instance """ prompt_comps = self._prompt_to_llm_inputs( @@ -812,7 +828,10 @@ class InputPreprocessor: prompt_adapter_request: Optional[PromptAdapterRequest] = None, return_mm_hashes: bool = False, ) -> DecoderOnlyInputs: - """Async version of {meth}`_process_decoder_only_prompt`.""" + """ + Async version of + [`_process_decoder_only_prompt`][vllm.inputs.preprocess.InputPreprocessor._process_decoder_only_prompt]. 
+ """ prompt_comps = await self._prompt_to_llm_inputs_async( prompt, tokenization_kwargs=tokenization_kwargs, @@ -863,7 +882,10 @@ class InputPreprocessor: prompt_adapter_request: Optional[PromptAdapterRequest] = None, return_mm_hashes: bool = False, ) -> ProcessorInputs: - """Async version of {meth}`preprocess`.""" + """ + Async version of + [`preprocess`][vllm.inputs.preprocess.InputPreprocessor.preprocess]. + """ if self.model_config.is_encoder_decoder: assert not return_mm_hashes, ( "Multimodal hashes for encoder-decoder models should not be ", diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 148b3558c15e1..f424a8f613ab1 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -38,7 +38,7 @@ class InputContext: ) -> _C: """ Get the HuggingFace configuration - ({class}`transformers.PretrainedConfig`) of the model, + (`transformers.PretrainedConfig`) of the model, additionally checking its type. Raises: @@ -79,7 +79,7 @@ class InputContext: ) -> _P: """ Get the HuggingFace processor - ({class}`transformers.ProcessorMixin`) of the model, + (`transformers.ProcessorMixin`) of the model, additionally checking its type. Raises: diff --git a/vllm/logger.py b/vllm/logger.py index cf32041c5b700..fd16dd95bb1b3 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -68,22 +68,22 @@ class _VllmLogger(Logger): """ Note: This class is just to provide type information. - We actually patch the methods directly on the {class}`logging.Logger` + We actually patch the methods directly on the [`logging.Logger`][] instance to avoid conflicting with other libraries such as `intel_extension_for_pytorch.utils._logger`. """ def info_once(self, msg: str, *args: Hashable) -> None: """ - As {meth}`info`, but subsequent calls with the same message - are silently dropped. + As [`info`][logging.Logger.info], but subsequent calls with + the same message are silently dropped. 
""" _print_info_once(self, msg, *args) def warning_once(self, msg: str, *args: Hashable) -> None: """ - As {meth}`warning`, but subsequent calls with the same message - are silently dropped. + As [`warning`][logging.Logger.warning], but subsequent calls with + the same message are silently dropped. """ _print_warning_once(self, msg, *args) diff --git a/vllm/logging_utils/dump_input.py b/vllm/logging_utils/dump_input.py index 169e247940953..47ce0ab188bd6 100644 --- a/vllm/logging_utils/dump_input.py +++ b/vllm/logging_utils/dump_input.py @@ -18,7 +18,7 @@ logger = init_logger(__name__) def prepare_object_to_dump(obj) -> str: if isinstance(obj, str): - return "'{obj}'" # Double quotes + return f"'{obj}'" # Double quotes elif isinstance(obj, dict): dict_str = ', '.join({f'{str(k)}: {prepare_object_to_dump(v)}' \ for k, v in obj.items()}) @@ -42,9 +42,9 @@ def prepare_object_to_dump(obj) -> str: return obj.anon_repr() elif hasattr(obj, '__dict__'): items = obj.__dict__.items() - dict_str = ','.join([f'{str(k)}={prepare_object_to_dump(v)}' \ + dict_str = ', '.join([f'{str(k)}={prepare_object_to_dump(v)}' \ for k, v in items]) - return (f"{type(obj).__name__}({dict_str})") + return f"{type(obj).__name__}({dict_str})" else: # Hacky way to make sure we can serialize the object in JSON format try: diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 959fe4a672a6d..dfdc908d7e05b 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -1,13 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 -import copy import math import os -import re from collections.abc import Sequence from dataclasses import dataclass, field from typing import Any, Callable, Optional, Union +import regex as re import safetensors.torch import torch from torch import nn @@ -29,10 +28,12 @@ from vllm.lora.utils import (from_layer, from_layer_logits_processor, get_supported_lora_modules, is_regex_target_modules, parse_fine_tuned_lora_name, replace_submodule) +from 
vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.model_executor.models import SupportsLoRA, supports_multimodal from vllm.model_executor.models.interfaces import is_pooling_model from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper +from vllm.model_executor.utils import get_packed_modules_mapping from vllm.utils import is_pin_memory_available logger = init_logger(__name__) @@ -185,21 +186,21 @@ class LoRAModel(AdapterModel): @classmethod def from_local_checkpoint( - cls, - lora_dir: str, - expected_lora_modules: list[str], - peft_helper: PEFTHelper, - *, - lora_model_id: Optional[int] = None, - device: str = "cuda", - dtype: Optional[torch.dtype] = None, - target_embedding_padding: Optional[int] = None, - embedding_modules: Optional[dict[str, str]] = None, - embedding_padding_modules: Optional[list[str]] = None, - weights_mapper: Optional[WeightsMapper] = None, - ) -> "LoRAModel": + cls, + lora_dir: str, + expected_lora_modules: list[str], + peft_helper: PEFTHelper, + *, + lora_model_id: Optional[int] = None, + device: str = "cuda", + dtype: Optional[torch.dtype] = None, + target_embedding_padding: Optional[int] = None, + embedding_modules: Optional[dict[str, str]] = None, + embedding_padding_modules: Optional[list[str]] = None, + weights_mapper: Optional[WeightsMapper] = None, + tensorizer_config_dict: Optional[dict] = None) -> "LoRAModel": """Create a LoRAModel from a local checkpoint. - + Args: lora_dir: The local path that has lora data. 
expected_lora_modules: Name of modules that are expected to be @@ -219,10 +220,36 @@ class LoRAModel(AdapterModel): lora_dir, "new_embeddings.safetensors") new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin") + tensors: dict[str, torch.Tensor] = {} + unexpected_modules: list[Union[list[str], str]] = [] - unexpected_modules: list[Union[list[str], str]] - if os.path.isfile(lora_tensor_path): - tensors: dict[str, torch.Tensor] = {} + def check_unexpected_modules(modules: dict): + for lora_module in modules.keys(): # noqa + module_name, _, _ = parse_fine_tuned_lora_name( + lora_module, weights_mapper) + part_name = module_name.split(".")[-1] + if part_name not in expected_lora_modules: + unexpected_modules.append(module_name) + if unexpected_modules: + raise ValueError( + f"While loading {lora_dir}, expected" + f" target modules in {expected_lora_modules}" + f" but received {unexpected_modules}." + f" Please verify that the loaded LoRA module is correct") + + if tensorizer_config_dict: + from tensorizer import TensorDeserializer + + tensorizer_config = TensorizerConfig(**tensorizer_config_dict) + lora_tensor_path = os.path.join(tensorizer_config.tensorizer_dir, + "adapter_model.tensors") + tensorizer_args = tensorizer_config._construct_tensorizer_args() + tensors = TensorDeserializer(lora_tensor_path, + dtype=tensorizer_config.dtype, + **tensorizer_args.deserializer_params) + check_unexpected_modules(tensors) + + elif os.path.isfile(lora_tensor_path): # Find unexpected modules. # Use safetensor key as a source of truth to find expected modules. 
# in peft if you have target_modules A, B, C and C does not exist @@ -232,20 +259,8 @@ class LoRAModel(AdapterModel): unexpected_modules = [] with safetensors.safe_open(lora_tensor_path, framework="pt") as f: # type: ignore - for lora_module in f.keys(): # noqa - module_name, _, _ = parse_fine_tuned_lora_name( - lora_module, weights_mapper) - part_name = module_name.split(".")[-1] - if part_name not in expected_lora_modules: - unexpected_modules.append(module_name) - if unexpected_modules: - raise ValueError( - f"While loading {lora_dir}, expected" - f" target modules in {expected_lora_modules}" - f" but received {unexpected_modules}." - f" Please verify that the loaded LoRA module is correct" - ) # Load tensors if there are only expected modules. + check_unexpected_modules(f) for module in f.keys(): # noqa tensors[module] = f.get_tensor(module) elif os.path.isfile(lora_bin_file_path): @@ -349,8 +364,8 @@ class LoRAModelManager(AdapterModelManager): # We need to replace rotary emb layer to do batch computation # for long lora. self.supported_lora_modules.append("rotary_emb") - self.packed_modules_mapping = copy.deepcopy( - self.model.packed_modules_mapping) + + self.packed_modules_mapping = get_packed_modules_mapping(self.model) # Used to indicate whether the model is a multimodal model self.supports_mm: bool = ( supports_multimodal(self.model) @@ -605,7 +620,7 @@ class LoRAModelManager(AdapterModelManager): def _filter_unsupported_mm_module(self, module_name: str) -> bool: """ Regarding multimodal models, vLLM currently only supports adding LoRA to - language model. LoRA for other modules, such as the vision tower, will + language model. LoRA for other modules, such as the vision tower, will be filtered out. 
""" if self.supports_mm: diff --git a/vllm/lora/ops/torch_ops/lora_ops.py b/vllm/lora/ops/torch_ops/lora_ops.py index af79f98415cbc..ab65faceb2c10 100644 --- a/vllm/lora/ops/torch_ops/lora_ops.py +++ b/vllm/lora/ops/torch_ops/lora_ops.py @@ -36,10 +36,13 @@ def bgmv_expand(inputs: torch.Tensor, if outputs.shape[0] == 1 and output_tensor.shape[0] != 1: limit = 1 + # LoRA adapter and model may add different amounts of padding to output + common_len = min(outputs.shape[1], output_tensor.shape[1]) + if add_inputs: - output_tensor[:, :outputs.shape[1]] += outputs[:limit, :] + output_tensor[:, :common_len] += outputs[:limit, :common_len] else: - output_tensor[:, :outputs.shape[1]] = outputs[:limit, :] + output_tensor[:, :common_len] = outputs[:limit, :common_len] def sgmv_shrink( diff --git a/vllm/lora/ops/xla_ops/lora_ops.py b/vllm/lora/ops/xla_ops/lora_ops.py index acbec0cfab9c7..dff4d5181efe2 100644 --- a/vllm/lora/ops/xla_ops/lora_ops.py +++ b/vllm/lora/ops/xla_ops/lora_ops.py @@ -1,63 +1,99 @@ # SPDX-License-Identifier: Apache-2.0 +import jax +import jax.numpy as jnp import torch - -# Required to register the custom ops -import vllm.lora.ops.xla_ops.pallas # noqa # pylint: disable=unused-import +import torch.nn.functional as F +import torch_xla.core.xla_builder as xb +from torch.library import impl +from torch_xla.experimental.custom_kernel import XLA_LIB, jax_import_guard -def bgmv_expand(inputs: torch.Tensor, - lora_b_weights: torch.Tensor, - output_tensor: torch.Tensor, - lora_indices_tensor: torch.Tensor, - add_inputs: bool = True): +@jax.jit +def bgmv_jax(inputs, loras, idxs): + return jnp.einsum( + "td,tX,Xld->tl", + inputs, + jax.nn.one_hot(idxs, loras.shape[0], dtype=inputs.dtype), + loras, + ) + + +XLA_LIB.define("bgmv(Tensor inputs, Tensor loras, Tensor idxs) -> Tensor") + + +@impl(XLA_LIB, "bgmv", "XLA") +def bgmv_xla(inputs: torch.Tensor, loras: torch.Tensor, idxs: torch.IntTensor): + if len(loras.shape) == 4: + loras = loras.squeeze(axis=1) + + 
jax_import_guard() + return xb.call_jax(bgmv_jax, (inputs, loras, idxs)) + + +@impl(XLA_LIB, "bgmv", "CompositeExplicitAutograd") +def bgmv_non_xla(inputs: torch.Tensor, loras: torch.Tensor, + idxs: torch.IntTensor): + T, _ = inputs.shape + if len(loras.shape) == 4: + loras = loras.squeeze(axis=1) + _, L, _ = loras.shape + + return torch.empty((T, L), device=inputs.device) + + +def bgmv_expand( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + add_inputs: bool = True, +): """ Args: inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size]. - - lora_b_weights (torch.Tensor): LoRA weights of shape + + lora_b_weights (torch.Tensor): LoRA weights of shape [num_loras, lora_rank, hidden_size]. - - output_tensor (torch.Tensor): output tensor of shape + + output_tensor (torch.Tensor): output tensor of shape [num_tokens, hidden_size * num_slices]. - - lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] + + lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] indicating which LoRA matrix to use for each token. - add_inputs (bool): Whether or not to add the input tensor to the output + add_inputs (bool): Whether or not to add the input tensor to the output tensor. 
""" outputs = torch.ops.xla.bgmv(inputs, lora_b_weights, lora_indices_tensor) - n_tokens = outputs.size(0) limit = output_tensor.shape[0] if outputs.shape[0] == 1 and output_tensor.shape[0] != 1: limit = 1 - outputs = torch.cat( - (outputs, - torch.zeros((n_tokens, output_tensor.shape[1] - outputs.shape[1]), - device=outputs.device)), - dim=1) + if output_tensor.shape[1] > outputs.shape[1]: + outputs = F.pad(outputs, + (0, output_tensor.shape[1] - outputs.shape[1], 0, 0)) if add_inputs: - return output_tensor + outputs[:limit, :] + return output_tensor + outputs[:limit, :output_tensor.shape[1]] else: - return outputs[:limit, :] + return outputs[:limit, :output_tensor.shape[1]] -def bgmv_shrink(inputs: torch.Tensor, - lora_b_weights: torch.Tensor, - output_tensor: torch.Tensor, - lora_indices_tensor: torch.Tensor, - scaling: float = 1.0): +def bgmv_shrink( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + lora_indices_tensor: torch.Tensor, + scaling: float = 1.0, +): """ Args: inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size]. - lora_b_weights (torch.Tensor): LoRA weights of shape + lora_b_weights (torch.Tensor): LoRA weights of shape [num_loras, lora_rank, hidden_size]. output_tensor (torch.Tensor): (Unused) output tensor (placeholder). - lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] + lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] indicating which LoRA matrix to use for each token. scaling (float, optional): Scalar multiplier applied to the output. 
""" @@ -66,39 +102,41 @@ def bgmv_shrink(inputs: torch.Tensor, lora_indices_tensor) -def bgmv_expand_slice(inputs: torch.Tensor, - lora_b_weights: torch.Tensor, - output_tensor: torch.Tensor, - lora_indices_tensor: torch.Tensor, - slice_offset: int, - slice_size: int, - add_inputs: bool = True): +def bgmv_expand_slice( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + slice_offset: int, + slice_size: int, + add_inputs: bool = True, +): """ Args: inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size]. - - lora_b_weights (torch.Tensor): LoRA weights of shape + + lora_b_weights (torch.Tensor): LoRA weights of shape [num_loras, lora_rank, hidden_size]. - - output_tensor (torch.Tensor): output tensor of shape + + output_tensor (torch.Tensor): output tensor of shape [num_tokens, hidden_size * num_slices]. - - lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] + + lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] indicating which LoRA matrix to use for each token. - add_inputs (bool): Whether or not to add the input tensor to the output + add_inputs (bool): Whether or not to add the input tensor to the output tensor. 
""" outputs = torch.ops.xla.bgmv(inputs, lora_b_weights, lora_indices_tensor) - n_tokens = outputs.size(0) - outputs = torch.cat(( - torch.zeros((n_tokens, slice_offset), device=outputs.device), + outputs = F.pad( outputs, - torch.zeros( - (n_tokens, output_tensor.shape[1] - (slice_offset + slice_size)), - device=outputs.device), - ), - dim=1) + ( + slice_offset, + output_tensor.shape[1] - (slice_offset + slice_size), + 0, + 0, + ), + ) if add_inputs: return output_tensor + outputs diff --git a/vllm/lora/ops/xla_ops/pallas.py b/vllm/lora/ops/xla_ops/pallas.py deleted file mode 100644 index 35dc307539bf4..0000000000000 --- a/vllm/lora/ops/xla_ops/pallas.py +++ /dev/null @@ -1,133 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -import functools - -import jax -import jax.numpy as jnp -import torch -from jax.experimental import pallas as pl -from jax.experimental.pallas import tpu as pltpu -from torch.library import impl -from torch_xla.experimental.custom_kernel import (XLA_LIB, jax_import_guard, - make_kernel_from_pallas) - -# TODO: Tune these -TOKENS_BLOCK = 16 -LORA_RANK_BLOCK = 128 -DIM_BLOCK_SIZE = 128 - - -def _bgmv_kernel(bT: int, bL: int, idx_ref, inp_ref, lora_ref, out_ref, - acc_ref, mask_ref): - - @pl.when(pl.program_id(2) == 0) - def _(): - acc_ref[...] = jnp.zeros_like(acc_ref[...], dtype=jnp.float32) - - t = pl.program_id(0) - - for i in range(bT): - idx = idx_ref[i + bT * t] - mask_ref[...] = jnp.zeros_like(mask_ref[...], dtype=jnp.float32) - mask_ref[i, :] = jnp.ones((bL, ), dtype=jnp.float32) - - acc_ref[...] += jax.lax.dot_general( - inp_ref[...], - lora_ref[idx, ...], (((1, ), (1, )), ((), ())), - preferred_element_type=jnp.float32) * mask_ref[...] - - @pl.when(pl.program_id(2) == pl.num_programs(2) - 1) - def _(): - out_ref[...] 
= acc_ref[...].astype(out_ref.dtype) - - -@jax.jit -def _bgmv( - idxs: jax.Array, # (T, ) int32 - inputs: jax.Array, # (T, D) model dtype - loras: jax.Array # (N, L, D) model dtype -) -> jax.Array: # (T, L) model dtype - T, D = inputs.shape - N, L, _ = loras.shape - - return pl.pallas_call( - kernel=functools.partial(_bgmv_kernel, TOKENS_BLOCK, LORA_RANK_BLOCK), - out_shape=jax.ShapeDtypeStruct((T, L), dtype=inputs.dtype), - grid_spec=pltpu.PrefetchScalarGridSpec( - num_scalar_prefetch=1, - grid=(T // TOKENS_BLOCK, L // LORA_RANK_BLOCK, - D // DIM_BLOCK_SIZE), - in_specs=[ - pl.BlockSpec((TOKENS_BLOCK, DIM_BLOCK_SIZE), - lambda i, j, k, block_idx: (i, k)), - pl.BlockSpec((N, LORA_RANK_BLOCK, DIM_BLOCK_SIZE), - lambda i, j, k, block_idx: (0, j, k)), - ], - out_specs=pl.BlockSpec((TOKENS_BLOCK, LORA_RANK_BLOCK), - lambda i, j, k, block_idx: (i, j)), - scratch_shapes=[ - pltpu.VMEM((TOKENS_BLOCK, LORA_RANK_BLOCK), jnp.float32), - pltpu.VMEM((TOKENS_BLOCK, LORA_RANK_BLOCK), jnp.float32) - ]), - compiler_params=pltpu.TPUCompilerParams( - dimension_semantics=("parallel", "parallel", "arbitrary")), - name="bgmv")(idxs, inputs, loras) - - -def bgmv_shape_function(idxs, inputs, loras): - T, _ = inputs.shape - _, L, _ = loras.shape - - return [((T, L), inputs.dtype)] - - -XLA_LIB.define("bgmv(Tensor inputs, Tensor loras, Tensor idxs) -> Tensor", ) - - -@impl(XLA_LIB, "bgmv", "XLA") -def bgmv_xla(inputs: torch.Tensor, loras: torch.Tensor, idxs: torch.IntTensor): - inputs = inputs.to(dtype=loras.dtype) - - if len(loras.shape) == 4: - loras = loras.squeeze(axis=1) - - jax_import_guard() - kernel = make_kernel_from_pallas(_bgmv, bgmv_shape_function) - - T, _ = inputs.shape - _, L, D = loras.shape - - # Pad the loras' rank if it's too low. This is to allow it to fit in a TPU - # register. 
This has to happen in pytorch, doing it in Jax will lead to NaNs - L1 = L - if LORA_RANK_BLOCK > L or L % LORA_RANK_BLOCK != 0: - L1 = (L // LORA_RANK_BLOCK + 1) * LORA_RANK_BLOCK - - D1 = D - if DIM_BLOCK_SIZE > D or D % DIM_BLOCK_SIZE != 0: - D1 = (D // DIM_BLOCK_SIZE + 1) * DIM_BLOCK_SIZE - - T1 = T - if TOKENS_BLOCK > T or T % TOKENS_BLOCK != 0: - T1 = (T // TOKENS_BLOCK + 1) * TOKENS_BLOCK - - if D1 != D or L1 != L: - loras = torch.nn.functional.pad(loras, (0, D1 - D, 0, L1 - L, 0, 0)) - if D1 != D or T1 != T: - inputs = torch.nn.functional.pad(inputs, (0, D1 - D, 0, T1 - T)) - if T1 != T: - idxs = torch.nn.functional.pad(idxs, ((0, T1 - T))) - - return kernel(idxs, inputs, loras)[:T, :L] - - -@impl(XLA_LIB, "bgmv", "CompositeExplicitAutograd") -def bgmv_non_xla(inputs: torch.Tensor, loras: torch.Tensor, - idxs: torch.IntTensor): - T, _ = inputs.shape - - if len(loras.shape) == 4: - loras = loras.squeeze(axis=1) - - _, L, _ = loras.shape - - return torch.empty((T, L), device=inputs.device) diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index d5de63f5baade..7d335e5f7fab1 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -10,6 +10,7 @@ from typing import Literal, Optional, Union from vllm.config import LoRAConfig from vllm.logger import init_logger +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig logger = init_logger(__name__) @@ -89,12 +90,31 @@ class PEFTHelper: return cls(**filtered_dict) @classmethod - def from_local_dir(cls, lora_path: str, - max_position_embeddings: Optional[int]) -> "PEFTHelper": + def from_local_dir( + cls, + lora_path: str, + max_position_embeddings: Optional[int], + tensorizer_config_dict: Optional[dict] = None) -> "PEFTHelper": lora_config_path = os.path.join(lora_path, "adapter_config.json") - with open(lora_config_path) as f: - config = json.load(f) + if tensorizer_config_dict: + tensorizer_config = TensorizerConfig(**tensorizer_config_dict) + tensorizer_args = 
tensorizer_config._construct_tensorizer_args() + from tensorizer.stream_io import open_stream + lora_config_path = os.path.join(tensorizer_config.lora_dir, + "adapter_config.json") + with open_stream(lora_config_path, + mode="rb", + **tensorizer_args.stream_params) as f: + config = json.load(f) + + logger.info("Successfully deserialized LoRA config from %s", + tensorizer_config.lora_dir) + + else: + with open(lora_config_path) as f: + config = json.load(f) + config["vllm_max_position_embeddings"] = max_position_embeddings return cls.from_dict(config) diff --git a/vllm/lora/punica_wrapper/punica_tpu.py b/vllm/lora/punica_wrapper/punica_tpu.py index f3153c6dab03c..0556e583f409a 100644 --- a/vllm/lora/punica_wrapper/punica_tpu.py +++ b/vllm/lora/punica_wrapper/punica_tpu.py @@ -1,11 +1,19 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, Union +import math +from typing import TYPE_CHECKING, Optional, Union import torch import torch.nn.functional as F +import torch_xla.core.xla_model as xm from vllm.lora.ops.xla_ops import bgmv_expand, bgmv_expand_slice, bgmv_shrink +from vllm.lora.punica_wrapper.utils import convert_mapping + +if TYPE_CHECKING: + # avoid circuit import + from vllm.lora.layers import LoRAMapping + from vllm.lora.models import LongContextLoRAContext from .punica_base import PunicaWrapperBase @@ -31,6 +39,15 @@ class PunicaWrapperTPU(PunicaWrapperBase): self._sampler_indices_padded = self._sampler_indices_padded.to( dtype=torch.int32) + torch.ops.xla.dynamo_set_buffer_donor_(self._token_lora_indices, True) + torch.ops.xla.dynamo_set_buffer_donor_(self._sampler_indices, True) + torch.ops.xla.dynamo_set_buffer_donor_(self._sampler_indices_padded, + True) + torch.ops.xla.dynamo_set_buffer_donor_(self._embeddings_indices, True) + torch.ops.xla.dynamo_set_buffer_donor_(self._long_lora_indices, True) + torch.ops.xla.dynamo_set_buffer_donor_(self._lora_indices_per_batch, + True) + torch._dynamo.mark_dynamic(self._token_lora_indices, 0) 
torch._dynamo.mark_dynamic(self._embeddings_indices, 1) torch._dynamo.mark_dynamic(self._sampler_indices_padded, 0) @@ -55,15 +72,11 @@ class PunicaWrapperTPU(PunicaWrapperBase): def shrink( self, - y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, scale: float, ): - if self.no_lora: - return y - return bgmv_shrink(x, w_t_all, y, self._get_token_lora_indices(x), - scale) + return bgmv_shrink(x, w_t_all, self._get_token_lora_indices(x), scale) def expand(self, y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, add_inputs: bool): @@ -72,7 +85,7 @@ class PunicaWrapperTPU(PunicaWrapperBase): def expand_slice(self, y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, y_offset: int, y_slice_size: int, - y_total_size: int, add_inputs: bool) -> torch.Tensor: + add_inputs: bool) -> torch.Tensor: return bgmv_expand_slice(x, w_t_all, y, self._get_token_lora_indices(x), y_offset, y_slice_size, add_inputs) @@ -98,9 +111,8 @@ class PunicaWrapperTPU(PunicaWrapperBase): x = x.view(-1, x.shape[-1]) for slice_idx in range(len(lora_a_stacked)): - y_s = y[slice_idx] lora_s = lora_a_stacked[slice_idx] - y_s = self.shrink(y_s, x, lora_s, scale) + y_s = self.shrink(x, lora_s, scale) y[slice_idx, :, :] = y_s # type: ignore[index] return y @@ -140,15 +152,12 @@ class PunicaWrapperTPU(PunicaWrapperBase): y = self._apply_bias(self._get_token_lora_indices(y), y, output_slices, lora_bias_stacked) for slice_idx in range(len(lora_b_stacked)): - y = self.expand_slice( - y, - x[slice_idx], - lora_b_stacked[slice_idx], - offset_left, - output_slices[slice_idx], - y_total_size=sum(output_slices), - add_inputs=add_inputs, - ) + y = self.expand_slice(y, + x[slice_idx], + lora_b_stacked[slice_idx], + offset_left, + output_slices[slice_idx], + add_inputs=add_inputs) offset_left += output_slices[slice_idx] return y.view_as(y_org) @@ -216,12 +225,10 @@ class PunicaWrapperTPU(PunicaWrapperBase): if buffer is None: r = lora_b_stacked[0].size(-1) - # We set the buffer to be float32 by 
default, consistent with the - # triton op T = x.size(0) buffer = torch.zeros( (len(output_slices), T, r), - dtype=torch.float32, + dtype=x.dtype, device=x.device, ) buffer = self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs) @@ -257,26 +264,16 @@ class PunicaWrapperTPU(PunicaWrapperBase): scale (float): Scaling factor. buffer (Optional[torch.Tensor]):Default to None. """ - if self.no_lora: - return y - y_org = y y = y.view(-1, y.shape[-1]) x = x.view(-1, x.shape[-1]) - r = lora_b_stacked.size(-1) - if buffer is None: - # We set the buffer to be float32 by default, consistent with the - # triton op - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) - buffer = bgmv_shrink(x, lora_a_stacked, buffer, self.sampler_indices, - scale) + sampler_indices = torch.narrow(self._sampler_indices, 0, 0, x.size(0)) + buffer = bgmv_shrink(x, lora_a_stacked, sampler_indices, scale) y = bgmv_expand(buffer, lora_b_stacked, y, - self.sampler_indices, + sampler_indices, add_inputs=True) return y.view_as(y_org) @@ -316,10 +313,92 @@ class PunicaWrapperTPU(PunicaWrapperBase): return output.view_as(org_output) + # This performs the same tensor ops as the base method, except it does them + # on the CPU then transfers the results to the TPU + def _update_base_metadata( + self, + mapping: "LoRAMapping", + lora_index_to_id: list[Optional[int]], + max_loras: int, + vocab_size: int, + extra_vocab_size: int, + long_lora_context: Optional["LongContextLoRAContext"] = None, + ): + # Make sure we don't accidentally collect outside operations + xm.mark_step() + + # Pad the prompt mapping to avoid running into recompiles on the TPU + # TODO: Should this happen inside mapping internally? If so how can we + # avoid having backend specific LoRAMapping classes? 
+ mapping.prompt_mapping = self._pad_prompt_mapping( + mapping.prompt_mapping) + + ( + base_indices, + sampler_indices, + sampler_indices_padded, + embeddings_indices, + long_lora_offsets_tensor, + indices_len, + ) = convert_mapping( + mapping, + lora_index_to_id, + max_loras, + vocab_size, + extra_vocab_size, + "cpu", + long_lora_context, + ) + self._token_lora_indices = self._pad_to_shape( + base_indices, self._token_lora_indices.shape, + dims=1).to(self.device) + self._sampler_indices = self._pad_to_shape(sampler_indices, + self._sampler_indices.shape, + dims=1).to(self.device) + self._sampler_indices_padded = self._pad_to_shape( + sampler_indices_padded, self._sampler_indices_padded.shape, + dims=1).to(self.device) + self._embeddings_indices = self._pad_to_shape( + embeddings_indices, self._embeddings_indices.shape, + dims=2).to(self.device) + if long_lora_offsets_tensor is not None: + self._long_lora_indices = self._pad_to_shape( + long_lora_offsets_tensor, + self._long_lora_indices.shape, + dims=1).to(self.device) + else: + zeroed = torch.zeros_like(self._long_lora_indices.cpu(), + dtype=torch.int32) + self._long_lora_indices = zeroed.to(self.device) + self.indices_len[:] = indices_len + def _update_prefill_metadata(self, token_lora_tensor: torch.Tensor) -> None: self.batch_size = 1 - self._lora_indices_per_batch[:self.batch_size].copy_( - token_lora_tensor[:self.batch_size]) - # TODO: .item() is extremely inefficient on TPU, so find a way around it - self.no_lora = torch.all(token_lora_tensor == -1).item() + self._lora_indices_per_batch[:self. + batch_size] = token_lora_tensor[:self. 
+ batch_size] + + def _pad_prompt_mapping( + self, prompt_mapping: tuple[int, ...]) -> tuple[int, ...]: + num_reqs = len(prompt_mapping) + + # From vllm/v1/worker/tpu_model_runner:51, but need to avoid a circular + # import + MIN_NUM_SEQS = 8 + + padded_num_reqs = max(2**math.ceil(math.log2(num_reqs)), MIN_NUM_SEQS) + pad_len = padded_num_reqs - num_reqs + + padding = [-1] * pad_len + return tuple(list(prompt_mapping) + padding) + + def _pad_to_shape(self, src, target_shape, dims=1): + if dims == 1: + pad_len = target_shape[0] - src.shape[0] + return F.pad(src, (0, pad_len), value=0).to(torch.int32) + else: + pad_rows = target_shape[0] - src.shape[0] + pad_cols = target_shape[1] - src.shape[1] + return F.pad(src, (0, pad_cols, 0, pad_rows), + value=0).to(torch.int32) diff --git a/vllm/lora/request.py b/vllm/lora/request.py index badfaa4193774..616e94f8d678f 100644 --- a/vllm/lora/request.py +++ b/vllm/lora/request.py @@ -31,6 +31,7 @@ class LoRARequest( lora_local_path: Optional[str] = msgspec.field(default=None) long_lora_max_len: Optional[int] = None base_model_name: Optional[str] = msgspec.field(default=None) + tensorizer_config_dict: Optional[dict] = None def __post_init__(self): if self.lora_local_path: diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index b66850d4304f1..619dd3bdc40af 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 import os -import re from typing import Optional, Union import huggingface_hub +import regex as re from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError, HFValidationError, RepositoryNotFoundError) from torch import nn diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 8e5bc61066593..f1ae030975074 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -100,7 +100,8 @@ class WorkerLoRAManager(AbstractWorkerManager): lora_path = get_adapter_absolute_path(lora_request.lora_path) peft_helper = 
PEFTHelper.from_local_dir( - lora_path, self.max_position_embeddings) + lora_path, self.max_position_embeddings, + lora_request.tensorizer_config_dict) # Validates the LoRA configuration against requirements before # loading weights, throwing an exception if validation fails. @@ -125,6 +126,7 @@ class WorkerLoRAManager(AbstractWorkerManager): self.lora_config.lora_extra_vocab_size, embedding_modules=self.embedding_modules, embedding_padding_modules=self.embedding_padding_modules, + tensorizer_config_dict=lora_request.tensorizer_config_dict, weights_mapper=hf_to_vllm_mapper) except FileNotFoundError as e: @@ -227,6 +229,11 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager): self.add_adapter(lora) def add_adapter(self, lora_request: LoRARequest) -> bool: + # Note that this method is not thread-safe. It may be invoked multiple + # times for the same adapter when using multiple API servers. + # This is ok because it's currently only called from + # the single-threaded core engine loop. + if lora_request.lora_int_id not in self.list_adapters(): # Load the new adapter first to ensure it is actually valid, before # evicting any existing adapters. 
diff --git a/vllm/model_executor/guided_decoding/guidance_decoding.py b/vllm/model_executor/guided_decoding/guidance_decoding.py index 0b1f4762bc730..58adcc3caff99 100644 --- a/vllm/model_executor/guided_decoding/guidance_decoding.py +++ b/vllm/model_executor/guided_decoding/guidance_decoding.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import json -from re import escape as regex_escape import llguidance +from regex import escape as regex_escape from transformers import PreTrainedTokenizerBase from vllm.model_executor.guided_decoding.guidance_logits_processors import ( diff --git a/vllm/model_executor/guided_decoding/guidance_logits_processors.py b/vllm/model_executor/guided_decoding/guidance_logits_processors.py index 4b45c272adc52..e17df68b4b4da 100644 --- a/vllm/model_executor/guided_decoding/guidance_logits_processors.py +++ b/vllm/model_executor/guided_decoding/guidance_logits_processors.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +import copy import os from typing import Any @@ -34,9 +35,24 @@ class GuidanceLogitsProcessor: self.grammar = grammar self.tokenizer = tokenizer self.tokenizer_name = tokenizer.name_or_path + self.ll_tokenizer = None + self.ll_matcher = None + self.bitmask = None self.new_sampling = False self.initialized = False + def clone(self) -> "GuidanceLogitsProcessor": + cloned = copy.copy(self) + if self.initialized: + cloned.ll_matcher = llguidance.LLMatcher( + self.ll_tokenizer, # type: ignore[assignment] + self.grammar, + log_level=int(os.environ.get("LLGUIDANCE_LOG_LEVEL", "1")), + ) + self.bitmask = llguidance.torch.allocate_token_bitmask( + 1, self.ll_tokenizer.vocab_size) # type: ignore[attr-defined] + return cloned + def _initialize(self): if self.initialized: return @@ -56,7 +72,7 @@ class GuidanceLogitsProcessor: # create reusable bitmask self.bitmask = llguidance.torch.allocate_token_bitmask( - 1, self.ll_tokenizer.vocab_size) + 1, self.ll_tokenizer.vocab_size) # type: ignore[attr-defined] 
self.initialized = True @@ -70,15 +86,17 @@ class GuidanceLogitsProcessor: self._initialize() if self.new_sampling and len(input_ids) > 0: - self.ll_matcher.consume_token(input_ids[-1]) - err = self.ll_matcher.get_error() + self.ll_matcher.consume_token( # type: ignore[attr-defined] + input_ids[-1]) + err = self.ll_matcher.get_error() # type: ignore[attr-defined] if err: logger.warning("Error in LLMatcher: %s", err) llguidance.torch.fill_next_token_bitmask(self.ll_matcher, self.bitmask, 0) llguidance.torch.apply_token_bitmask_inplace( - scores, self.bitmask.to(scores.device)) + scores, + self.bitmask.to(scores.device)) # type: ignore[attr-defined] self.new_sampling = True diff --git a/vllm/model_executor/guided_decoding/guided_fields.py b/vllm/model_executor/guided_decoding/guided_fields.py index 085f37a5d5167..316860718b77b 100644 --- a/vllm/model_executor/guided_decoding/guided_fields.py +++ b/vllm/model_executor/guided_decoding/guided_fields.py @@ -3,12 +3,10 @@ from dataclasses import dataclass from typing import Optional, TypedDict, Union -from pydantic import BaseModel - # These classes are deprecated, see SamplingParams class LLMGuidedOptions(TypedDict, total=False): - guided_json: Union[dict, BaseModel, str] + guided_json: Union[dict, str] guided_regex: str guided_choice: list[str] guided_grammar: str @@ -20,7 +18,7 @@ class LLMGuidedOptions(TypedDict, total=False): @dataclass class GuidedDecodingRequest: """One of the fields will be used to retrieve the logit processor.""" - guided_json: Optional[Union[dict, BaseModel, str]] = None + guided_json: Optional[Union[dict, str]] = None guided_regex: Optional[str] = None guided_choice: Optional[list[str]] = None guided_grammar: Optional[str] = None diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py index bcd7494e6cec2..e41af4b360e45 100644 --- a/vllm/model_executor/guided_decoding/outlines_decoding.py +++ 
b/vllm/model_executor/guided_decoding/outlines_decoding.py @@ -5,9 +5,9 @@ import concurrent.futures import os from enum import Enum from json import dumps as json_dumps -from re import escape as regex_escape from typing import Optional, Union +from regex import escape as regex_escape from transformers import PreTrainedTokenizerBase from vllm.model_executor.guided_decoding.outlines_logits_processors import ( diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index 8ae7c7b6b2c78..6986b6554c230 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -56,6 +56,12 @@ class BaseLogitsProcessor: self._fsm_state: defaultdict[int, Union[int, CFGState]] = defaultdict(int) + def clone(self) -> "BaseLogitsProcessor": + cloned = copy.copy(self) + cloned._guide = self._guide.copy() + cloned._fsm_state = copy.deepcopy(self._fsm_state) + return cloned + def __call__(self, input_ids: list[int], scores: torch.Tensor) -> torch.Tensor: """Use the FSM to bias the logits before sampling the next token.""" @@ -218,6 +224,12 @@ class CFGLogitsProcessor(BaseLogitsProcessor): reasoner) self._guide = self._guide.copy() + def clone(self) -> "CFGLogitsProcessor": + cloned = copy.copy(self) + cloned._fsm_state = copy.deepcopy(self._fsm_state) + cloned._guide = self._guide.copy() + return cloned + @lru_cache(maxsize=32) def _adapt_tokenizer(tokenizer: PreTrainedTokenizerBase): diff --git a/vllm/model_executor/guided_decoding/utils.py b/vllm/model_executor/guided_decoding/utils.py index 1ad1ef8fbf166..3f77cf394d9a3 100644 --- a/vllm/model_executor/guided_decoding/utils.py +++ b/vllm/model_executor/guided_decoding/utils.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -import re +import regex as re def has_xgrammar_unsupported_json_features(schema: dict) -> bool: diff --git 
a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py index 8e40da4b3aa99..d2e5686099459 100644 --- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -4,10 +4,10 @@ from __future__ import annotations import json -import re from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any +import regex as re import torch import vllm.envs @@ -302,8 +302,9 @@ class XGrammarLogitsProcessor: prefilled: bool = field(default=False) def __post_init__(self): - self.tokenizer_info = self.config.tokenizer_info( - self.config.tokenizer_data) + if self.tokenizer_info is None: + self.tokenizer_info = self.config.tokenizer_info( + self.config.tokenizer_data) def __getstate__(self) -> dict[str, Any]: return {'config': self.config, 'reasoner': self.reasoner} @@ -400,7 +401,8 @@ class XGrammarLogitsProcessor: def clone(self) -> XGrammarLogitsProcessor: """Create a new instance with shared compiled grammar but separate state""" - new_processor = XGrammarLogitsProcessor(self.config, self.reasoner) + new_processor = XGrammarLogitsProcessor(self.config, self.reasoner, + None, self.tokenizer_info) # Share the compiled grammar context (immutable after compilation) new_processor.ctx = self.ctx diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index aff108112b611..26a433da2189a 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 """ CUTLASS based Fused MoE kernels.""" -import os from typing import Optional import torch @@ -271,8 +270,6 @@ def cutlass_moe_fp8( FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max -MAX_TOKENS_PER_EXPERT = int( - os.environ.get('VLLM_MODELOPT_MAX_TOKENS_PER_EXPERT', 
'65536')) def cutlass_moe_fp4(a: torch.Tensor, a1_gscale: torch.Tensor, @@ -330,10 +327,7 @@ def cutlass_moe_fp4(a: torch.Tensor, a1_gscale: torch.Tensor, assert a.dtype in [torch.half, torch.bfloat16], "Invalid input dtype" assert (topk_weights.shape[0] == m and topk_ids.shape[0] == m), ("topk must be provided for each row of a") - assert (m <= MAX_TOKENS_PER_EXPERT), ( - f"m must be less than MAX_TOKENS_PER_EXPERT({MAX_TOKENS_PER_EXPERT})" - f" for cutlass_moe_fp4, observed m = {m}. Use" - f" VLLM_MODELOPT_MAX_TOKENS_PER_EXPERT to set this value.") + out_dtype = a.dtype num_topk = topk_ids.shape[1] @@ -362,8 +356,7 @@ def cutlass_moe_fp4(a: torch.Tensor, a1_gscale: torch.Tensor, expert_offsets, blockscale_offsets, num_topk, - expert_map=a_map, - MAX_TOKENS_PER_EXPERT=MAX_TOKENS_PER_EXPERT) + expert_map=a_map) c1 = ops.cutlass_fp4_moe_mm(rep_a_fp4, w1_fp4, rep_a_blockscale, w1_blockscale, w1_alphas, problem_sizes1, @@ -378,12 +371,7 @@ def cutlass_moe_fp4(a: torch.Tensor, a1_gscale: torch.Tensor, torch.ops._C.silu_and_mul(intermediate, c1) int_fp4, int_blockscale = ops.scaled_fp4_experts_quant( - intermediate, - a2_gscale, - expert_offsets, - blockscale_offsets, - num_topk, - MAX_TOKENS_PER_EXPERT=MAX_TOKENS_PER_EXPERT) + intermediate, a2_gscale, expert_offsets, blockscale_offsets, num_topk) c2 = ops.cutlass_fp4_moe_mm(int_fp4, w2_fp4, int_blockscale, w2_blockscale, w2_alphas, problem_sizes2, expert_offsets[:-1], diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index f1cb77f64eae7..af7b98e14c6c8 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1,12 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 import importlib -import threading from abc import abstractmethod from dataclasses import dataclass from enum import Enum from typing import Callable, Optional -from weakref import WeakValueDictionary import torch import torch.nn.functional as F @@ 
-43,15 +41,15 @@ if current_platform.is_cuda_alike(): from .pplx_prepare_finalize import PplxPrepareAndFinalize else: fused_experts = None # type: ignore + FusedMoEPermuteExpertsUnpermute = None # type: ignore FusedMoEPrepareAndFinalize = None # type: ignore if is_rocm_aiter_moe_enabled(): from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501 - rocm_aiter_biased_group_topk as grouped_topk) + rocm_aiter_grouped_topk as grouped_topk) else: from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk if current_platform.is_tpu(): - # the iterative moe implementation is used until the moe_pallas is fixed - from .moe_torch_iterative import fused_moe as fused_moe_pallas + from .moe_pallas import fused_moe as fused_moe_pallas else: fused_moe_pallas = None # type: ignore logger = init_logger(__name__) @@ -74,7 +72,8 @@ class FusedMoEParallelConfig: @property def use_pplx_kernels(self): - return self.dp_size > 1 and self.use_ep and has_pplx + return self.dp_size > 1 and self.use_ep and \ + envs.VLLM_ALL2ALL_BACKEND == "pplx" @staticmethod def make(tp_size_: int, dp_size_: int, @@ -197,6 +196,8 @@ class MoEConfig: # TODO: add more quantization params, blocked, per-token, etc. 
block_size: int = 128 + max_num_tokens: int = MOE_DP_CHUNK_SIZE + @property def tp_size(self): return self.moe_parallel_config.tp_size @@ -245,13 +246,59 @@ class FusedMoEMethodBase(QuantizeMethodBase): params_dtype: torch.dtype, **extra_weight_attrs): raise NotImplementedError - def set_prepare_finalize( - self, - dp_size: int, - world_size: int, - prepare_finalize: FusedMoEPrepareAndFinalize, - ) -> bool: - return False + def init_prepare_finalize(self, moe: MoEConfig, + quant_config: Optional[QuantizationConfig]): + all2all_manager = get_ep_group().device_communicator.all2all_manager + assert all2all_manager is not None + + prepare_finalize = None + if moe.use_pplx_kernels: + all_to_all_args = dict( + max_num_tokens=moe.max_num_tokens, + num_experts=moe.num_experts, + experts_per_token=moe.experts_per_token, # topk + rank=all2all_manager.rank, + world_size=all2all_manager.world_size, + # dp_size actually means tp_size, bug in pplx kernels + dp_size=all2all_manager.tp_group.world_size, + hidden_dim=moe.hidden_dim, + hidden_dim_bytes=moe.hidden_dim * moe.in_dtype.itemsize, + # For blocked per token: set to + # ceil_div(hidden_dim, block_size) * sizeof(float32) + # For per-token: set to sizeof(float32) + hidden_dim_scale_bytes=(0 if moe.in_dtype.itemsize != 1 else ( + (moe.hidden_dim + moe.block_size - 1) // moe.block_size * + torch.float32.itemsize)), + group_name=all2all_manager.cpu_group.group_name, + ) + + handle = all2all_manager.get_handle(all_to_all_args) + + prepare_finalize = PplxPrepareAndFinalize( + handle, + max_num_tokens=moe.max_num_tokens, + world_size=all2all_manager.world_size, + rank=all2all_manager.rank, + # dp_size actually means tp_size, bug in pplx kernels + dp_size=all2all_manager.tp_group.world_size, + quant_dtype=moe.in_dtype, + ) + + if prepare_finalize is not None: + experts = self.select_gemm_impl(prepare_finalize) + self.fused_experts = FusedMoEModularKernel( + prepare_finalize, + experts, + ) + + def select_gemm_impl( + self, 
prepare_finalize: Optional[FusedMoEPrepareAndFinalize] + ) -> FusedMoEPermuteExpertsUnpermute: + # based on the all2all implementation, select the appropriate + # gemm implementation + raise NotImplementedError( + "Subclass must select appropriate gemm implementation" + " based on the prepare_finalize") @abstractmethod def apply( @@ -275,53 +322,13 @@ class FusedMoEMethodBase(QuantizeMethodBase): raise NotImplementedError -class AllToAllCache: - - def __init__(self): - self._cache: WeakValueDictionary = WeakValueDictionary() - self._lock = threading.RLock() # Reentrant lock for thread safety - - def destroy(self): - with self._lock: - # TODO: can we do del self._cache? - for _, a2a in self._cache.items(): - a2a.destroy() - - def get_or_create(self, **kwargs): - assert has_pplx - import pplx_kernels as pplx - - # Create a hashable key from the kwargs - key = tuple(sorted((k, v) for k, v in kwargs.items())) - - with self._lock: - instance = self._cache.get(key) - if instance is None: - # TODO (varun): Add support to switch to intranode - # when all communications are within the same - # node. 
- logger.debug("Create AllToAll %s", kwargs) - instance = pplx.AllToAll.internode(**kwargs) - self._cache[key] = instance - return instance - - -# Global singleton -_all_to_all_cache = AllToAllCache() - - -# Factory function as a cleaner interface -def get_all_to_all(**kwargs): - return _all_to_all_cache.get_or_create(**kwargs) - - @CustomOp.register("unquantized_fused_moe") class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): """MoE method without quantization.""" def __init__(self, moe: MoEConfig): super().__init__() - self.fused_experts = fused_experts + self.fused_experts = fused_experts # type: ignore self.moe = moe self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled() @@ -331,6 +338,42 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): else: self.rocm_aiter_fused_experts = None # type: ignore + def select_gemm_impl( + self, prepare_finalize: Optional[FusedMoEPrepareAndFinalize]): + + assert self.fused_experts == fused_experts + + all2all_manager = get_ep_group().device_communicator.all2all_manager + assert all2all_manager is not None + + experts: Optional[FusedMoEPermuteExpertsUnpermute] = None + + if isinstance(prepare_finalize, + (BatchedPrepareAndFinalize, PplxPrepareAndFinalize)): + logger.debug("BatchedTritonExperts %s", self.moe) + experts = BatchedTritonExperts( + max_num_tokens=MOE_DP_CHUNK_SIZE, + world_size=all2all_manager.world_size, + # dp_size actually means tp_size, bug in pplx kernels + dp_size=all2all_manager.tp_group.world_size, + use_fp8_w8a8=False, + use_int8_w8a8=False, + use_int8_w8a16=False, + use_int4_w4a16=False, + block_shape=None, + ) + else: + logger.debug("TritonExperts %s", self.moe) + experts = TritonExperts( + use_fp8_w8a8=False, + use_int8_w8a8=False, + use_int8_w8a16=False, + use_int4_w4a16=False, + block_shape=None, + per_channel_quant=False, + ) + return experts + def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, 
params_dtype: torch.dtype, **extra_weight_attrs): @@ -376,10 +419,8 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): shuffle_weights) if self.rocm_aiter_moe_enabled: - # use 2stage ck moe layout - shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight.data, - layer.w2_weight.data, - layout=(32, 32)) + shuffled_w13, shuffled_w2 = shuffle_weights( + layer.w13_weight.data, layer.w2_weight.data) layer.w13_weight.data = shuffled_w13 layer.w2_weight.data = shuffled_w2 @@ -430,47 +471,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): activation=activation, apply_router_weight_on_input=apply_router_weight_on_input) - def set_prepare_finalize( - self, - dp_size: int, - world_size: int, - prepare_finalize: FusedMoEPrepareAndFinalize, - ) -> bool: - assert self.fused_experts == fused_experts - - experts: Optional[FusedMoEPermuteExpertsUnpermute] = None - - if isinstance(prepare_finalize, - (BatchedPrepareAndFinalize, PplxPrepareAndFinalize)): - logger.debug("BatchedTritonExperts %s", self.moe) - experts = BatchedTritonExperts( - max_num_tokens=MOE_DP_CHUNK_SIZE, - world_size=world_size, - dp_size=dp_size, - use_fp8_w8a8=False, - use_int8_w8a8=False, - use_int8_w8a16=False, - use_int4_w4a16=False, - block_shape=None, - ) - else: - logger.debug("TritonExperts %s", self.moe) - experts = TritonExperts( - use_fp8_w8a8=False, - use_int8_w8a8=False, - use_int8_w8a16=False, - use_int4_w4a16=False, - block_shape=None, - per_channel_quant=False, - ) - - self.fused_experts = FusedMoEModularKernel( - prepare_finalize, - experts, - ) - - return True - def forward_cuda( self, layer: torch.nn.Module, @@ -680,45 +680,6 @@ def determine_expert_map( return (local_num_experts, expert_map) -def _construct_prepare_finalize( - moe: MoEConfig, quant_config: Optional[QuantizationConfig] -) -> Optional[FusedMoEPrepareAndFinalize]: - max_num_tokens = MOE_DP_CHUNK_SIZE - world_size = moe.ep_size - dp_size = moe.ep_size // moe.dp_size # dp_size actually means TP. 
- rank = moe.ep_rank - - if moe.use_pplx_kernels: - logger.debug("using PplxPrepareAndFinalize") - - all_to_all = get_all_to_all( - max_num_tokens=max_num_tokens, - num_experts=moe.num_experts, - experts_per_token=moe.experts_per_token, # topk - rank=rank, - world_size=world_size, - dp_size=dp_size, - hidden_dim=moe.hidden_dim, - hidden_dim_bytes=moe.hidden_dim * moe.in_dtype.itemsize, - # For blocked per token: set to - # ceil_div(hidden_dim, block_size) * sizeof(float32) - # For per-token: set to sizeof(float32) - hidden_dim_scale_bytes=(0 if moe.in_dtype.itemsize != 1 else - ((moe.hidden_dim + moe.block_size - 1) // - moe.block_size * torch.float32.itemsize))) - - return PplxPrepareAndFinalize( - all_to_all, - max_num_tokens=max_num_tokens, - world_size=world_size, - rank=rank, - dp_size=dp_size, - quant_dtype=moe.in_dtype, - ) - - return None - - class FusedMoE(torch.nn.Module): """FusedMoE layer for MoE models. @@ -832,7 +793,10 @@ class FusedMoE(torch.nn.Module): moe_parallel_config=self.moe_parallel_config, # TODO (bnell): this needs to be fixed for quantized types. in_dtype=params_dtype, + max_num_tokens=MOE_DP_CHUNK_SIZE, ) + self.moe_config = moe + self.quant_config = quant_config # Note: get_quant_method will look at the layer's local_num_experts # for heuristic purposes, so it must be initialized first. @@ -840,25 +804,13 @@ class FusedMoE(torch.nn.Module): if quant_config is None: quant_method = UnquantizedFusedMoEMethod(moe) - prepare_finalize = _construct_prepare_finalize(moe, quant_config) else: quant_method = quant_config.get_quant_method(self, prefix) - # No pplx for quantized types yet. 
- prepare_finalize = None assert quant_method is not None assert isinstance(quant_method, FusedMoEMethodBase) self.quant_method = quant_method - if prepare_finalize is not None: - world_size = moe.ep_size - dp_size = int(moe.ep_size // moe.dp_size) - success = self.quant_method.set_prepare_finalize( - dp_size, world_size, prepare_finalize) - if not success: - logger.warning("DP+EP not supported for %s.", - type(self.quant_method)) - moe_quant_params = { "num_experts": self.local_num_experts, "hidden_size": hidden_size, @@ -876,6 +828,21 @@ class FusedMoE(torch.nn.Module): self.quant_method.create_weights(layer=self, **moe_quant_params) + # Chunked all2all staging tensor + self.batched_hidden_states: Optional[torch.Tensor] = None + self.batched_router_logits: Optional[torch.Tensor] = None + if self.moe_parallel_config.use_pplx_kernels: + act_dtype = vllm_config.model_config.dtype + self.batched_hidden_states = torch.zeros( + (MOE_DP_CHUNK_SIZE, self.hidden_size), + dtype=act_dtype, + device=torch.cuda.current_device()) + + self.batched_router_logits = torch.zeros( + (MOE_DP_CHUNK_SIZE, self.global_num_experts), + dtype=act_dtype, + device=torch.cuda.current_device()) + @property def tp_size(self): return self.moe_parallel_config.tp_size @@ -1265,18 +1232,39 @@ class FusedMoE(torch.nn.Module): def forward_impl_chunked(self, full_hidden_states: torch.Tensor, full_router_logits: torch.Tensor): + assert self.batched_hidden_states is not None + assert self.batched_router_logits is not None + assert self.batched_hidden_states.dtype == full_hidden_states.dtype + assert self.batched_router_logits.dtype == full_router_logits.dtype + # Check size compatibility. 
+ assert ( + self.batched_hidden_states.size(-1) == full_hidden_states.size(-1)) + assert ( + self.batched_router_logits.size(-1) == full_router_logits.size(-1)) full_final_hidden_states = torch.empty_like(full_hidden_states) def process_chunk(chunk_start, chunk_end, skip_result_store=False): + chunk_size = chunk_end - chunk_start hidden_states = full_hidden_states[chunk_start:chunk_end, :] router_logits = full_router_logits[chunk_start:chunk_end, :] + assert (self.batched_hidden_states.size(0) # type: ignore + >= chunk_size) + assert (self.batched_router_logits.size(0) # type: ignore + >= chunk_size) + staged_hidden_states = self.batched_hidden_states[: + chunk_size, :] # type: ignore + staged_router_logits = self.batched_router_logits[: + chunk_size, :] # type: ignore + staged_hidden_states.copy_(hidden_states, non_blocking=True) + staged_router_logits.copy_(router_logits, non_blocking=True) + # Matrix multiply. final_hidden_states = self.quant_method.apply( layer=self, - x=hidden_states, - router_logits=router_logits, + x=staged_hidden_states, + router_logits=staged_router_logits, top_k=self.top_k, renormalize=self.renormalize, use_grouped_topk=self.use_grouped_topk, @@ -1292,7 +1280,7 @@ class FusedMoE(torch.nn.Module): if not skip_result_store: full_final_hidden_states[chunk_start:chunk_end, :].copy_( - final_hidden_states) + final_hidden_states, non_blocking=True) ctx = get_forward_context() max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu diff --git a/vllm/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py index 8f28b64ed487c..9d8bd62c6969a 100644 --- a/vllm/model_executor/layers/fused_moe/moe_pallas.py +++ b/vllm/model_executor/layers/fused_moe/moe_pallas.py @@ -2,7 +2,24 @@ import torch import torch.nn.functional as F -from torch_xla.experimental.custom_kernel import _histogram +import torch_xla.experimental.custom_kernel # noqa: F401 + + +def _histogram(input: torch.Tensor, min: int, max: int) 
-> torch.Tensor: + """ + Compute the histogram of a int32 tensor. The bin edges are defined by the + min and max values, with step = 1. + """ + assert input.dtype == torch.int32, "input must be of torch.int32 dtype." + assert min <= max, "min must be less than or equal to max." + + def searchsorted(sorted_sequence: torch.Tensor, + values_to_search: torch.Tensor) -> torch.Tensor: + return (sorted_sequence.unsqueeze(1) == values_to_search).sum(dim=1) + + bin_edges = torch.linspace(min, max, max - min + 1, + dtype=input.dtype).to(input.device) + return searchsorted(bin_edges, input).to(torch.int32) def fused_moe( @@ -50,18 +67,13 @@ def fused_moe( token_indices = token_indices[topk_argsort_indices] group_sizes = _histogram(topk_indices.to(torch.int32), 0, num_experts - 1) - # NOTE(woosuk): The GMM Pallas kernel requires a different weight layout - # from HF Transformers. - w1 = w1.transpose(1, 2) - w2 = w2.transpose(1, 2) - x = hidden_states[token_indices] - x = torch.ops.xla.gmm(x, w1, group_sizes) + x = torch.ops.xla.gmm(x, w1, group_sizes, transpose_rhs=True) x = F.silu(x[..., :intermediate_size]) * x[..., intermediate_size:] - x = torch.ops.xla.gmm(x, w2, group_sizes) + x = torch.ops.xla.gmm(x, w2, group_sizes, transpose_rhs=True) x = x[topk_argsort_revert_indices].reshape(-1, topk, hidden_size) - x = x * topk_weights.unsqueeze_(dim=-1) + x = x * topk_weights.unsqueeze(dim=-1) x = x.sum(dim=-2) x = x.reshape(orig_shape) return x diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py index 270e7cf1298ab..cb396f26c96e0 100644 --- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py @@ -182,3 +182,7 @@ def moe_unpermute( expert_first_token_offset, n_expert, n_local_expert, topk, hidden_states) return hidden_states + + +def moe_permute_unpermute_supported(): + return 
torch.ops._moe_C.moe_permute_unpermute_supported() diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py index bad45325117a7..914bc47d91e3b 100644 --- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -12,7 +12,6 @@ from vllm.v1.worker.ubatching import ( yield_and_switch_from_compute_to_comm_impl) -# Note use: layer.get_all_to_all() to get an AllToAll instance # The max_num_tokens, world_size and dp_size must be the same # as the ones used to create the AllToAll. class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index a92081862bfa5..824062491f0ed 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +from enum import IntEnum from functools import cache from typing import Optional @@ -9,6 +10,28 @@ from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op +class QuantMethod(IntEnum): + # This allows interfacing with AITER QuantType Enum + # without importing the QuantType from AITER globally. + + # Note that these quantization methods are + # supported in AITER package. However, + # not all are used in this module. + + NO = 0 # a16w16 + PER_TENSOR = 1 # w8a8 (pre_Tensor) + PER_TOKEN = 2 # w8a8/w8a4 (per_Token) + BLOCK_1X128 = 3 # block quantized w8a8 (per_1x128) + BLOCK_128x128 = 4 # block quantized w8a8 (per_128x128) + + +class ActivationMethod(IntEnum): + # This allows interfacing with AITER ActivationType enum + # without importing the ActivationType enum from AITER globally. 
+ SILU = 0 + GELU = 1 + + @cache def is_rocm_aiter_moe_enabled() -> bool: return current_platform.is_rocm() \ @@ -29,13 +52,12 @@ def rocm_aiter_asm_moe_tkw1_impl( a16: bool = False, per_tensor_quant_scale: Optional[torch.Tensor] = None, expert_mask: Optional[torch.Tensor] = None, - activation_str: str = "silu") -> torch.Tensor: + activation_method: int = ActivationMethod.SILU.value) -> torch.Tensor: from aiter import ActivationType from aiter.fused_moe_bf16_asm import asm_moe_tkw1 - activation = \ - ActivationType.Gelu if activation_str == "gelu" else ActivationType.Silu + activation = ActivationType(activation_method) return asm_moe_tkw1(hidden_states, w1, @@ -65,163 +87,7 @@ def rocm_aiter_asm_moe_tkw1_fake( a16: bool = False, per_tensor_quant_scale: Optional[torch.Tensor] = None, expert_mask: Optional[torch.Tensor] = None, - activation_str: str = "silu") -> torch.Tensor: - return torch.empty_like(hidden_states) - - -def rocm_aiter_fmoe_fp8_blockscale_g1u1_impl( - topk_ids: torch.Tensor, - topk_weights: torch.Tensor, - hidden_states_dtype: torch.dtype, - expert_mask: torch.Tensor, - a1: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - a1_scale: torch.Tensor, - block_shape: list[int], - smooth_scale: Optional[torch.Tensor] = None) -> torch.Tensor: - from aiter import fmoe_fp8_blockscale_g1u1 - from aiter.fused_moe_bf16_asm import moe_sorting_ck - - topk = topk_ids.shape[1] - model_dim = w1.shape[-1] - local_E = E = w1.shape[0] - if expert_mask is not None: - E = expert_mask.numel() - - ( - sorted_token_ids, - sorted_weight_buf, - sorted_expert_ids, - num_valid_ids, - out_asm, - ) = moe_sorting_ck(topk_ids, - topk_weights, - E, - model_dim, - hidden_states_dtype, - expert_mask=expert_mask) - - fmoe_fp8_blockscale_g1u1(out_asm, a1, w1, w2, sorted_token_ids, - sorted_weight_buf, sorted_expert_ids, - num_valid_ids, topk, - a1_scale.t().contiguous(), - w1_scale.view(local_E, -1), - w2_scale.view(local_E, - 
-1), *block_shape, smooth_scale) - - return out_asm - - -def rocm_aiter_fmoe_fp8_blockscale_g1u1_fake( - topk_ids: torch.Tensor, - topk_weights: torch.Tensor, - hidden_states_dtype: torch.dtype, - expert_mask: torch.Tensor, - a1: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - a1_scale: torch.Tensor, - block_shape: list[int], - smooth_scale: Optional[torch.Tensor] = None) -> torch.Tensor: - - return torch.empty_like(a1, dtype=hidden_states_dtype) - - -def rocm_aiter_asm_moe_impl(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - fc1_scale: Optional[torch.Tensor] = None, - fc2_scale: Optional[torch.Tensor] = None, - fc1_smooth_scale: Optional[torch.Tensor] = None, - fc2_smooth_scale: Optional[torch.Tensor] = None, - a16: bool = False, - activation: str = "silu") -> torch.Tensor: - import aiter.fused_moe_bf16_asm as rocm_aiter_asm_fmoe - from aiter import ActivationType - - assert activation in ["silu", "gelu"], "The given activation:" \ - f" {activation}" \ - " is not supported in" \ - " AITER." 
- if activation == "silu": - aiter_activation = ActivationType.Silu - else: - aiter_activation = ActivationType.Gelu - - return rocm_aiter_asm_fmoe.asm_moe(hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weight=topk_weights, - topk_ids=topk_ids, - fc1_scale=fc1_scale, - fc2_scale=fc2_scale, - fc1_smooth_scale=fc1_smooth_scale, - fc2_smooth_scale=fc2_smooth_scale, - a16=a16, - activation=aiter_activation) - - -def rocm_aiter_asm_moe_fake(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - fc1_scale: Optional[torch.Tensor] = None, - fc2_scale: Optional[torch.Tensor] = None, - fc1_smooth_scale: Optional[torch.Tensor] = None, - fc2_smooth_scale: Optional[torch.Tensor] = None, - a16: bool = False, - activation: str = "silu") -> torch.Tensor: - return torch.empty_like(hidden_states) - - -def rocm_aiter_ck_moe_2stages_impl( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - fc1_scale: Optional[torch.Tensor] = None, - fc2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_size: Optional[list[int]] = None, - expert_mask: Optional[torch.Tensor] = None, -) -> torch.Tensor: - from aiter.fused_moe_bf16_asm import ck_moe_2stages - return ck_moe_2stages(a1=hidden_states, - w1=w1, - w2=w2, - topk_weight=topk_weights, - topk_ids=topk_ids, - fc1_scale=fc1_scale, - fc2_scale=fc2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, - block_size=block_size, - expert_mask=expert_mask) - - -def rocm_aiter_ck_moe_2stages_fake( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - fc1_scale: Optional[torch.Tensor] = None, - fc2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_size: Optional[list[int]] = 
None, - expert_mask: Optional[torch.Tensor] = None, -) -> torch.Tensor: + activation_method: int = ActivationMethod.SILU.value) -> torch.Tensor: return torch.empty_like(hidden_states) @@ -274,6 +140,80 @@ def rocm_aiter_biased_grouped_topk_fake( pass +def rocm_aiter_grouped_topk_impl( + gating_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_expert_group: int, + topk_group: int, + need_renorm: bool, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0 # mul to topk_weights +) -> None: + + from aiter import grouped_topk + + grouped_topk(gating_output, topk_weights, topk_ids, num_expert_group, + topk_group, need_renorm, scoring_func, routed_scaling_factor) + + +def rocm_aiter_grouped_topk_fake( + gating_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_expert_group: int, + topk_group: int, + need_renorm: bool, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0 # mul to topk_weights +) -> None: + pass + + +def rocm_aiter_fused_moe_impl( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, + expert_mask: Optional[torch.Tensor] = None, + activation_method: int = ActivationMethod.SILU.value, + quant_method: int = QuantMethod.NO.value, + doweight_stage1: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, +) -> torch.Tensor: + from aiter import ActivationType, QuantType + from aiter.fused_moe import fused_moe + + activation = ActivationType(activation_method) + quant_type = QuantType(quant_method) + + return fused_moe(hidden_states, w1, w2, topk_weight, topk_ids, expert_mask, + activation, quant_type, doweight_stage1, w1_scale, + w2_scale, a1_scale, a2_scale) + + +def rocm_aiter_fused_moe_fake( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + 
topk_weight: torch.Tensor, + topk_ids: torch.Tensor, + expert_mask: Optional[torch.Tensor] = None, + activation_method: int = ActivationMethod.SILU.value, + quant_method: int = QuantMethod.NO.value, + doweight_stage1: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, +) -> torch.Tensor: + return torch.empty_like(hidden_states) + + if current_platform.is_rocm(): direct_register_custom_op( @@ -285,26 +225,10 @@ if current_platform.is_rocm(): ) direct_register_custom_op( - op_name="rocm_aiter_fmoe_fp8_blockscale_g1u1", - op_func=rocm_aiter_fmoe_fp8_blockscale_g1u1_impl, + op_name="rocm_aiter_fused_moe", + op_func=rocm_aiter_fused_moe_impl, mutates_args=[], - fake_impl=rocm_aiter_fmoe_fp8_blockscale_g1u1_fake, - dispatch_key=current_platform.dispatch_key, - ) - - direct_register_custom_op( - op_name="rocm_aiter_asm_moe", - op_func=rocm_aiter_asm_moe_impl, - mutates_args=[], - fake_impl=rocm_aiter_asm_moe_fake, - dispatch_key=current_platform.dispatch_key, - ) - - direct_register_custom_op( - op_name="rocm_aiter_ck_moe_2stages", - op_func=rocm_aiter_ck_moe_2stages_impl, - mutates_args=[], - fake_impl=rocm_aiter_ck_moe_2stages_fake, + fake_impl=rocm_aiter_fused_moe_fake, dispatch_key=current_platform.dispatch_key, ) @@ -324,36 +248,54 @@ if current_platform.is_rocm(): dispatch_key=current_platform.dispatch_key, ) + direct_register_custom_op( + op_name="rocm_aiter_grouped_topk", + op_func=rocm_aiter_grouped_topk_impl, + mutates_args=["topk_weights", "topk_ids"], + fake_impl=rocm_aiter_grouped_topk_fake, + dispatch_key=current_platform.dispatch_key, + ) -def rocm_aiter_biased_group_topk( + +def rocm_aiter_grouped_topk( hidden_states: torch.Tensor, gating_output: torch.Tensor, topk: int, renormalize: bool, num_expert_group: int = 0, topk_group: int = 0, - scoring_func: str = "sigmoid", + scoring_func: str = "softmax", 
e_score_correction_bias: Optional[torch.Tensor] = None ) -> tuple[torch.Tensor, torch.Tensor]: - assert scoring_func == "sigmoid", ( - "rocm_aiter_biased_group_topk only supports 'sigmoid' scoring_func.") - assert e_score_correction_bias is not None, ( - "'e_score_correction_bias' must not be None.") token = hidden_states.shape[0] device = hidden_states.device topk_ids = torch.empty((token, topk), dtype=torch.int32, device=device) topk_weights = torch.empty((token, topk), dtype=torch.float32, device=device) - torch.ops.vllm.rocm_aiter_biased_grouped_topk( - gating_output, - e_score_correction_bias, - topk_weights, - topk_ids, - num_expert_group, - topk_group, - renormalize, - ) + + if e_score_correction_bias is not None: + torch.ops.vllm.rocm_aiter_biased_grouped_topk( + gating_output, + e_score_correction_bias, + topk_weights, + topk_ids, + num_expert_group, + topk_group, + renormalize, + ) + else: + assert (scoring_func == "softmax" or scoring_func == "sigmoid") + torch.ops.vllm.rocm_aiter_grouped_topk( + gating_output, + topk_weights, + topk_ids, + num_expert_group, + topk_group, + renormalize, + scoring_func, + ) + return topk_weights, topk_ids @@ -373,32 +315,14 @@ def rocm_aiter_fused_experts( a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None) -> torch.Tensor: - from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - per_token_group_quant_fp8) - + activation_method = (ActivationMethod.SILU + if activation == "silu" else ActivationMethod.GELU) # All AITER Fused MoE kernels are expecting the following datatypes topk_weights = topk_weights.to(torch.float32) topk_ids = topk_ids.to(torch.int32) - # w8a8 block-scaled - if block_shape is not None and use_fp8_w8a8: - assert not apply_router_weight_on_input, ( - "apply_router_weight_on_input is not supported for block scaled moe" - ) - assert w1_scale is not None - assert w2_scale is not None - - # The default block sizes are 128 in AITER. 
- block_shape = [128, 128] if block_shape is None else block_shape - - a1, a1_scale = per_token_group_quant_fp8(hidden_states, block_shape[1]) - - return torch.ops.vllm.rocm_aiter_fmoe_fp8_blockscale_g1u1( - topk_ids, topk_weights, hidden_states.dtype, None, a1, w1, w2, - w1_scale, w2_scale, a1_scale, block_shape, None) - # w8a8 per-channel quantization - elif per_channel_quant and apply_router_weight_on_input and use_fp8_w8a8: + if per_channel_quant and apply_router_weight_on_input and use_fp8_w8a8: # AITER tkw1 kernel for FP8 models with `apply_router_weight_on_input` # This applies topk_weights on the GEMM output of the first FC layer # rather than the second FC. @@ -421,60 +345,44 @@ def rocm_aiter_fused_experts( a16=False, per_tensor_quant_scale=None, expert_mask=None, - activation_str=activation) + activation_method=activation_method) - # w8a8 per-tensor activation per-tensor weight - elif use_fp8_w8a8: - assert not apply_router_weight_on_input, ( - "apply_router_weight_on_input is not supported for fp8_w8a8") + else: + quant_method = QuantMethod.NO.value - # - faster static per-tensor-activation static per-tensor-weight - # fp8 quantization w8a8 - if a1_scale is not None and a2_scale is not None: - return torch.ops.vllm.rocm_aiter_ck_moe_2stages( - hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weights=topk_weights, - topk_ids=topk_ids, - fc1_scale=w1_scale, - fc2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale) + # w8a8 block-scaled + if block_shape is not None and use_fp8_w8a8: + assert not apply_router_weight_on_input, ( + "apply_router_weight_on_input is\ + not supported for block scaled moe") + assert w1_scale is not None + assert w2_scale is not None + quant_method = QuantMethod.BLOCK_128x128.value + elif use_fp8_w8a8: + # Currently only per tensor quantization method is enabled. 
+ quant_method = QuantMethod.PER_TENSOR.value - # - fallback static per-tensor-activation static per-tensor-weight - # fp8 quantization w8a8 - # - dynamic per-tensor activation static per-tensor-weight - # fp8 quantization w8a8 - return torch.ops.vllm.rocm_aiter_asm_moe(hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weights=topk_weights, - topk_ids=topk_ids, - fc1_scale=w1_scale, - fc2_scale=w2_scale, - fc1_smooth_scale=None, - fc2_smooth_scale=None, - a16=False, - activation=activation) - if apply_router_weight_on_input: - assert (topk_weights.dim() == 2 - ), "`topk_weights` should be in shape (num_tokens, topk)" - _, topk = topk_weights.shape - assert ( - topk == 1 - ), "Only support topk=1 when `apply_router_weight_on_input` is True" + if apply_router_weight_on_input: + assert (topk_weights.dim() == 2 + ), "`topk_weights` should be in shape (num_tokens, topk)" + _, topk = topk_weights.shape + assert ( + topk == 1 + ), "Only support topk=1 when `apply_router_weight_on_input` is True" - hidden_states = hidden_states * topk_weights.to(hidden_states.dtype) - topk_ids = topk_ids.to(torch.int32) - topk_weights = torch.ones_like(topk_weights, dtype=torch.float32) - - return torch.ops.vllm.rocm_aiter_ck_moe_2stages( - hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weights=topk_weights, - topk_ids=topk_ids) + return torch.ops.vllm.rocm_aiter_fused_moe( + hidden_states, + w1, + w2, + topk_weights, + topk_ids, + quant_method=quant_method, + activation_method=activation_method, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + doweight_stage1=apply_router_weight_on_input) def rocm_aiter_topk_softmax(topk_weights: torch.Tensor, @@ -488,14 +396,21 @@ def rocm_aiter_topk_softmax(topk_weights: torch.Tensor, return topk_weights, topk_indices -def shuffle_weights(*tensors: torch.Tensor, - layout: tuple[int, int]) -> tuple[torch.Tensor, ...]: +def shuffle_weights( + *tensors: torch.Tensor, layout: tuple[int, int] = (16, 16) +) -> 
tuple[torch.Tensor, ...]: """ Applies shuffle_weight function from AITER to each input tensor and returns them. + + Rearranges (shuffles) the input tensor/s + into a specified block layout for optimized computation. Args: - *tensors: Variable number of torch.Tensor objects. + *tensors: Variable number of torch.Tensor objects. + layout: A pair of integers specifying the + block sizes used to divide the tensors during shuffling. + Default is (16, 16). Returns: A Tuple of shuffled tensors. @@ -503,25 +418,3 @@ def shuffle_weights(*tensors: torch.Tensor, from aiter.ops.shuffle import shuffle_weight return tuple(shuffle_weight(tensor, layout=layout) for tensor in tensors) - - -def expand_weights(*tensors: torch.Tensor, - expansion_dims: list[int]) -> tuple[torch.Tensor, ...]: - """ - Expands the dimensions of input tensors. - - Args: - *tensors: A variable number of torch.Tensor objects. - expansion_dims: A list of expansion dimensions - corresponding to each tensor. - - Returns: - A Tuple of tensors with expanded dimensions. - """ - - assert len(tensors) == len(expansion_dims), \ - "Number of tensors must match the number of expansion dimensions." - - return tuple( - tensor.unsqueeze(-1).unsqueeze(-1).expand((-1, dim, -1)) - for tensor, dim in zip(tensors, expansion_dims)) \ No newline at end of file diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 54dd1251e59ff..269ac043d26c4 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -261,6 +261,7 @@ class ReplicatedLinear(LinearBase): quant_config: Quantization configure. prefix: The name of the layer in the state dict, including all parents (e.g. model.layers.0.qkv_proj) + return_bias: If true, return bias together with outputs in forward pass. """ def __init__( @@ -523,6 +524,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear): quant_config: Quantization configure. 
prefix: The name of the layer in the state dict, including all parents (e.g. model.layers.0.qkv_proj) + return_bias: If true, return bias together with outputs in forward pass. """ def __init__( @@ -585,8 +587,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear): param.shard_id.append(loaded_shard_id) param.shard_id_map[loaded_shard_id] = len(param.data_container) param.data_container.append(loaded_weight) - if len(param.data_container) == 2: - self.qweight = param.materialize_nested() return param_data = param.data @@ -805,6 +805,7 @@ class QKVParallelLinear(ColumnParallelLinear): quant_config: Quantization configure. prefix: The name of the layer in the state dict, including all parents (e.g. model.layers.0.qkv_proj) + return_bias: If true, return bias together with outputs in forward pass. """ def __init__( @@ -979,8 +980,6 @@ class QKVParallelLinear(ColumnParallelLinear): param.shard_id.append(loaded_shard_id) param.shard_id_map[loaded_shard_id] = len(param.data_container) param.data_container.append(loaded_weight) - if len(param.data_container) == 3: - self.qweight = param.materialize_nested() return param_data = param.data @@ -1155,7 +1154,13 @@ class RowParallelLinear(LinearBase): bias can be fused with other element-wise operations. We skip adding bias but instead return it. params_dtype: Data type for the parameters. + reduce_results: If true, call all-reduce on output and make Y available + to all GPUs, otherwise, every GPU will have its output + which is Y = X_iA_i quant_config: Quantization configure. + prefix: The name of the layer in the state dict, including all parents + (e.g. model.layers.0.down_proj) + return_bias: If true, return bias together with outputs in forward pass. 
""" def __init__( diff --git a/vllm/model_executor/layers/mamba/mamba2_metadata.py b/vllm/model_executor/layers/mamba/mamba2_metadata.py index e5b88de2fcc8d..019f634a9ef41 100644 --- a/vllm/model_executor/layers/mamba/mamba2_metadata.py +++ b/vllm/model_executor/layers/mamba/mamba2_metadata.py @@ -5,10 +5,9 @@ from dataclasses import dataclass import torch from vllm.attention.backends.abstract import AttentionMetadata -from vllm.attention.backends.flash_attn import FlashAttentionMetadata from vllm.attention.backends.placeholder_attn import ( PlaceholderAttentionMetadata) -from vllm.attention.backends.xformers import XFormersMetadata +from vllm.platforms import current_platform @dataclass @@ -23,6 +22,21 @@ class Mamba2Metadata: chunk_offsets: torch.Tensor +def get_platform_metadata_classes() -> tuple[type[AttentionMetadata], ...]: + """Returns the appropriate metadata classes for the current platform.""" + if current_platform.is_rocm(): + from vllm.attention.backends.rocm_flash_attn import ( + ROCmFlashAttentionMetadata) + return (ROCmFlashAttentionMetadata, PlaceholderAttentionMetadata) + elif current_platform.is_cuda(): + from vllm.attention.backends.flash_attn import FlashAttentionMetadata + from vllm.attention.backends.xformers import XFormersMetadata + return (FlashAttentionMetadata, XFormersMetadata, + PlaceholderAttentionMetadata) + raise ValueError( + f"Unsupported platform for Mamba2: {current_platform.device_type}") + + def _query_start_loc_to_chunk_indices_offsets(query_start_loc: torch.Tensor, chunk_size: int, total_seqlens: int): @@ -78,9 +92,8 @@ def prepare_mamba2_metadata( # Compute seq_idx, chunk_indices and chunk_offsets for prefill only if num_prefills > 0: - if (isinstance(attn_metadata, - (FlashAttentionMetadata, XFormersMetadata, - PlaceholderAttentionMetadata)) + attn_metadata_instances = get_platform_metadata_classes() + if (isinstance(attn_metadata, attn_metadata_instances) and attn_metadata.context_lens_tensor is not None): 
has_initial_states = \ attn_metadata.context_lens_tensor[:num_prefills] > 0 #[batch,] diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index bc6e6fcdd0a2e..f94ab75f9a4f0 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -34,7 +34,11 @@ from vllm.model_executor.utils import set_weight_attrs @CustomOp.register("mixer2_gated_rms_norm") class Mixer2RMSNormGated(CustomOp): - def __init__(self, full_hidden_size, full_n_groups, eps=1e-6): + def __init__(self, + full_hidden_size: int, + full_n_groups: int, + use_rms_norm: bool = True, + eps: float = 1e-6): super().__init__() self.tp_size = get_tensor_model_parallel_world_size() self.tp_rank = get_tensor_model_parallel_rank() @@ -44,11 +48,17 @@ class Mixer2RMSNormGated(CustomOp): self.n_groups = full_hidden_size // self.group_size self.variance_epsilon = eps - self.weight = nn.Parameter(torch.ones(self.per_rank_hidden_size)) - set_weight_attrs(self.weight, - {"weight_loader": sharded_weight_loader(0)}) - assert self.full_hidden_size % self.tp_size== 0,\ - "Tensor parallel world size must divide hidden size." + self.use_rms_norm = use_rms_norm + if self.use_rms_norm: + # Register norm weight only if we're actually applying RMSNorm + self.weight = nn.Parameter(torch.ones(self.per_rank_hidden_size)) + set_weight_attrs(self.weight, + {"weight_loader": sharded_weight_loader(0)}) + else: + # Avoid checkpoint mismatch by skipping unused parameter + self.register_parameter("weight", None) + assert (self.full_hidden_size % self.tp_size == 0 + ), "Tensor parallel world size must divide hidden size." def forward_native( self, @@ -66,6 +76,8 @@ class Mixer2RMSNormGated(CustomOp): # the input and then redundantly compute the RMSNorm. 
input_dtype = x.dtype x = x * nn.functional.silu(gate.to(torch.float32)) + if not self.use_rms_norm: + return x.to(input_dtype) if self.n_groups == 1: if self.tp_size > 1: @@ -74,7 +86,7 @@ class Mixer2RMSNormGated(CustomOp): global_sums = tensor_model_parallel_all_reduce(local_sums) # Calculate the variance count = self.tp_size * x.shape[-1] - variance = (global_sums / count) + variance = global_sums / count else: variance = x.pow(2).mean(-1, keepdim=True) @@ -105,6 +117,11 @@ class Mixer2RMSNormGated(CustomOp): x: torch.Tensor, gate: torch.Tensor, ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + input_dtype = x.dtype + if not self.use_rms_norm: + # Keep gate in float32 for numerical stability during silu + return x * nn.functional.silu(gate.to( + torch.float32)).to(input_dtype) if self.tp_size > 1 or self.n_groups != 1: return self.forward_native(x, gate) @@ -124,7 +141,7 @@ class Mixer2RMSNormGated(CustomOp): def extra_groups_for_head_shards(ngroups: int, tp_size: int): - """Compute the increase in group numbers to account for + """Compute the increase in group numbers to account for replication in order to accompany the head shards.""" # in the case ngoups % tp_size == 0, this will be zero @@ -182,13 +199,15 @@ def mamba_v2_sharded_weight_loader( # seem to handle slices well. # https://github.com/python/mypy/issues/2410 param.data[ - boundary:(boundary + take), # type: ignore[misc] - ...] = loaded_weight[loaded_start_idx:( # type: ignore[misc] - loaded_start_idx + take)] # type: ignore[misc] + boundary:(boundary + take), + ... 
# type: ignore[misc] + ] = loaded_weight[loaded_start_idx:(loaded_start_idx + + take) # type: ignore[misc] + ] # type: ignore[misc] # move indexing boundaries boundary += shard_size - loaded_boundary += (full_dim - extra) + loaded_boundary += full_dim - extra return loader @@ -206,19 +225,22 @@ class MambaMixer2(CustomOp): **selective** state spaces) """ - def __init__(self, - hidden_size: int, - ssm_state_size: int, - conv_kernel_size: int, - intermediate_size: int, - use_conv_bias: bool, - use_bias: bool, - n_groups: int = 1, - num_heads: int = 128, - head_dim: int = 64, - rms_norm_eps: float = 1e-5, - activation="silu", - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + hidden_size: int, + ssm_state_size: int, + conv_kernel_size: int, + intermediate_size: int, + use_conv_bias: bool, + use_bias: bool, + n_groups: int = 1, + num_heads: int = 128, + head_dim: int = 64, + rms_norm_eps: float = 1e-5, + activation: str = "silu", + use_rms_norm: bool = True, + quant_config: Optional[QuantizationConfig] = None, + ): super().__init__() # For TP, the sharding plan is as follows: @@ -238,17 +260,16 @@ class MambaMixer2(CustomOp): self.tp_size = get_tensor_model_parallel_world_size() tp_rank = get_tensor_model_parallel_rank() - assert num_heads % self.tp_size == 0, \ - "Tensor parallel world size must divide num heads." + assert (num_heads % self.tp_size == 0 + ), "Tensor parallel world size must divide num heads." - assert (n_groups % self.tp_size) == 0 or n_groups == 1, \ - ( - "If tensor parallel world size does not divide num_heads, " - "then num_groups must equal 1." - ) + assert (n_groups % self.tp_size) == 0 or n_groups == 1, ( + "If tensor parallel world size does not divide num_heads, " + "then num_groups must equal 1.") - assert self.tp_size == 1 or quant_config is None, \ - "Tensor parallel currently not supported for quantized models." 
+ assert ( + self.tp_size == 1 or quant_config is None + ), "Tensor parallel currently not supported for quantized models." self.ssm_state_size = ssm_state_size self.activation = activation @@ -265,8 +286,7 @@ class MambaMixer2(CustomOp): self.n_groups = n_groups + extra_groups_for_head_shards( n_groups, self.tp_size) - self.conv_dim = (intermediate_size + - 2 * self.n_groups * ssm_state_size) + self.conv_dim = intermediate_size + 2 * self.n_groups * ssm_state_size self.conv1d = ColumnParallelLinear( input_size=conv_kernel_size, output_size=self.conv_dim, @@ -279,11 +299,12 @@ class MambaMixer2(CustomOp): # doesn't allow to override it self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1) - self.in_proj = ColumnParallelLinear(input_size=hidden_size, - output_size=intermediate_size + - self.conv_dim + self.num_heads, - bias=use_bias, - quant_config=quant_config) + self.in_proj = ColumnParallelLinear( + input_size=hidden_size, + output_size=intermediate_size + self.conv_dim + self.num_heads, + bias=use_bias, + quant_config=quant_config, + ) # - because in_proj is a concatenation of 3 weights, we # need to interleave them before sharding @@ -305,7 +326,8 @@ class MambaMixer2(CustomOp): # - ditto for the otther two weights below delattr(self.conv1d.bias, "weight_loader") set_weight_attrs( - self.conv1d.bias, { + self.conv1d.bias, + { "weight_loader": mamba_v2_sharded_weight_loader( [ @@ -316,18 +338,25 @@ class MambaMixer2(CustomOp): self.tp_size, tp_rank, ) - }) + }, + ) delattr(self.conv1d.weight, "weight_loader") set_weight_attrs( - self.conv1d.weight, { + self.conv1d.weight, + { "weight_loader": - mamba_v2_sharded_weight_loader([ - intermediate_settings, - group_shard_settings, - group_shard_settings, - ], self.tp_size, tp_rank) - }) + mamba_v2_sharded_weight_loader( + [ + intermediate_settings, + group_shard_settings, + group_shard_settings, + ], + self.tp_size, + tp_rank, + ) + }, + ) if quant_config is None: # - quant layers do not have a weight loader 
@@ -345,8 +374,10 @@ class MambaMixer2(CustomOp): head_setings, # for dt ], self.tp_size, - tp_rank) - }) + tp_rank, + ) + }, + ) # - these are TPed by heads to reduce the size of the # temporal shape @@ -357,6 +388,7 @@ class MambaMixer2(CustomOp): )) self.D = nn.Parameter(torch.ones(num_heads // self.tp_size)) self.dt_bias = nn.Parameter(torch.ones(num_heads // self.tp_size)) + self.use_rms_norm = use_rms_norm set_weight_attrs(self.D, {"weight_loader": sharded_weight_loader(0)}) a_weight_loader = composed_weight_loader( @@ -365,18 +397,25 @@ class MambaMixer2(CustomOp): set_weight_attrs(self.dt_bias, {"weight_loader": sharded_weight_loader(0)}) - self.out_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=use_bias, - input_is_parallel=True, - quant_config=quant_config) + self.out_proj = RowParallelLinear( + intermediate_size, + hidden_size, + bias=use_bias, + input_is_parallel=True, + quant_config=quant_config, + ) self.norm = Mixer2RMSNormGated(intermediate_size, n_groups, + self.use_rms_norm, eps=rms_norm_eps) - def forward_native(self, hidden_states: torch.Tensor, - conv_state: torch.Tensor, ssm_state: torch.Tensor): + def forward_native( + self, + hidden_states: torch.Tensor, + conv_state: torch.Tensor, + ssm_state: torch.Tensor, + ): pass def forward_cuda( @@ -384,6 +423,7 @@ class MambaMixer2(CustomOp): hidden_states: torch.Tensor, mamba_cache_params: MambaCacheParams, mamba2_metadata: Mamba2Metadata, + mup_vector: Optional[torch.Tensor] = None, ): # mamba2_metadata contains metadata necessary for the mamba2 triton # kernels to operate in continuous batching and in chunked prefill @@ -401,6 +441,10 @@ class MambaMixer2(CustomOp): # 1. 
Gated MLP's linear projection projected_states, _ = self.in_proj(hidden_states) + + if mup_vector is not None: + projected_states = projected_states * mup_vector + gate, hidden_states_B_C, dt = torch.split( projected_states, [ @@ -561,6 +605,9 @@ class MambaMixer2(CustomOp): hidden_states = torch.vstack(ssd_output_list) # 4. gated MLP + # GatedRMSNorm internally applying SiLU to the gate + # SiLU is applied internally before normalization, unlike standard + # norm usage hidden_states = self.norm(hidden_states, gate) # 5. Final linear projection diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 6abbc90819a82..d2c42191bb3ff 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -6,10 +6,9 @@ from typing import Optional, Union import torch import torch.nn as nn import torch.nn.functional as F -from transformers import PretrainedConfig from typing_extensions import assert_never -from vllm.config import PoolerConfig +from vllm.config import ModelConfig, PoolerConfig from vllm.model_executor.pooling_metadata import (PoolingMetadata, PoolingTensors) from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput @@ -283,30 +282,37 @@ class Pooler(nn.Module): ) -class CrossEncodingPooler(nn.Module): - """A layer that pools specific information from hidden states. +class ClassifierPooler(nn.Module): + """A pooling layer for classification tasks. This layer does the following: - 1. Extracts specific tokens or aggregates data based on pooling method. - 2. Normalizes output if specified. - 3. Returns structured results as `PoolerOutput`. - - Attributes: - pooling_type: The type of pooling to use. - normalize: Whether to normalize the pooled data. + 1. Applies a classification layer to the hidden states. + 2. Optionally applies a pooler layer. + 3. Applies an activation function to the output. In the case of + classification models it is either sigmoid or softmax. 
In the + case of scoring models, the same behavior is configuration + dependent, as in the sentence-transformers library. """ def __init__( self, - config: PretrainedConfig, + config: ModelConfig, classifier: nn.Module, pooler: Optional[nn.Module] = None, ): super().__init__() self.classifier = classifier self.pooler = pooler - self.default_activation_function = \ - get_cross_encoder_activation_function(config) + + if config.task == "score": + self.default_activation_function = \ + get_cross_encoder_activation_function(config.hf_config) + elif config.task == "classify": + self.default_activation_function = nn.Sigmoid() \ + if config.hf_config.num_labels == 1 else nn.Softmax() + else: + raise NotImplementedError(f"task={config.task!r} is not supported" + " with the classification pooler") def forward( self, diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py index a5e63843cf62a..eb8ffa37882cb 100644 --- a/vllm/model_executor/layers/quantization/auto_round.py +++ b/vllm/model_executor/layers/quantization/auto_round.py @@ -8,6 +8,7 @@ import torch from vllm.logger import init_logger from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead @@ -74,7 +75,7 @@ class AutoRoundConfig(QuantizationConfig): f"group_size={self.group_size}, sym={self.sym})") @classmethod - def get_name(cls): ## use str will trigger preci issue + def get_name(cls) -> QuantizationMethods: return "auto-round" @classmethod @@ -115,8 +116,9 @@ class AutoRoundConfig(QuantizationConfig): quantized = True if self.block_name_to_quantize: - quantized = any(name in layer_name - for name in self.block_name_to_quantize) + quantized = any( + layer_name.startswith(name) + for name 
in self.block_name_to_quantize) elif isinstance(layer, ParallelLMHead): quantized = False @@ -142,18 +144,18 @@ class AutoRoundConfig(QuantizationConfig): prefix, layer.__class__.__name__, weight_bits, group_size, sym) if backend == "auto" or "marlin" in backend: - if isinstance(layer, FusedMoE): - use_marlin = check_moe_marlin_supports_layer(layer, group_size) - else: + AWQ_TYPE_MAP = { + 4: scalar_types.uint4, + 8: scalar_types.uint8, + } + use_marlin = (weight_bits + in AWQ_TYPE_MAP) and check_marlin_supported( + AWQ_TYPE_MAP[weight_bits], group_size, not sym) + + if isinstance(layer, FusedMoE): + use_marlin = use_marlin and check_moe_marlin_supports_layer( + layer, group_size) - AWQ_TYPE_MAP = { - 4: scalar_types.uint4, - 8: scalar_types.uint8, - } - use_marlin = ((weight_bits, sym) in AWQ_TYPE_MAP - and check_marlin_supported( - AWQ_TYPE_MAP[(weight_bits)], group_size, - not sym)) else: use_marlin = False if use_marlin: @@ -180,10 +182,11 @@ class AutoRoundConfig(QuantizationConfig): from vllm.model_executor.layers.quantization.moe_wna16 import ( MoeWNA16Config) config = { - "linear_quant_method": "awq", - "weight_bits": weight_bits, + "quant_method": "awq", + "bits": weight_bits, "group_size": group_size, "zero_point": not sym, + "lm_head": False, } return MoeWNA16Config.from_config(config).get_quant_method( layer, prefix) @@ -213,18 +216,18 @@ class AutoRoundConfig(QuantizationConfig): prefix, layer.__class__.__name__, weight_bits, group_size, sym) if backend == "auto" or "marlin" in backend: + GPTQ_TYPE_MAP = { + (4, True): scalar_types.uint4b8, + (8, True): scalar_types.uint8b128, + } + use_marlin = ((weight_bits, sym) in GPTQ_TYPE_MAP + and check_marlin_supported( + GPTQ_TYPE_MAP[(weight_bits, sym)], + group_size, + has_zp=not sym)) if isinstance(layer, FusedMoE): - use_marlin = check_moe_marlin_supports_layer(layer, group_size) - else: - GPTQ_TYPE_MAP = { - (4, True): scalar_types.uint4b8, - (8, True): scalar_types.uint8b128, - } - use_marlin = 
((weight_bits, sym) in GPTQ_TYPE_MAP - and check_marlin_supported( - GPTQ_TYPE_MAP[(weight_bits, sym)], - group_size, - has_zp=not sym)) + use_marlin = use_marlin and check_moe_marlin_supports_layer( + layer, group_size) else: use_marlin = False if use_marlin: @@ -251,11 +254,11 @@ class AutoRoundConfig(QuantizationConfig): from vllm.model_executor.layers.quantization.moe_wna16 import ( MoeWNA16Config) config = { - "linear_quant_method": "gptq", - "weight_bits": weight_bits, + "quant_method": "gptq", + "bits": weight_bits, "group_size": group_size, "sym": sym, - "lm_head_quantized": False, + "lm_head": False, } return MoeWNA16Config.from_config(config).get_quant_method( layer, prefix) diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 4660c28c8de4a..87afdb623d912 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -101,7 +101,13 @@ class AWQLinearMethod(LinearMethodBase): output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): - if input_size_per_partition % self.quant_config.group_size != 0: + # Normalize group_size + if self.quant_config.group_size != -1: + group_size = self.quant_config.group_size + else: + group_size = input_size + + if input_size_per_partition % group_size != 0: raise ValueError( "The input size is not aligned with the quantized " "weight shape. 
This can be caused by too large " @@ -127,9 +133,11 @@ class AWQLinearMethod(LinearMethodBase): packed_factor=self.quant_config.pack_factor, weight_loader=weight_loader) + num_groups = input_size_per_partition // group_size + qzeros = PackedvLLMParameter( data=torch.empty( - input_size_per_partition // self.quant_config.group_size, + num_groups, output_size_per_partition // self.quant_config.pack_factor, dtype=torch.int32, ), @@ -140,7 +148,7 @@ class AWQLinearMethod(LinearMethodBase): weight_loader=weight_loader) scales = GroupQuantScaleParameter(data=torch.empty( - input_size_per_partition // self.quant_config.group_size, + num_groups, output_size_per_partition, dtype=params_dtype, ), diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index fa0067c448028..9241ceeb4db29 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -286,9 +286,8 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): rocm_aiter_fused_experts, shuffle_weights) # reshaping weights is required for aiter moe kernel. 
- shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight.data, - layer.w2_weight.data, - layout=(16, 16)) + shuffled_w13, shuffled_w2 = shuffle_weights( + layer.w13_weight.data, layer.w2_weight.data) layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index ccd54281ceb7e..75e81c4dd49d8 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 -import re from collections.abc import Iterable, Mapping from types import MappingProxyType from typing import Optional +import regex as re from compressed_tensors import CompressionFormat from torch.nn import Module diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index f4cdc3db1a0d3..ac9b74945e0ce 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -10,7 +10,6 @@ from torch.nn import Module from torch.nn.parameter import Parameter import vllm.envs as envs -import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger @@ -63,10 +62,9 @@ class Fp8Config(QuantizationConfig): weight_block_size: Optional[list[int]] = None, ) -> None: super().__init__() + self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized - if is_checkpoint_fp8_serialized: - logger.warning("Detected fp8 checkpoint. 
Please note that the " - "format is experimental and subject to change.") + if activation_scheme not in ACTIVATION_SCHEMES: raise ValueError( f"Unsupported activation scheme {activation_scheme}") @@ -461,7 +459,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): logger.warning_once( "DeepGemm not supported on the current platform.") - self.fused_experts = functools.partial( + self.fused_experts = functools.partial( # type: ignore fused_experts, block_shape=self.quant_config.weight_block_size, allow_deep_gemm=self.allow_deep_gemm) @@ -597,7 +595,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): def process_weights_after_loading(self, layer: Module) -> None: # Lazy import to avoid importing triton too early. from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( - expand_weights, is_rocm_aiter_moe_enabled, shuffle_weights) + is_rocm_aiter_moe_enabled, shuffle_weights) self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled() @@ -629,9 +627,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): if self.rocm_aiter_moe_enabled: # reshaping weights is required for aiter moe kernel. shuffled_w13, shuffled_w2 = shuffle_weights( - layer.w13_weight.data, - layer.w2_weight.data, - layout=(16, 16)) + layer.w13_weight.data, layer.w2_weight.data) layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) @@ -677,20 +673,8 @@ class Fp8MoEMethod(FusedMoEMethodBase): requires_grad=False) if self.rocm_aiter_moe_enabled: # reshaping weights is required for aiter moe kernel. 
- w13_scales, w2_scales = expand_weights( - layer.w13_weight_scale.data, - layer.w2_weight_scale.data, - expansion_dims=[ - layer.w13_weight.shape[1], layer.w2_weight.shape[1] - ]) - layer.w13_weight_scale = torch.nn.Parameter( - w13_scales.contiguous(), requires_grad=False) - layer.w2_weight_scale = torch.nn.Parameter( - w2_scales.contiguous(), requires_grad=False) - - shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight, - layer.w2_weight, - layout=(16, 16)) + shuffled_w13, shuffled_w2 = shuffle_weights( + layer.w13_weight, layer.w2_weight) layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) @@ -762,20 +746,8 @@ class Fp8MoEMethod(FusedMoEMethodBase): start += shard_size if self.rocm_aiter_moe_enabled: - # reshaping weights is required for aiter moe kernel. - expansion_dims = [ - layer.w13_weight.shape[1], layer.w2_weight.shape[1] - ] - max_w13_scales, w2_scales = expand_weights( - max_w13_scales, - layer.w2_weight_scale.data, - expansion_dims=expansion_dims) - layer.w2_weight_scale = torch.nn.Parameter( - w2_scales.contiguous(), requires_grad=False) - - shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight, - layer.w2_weight, - layout=(32, 32)) + shuffled_w13, shuffled_w2 = shuffle_weights( + layer.w13_weight, layer.w2_weight) layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) @@ -791,17 +763,12 @@ class Fp8MoEMethod(FusedMoEMethodBase): del layer.w13_input_scale del layer.w2_input_scale - def set_prepare_finalize( - self, - dp_size: int, - world_size: int, - prepare_finalize: mk.FusedMoEPrepareAndFinalize, - ) -> bool: + def select_gemm_impl(self, prepare_finalize): from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( TritonOrDeepGemmExperts) - if self.use_marlin or self.rocm_aiter_moe_enabled: - return False + assert not self.use_marlin and not self.rocm_aiter_moe_enabled, ( + "Marlin and ROCm AITER are not supported with all2all yet.") experts = TritonOrDeepGemmExperts( 
use_fp8_w8a8=True, @@ -809,12 +776,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): allow_deep_gemm=self.allow_deep_gemm, ) - self.fused_experts = mk.FusedMoEModularKernel( - prepare_finalize, - experts, - ) - - return True + return experts def apply( self, diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index d7d4a5d6acdbf..1fcb6d7afc9b3 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -9,7 +9,6 @@ from torch.nn.parameter import Parameter, UninitializedParameter from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, FusedMoEMethodBase) from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase @@ -19,6 +18,7 @@ from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.utils import set_weight_attrs +from vllm.utils import direct_register_custom_op logger = init_logger(__name__) @@ -96,8 +96,8 @@ MMVQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES | IMATRIX_QUANT_TYPES MMQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES -def _fuse_mul_mat(x: torch.Tensor, qweight: torch.Tensor, - qweight_type: int) -> torch.Tensor: +def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor, + qweight_type: int) -> torch.Tensor: # HACK: when doing chunked prefill we don't generate output tokens # so input to logits generator is empty which causes invalid parameter if x.shape[0] == 0: @@ -130,6 +130,30 @@ def _fuse_mul_mat(x: torch.Tensor, qweight: torch.Tensor, return y +def _fused_mul_mat_gguf_fake( + x: torch.Tensor, + qweight: torch.Tensor, + qweight_type: int, +) -> torch.Tensor: + return torch.empty(x.shape[0], + qweight.shape[0], + dtype=x.dtype, + 
device=x.device) + + +try: + direct_register_custom_op( + op_name="_fused_mul_mat_gguf", + op_func=_fused_mul_mat_gguf, + mutates_args=[], + fake_impl=_fused_mul_mat_gguf_fake, + ) + fused_mul_mat_gguf = torch.ops.vllm._fused_mul_mat_gguf + +except AttributeError as error: + raise error + + def _fused_moe_gguf( x: torch.Tensor, w1: torch.Tensor, @@ -138,8 +162,21 @@ def _fused_moe_gguf( topk_ids: torch.Tensor, qweight_type: int, qweight_type2: int, - act, + activation: str, ) -> torch.Tensor: + + def act(x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + if activation == "silu": + torch.ops._C.silu_and_mul(out, x) + elif activation == "gelu": + torch.ops._C.gelu_and_mul(out, x) + else: + raise ValueError(f"Unsupported activation: {activation}") + return out + # lazy import to avoid triggering triton import in CPU backend from vllm.model_executor.layers.fused_moe.fused_moe import ( moe_align_block_size) @@ -189,12 +226,12 @@ def _fused_moe_gguf( for ww, ii in zip(w, idx): expert_up = w1[ii] - out = _fuse_mul_mat(inp, expert_up, qweight_type) + out = fused_mul_mat_gguf(inp, expert_up, qweight_type) out = act(out) expert_down = w2[ii] - current_state = _fuse_mul_mat(out, expert_down, - qweight_type2).mul_(ww) + current_state = fused_mul_mat_gguf(out, expert_down, + qweight_type2).mul_(ww) if current_hidden_state is None: current_hidden_state = current_state else: @@ -203,6 +240,78 @@ def _fused_moe_gguf( return out_hidden_states +def _fused_moe_gguf_fake( + x: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + qweight_type: int, + qweight_type2: int, + activation: str, +) -> torch.Tensor: + return torch.empty_like(x) + + +try: + direct_register_custom_op( + op_name="_fused_moe_gguf", + op_func=_fused_moe_gguf, + mutates_args=[], + fake_impl=_fused_moe_gguf_fake, + ) + fused_moe_gguf = 
torch.ops.vllm._fused_moe_gguf + +except AttributeError as error: + raise error + + +def _apply_gguf_embedding( + x: torch.Tensor, + qweight: torch.Tensor, + qweight_type: int, + hidden_size: int, + dtype: Optional[torch.dtype] = None, +) -> torch.Tensor: + if qweight_type in UNQUANTIZED_TYPES: + return torch.embedding(qweight, x) + elif qweight_type in DEQUANT_TYPES: + block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type] + x_flat = x.flatten() + assert (hidden_size == qweight.shape[1] // type_size * block_size) + quant = torch.index_select(qweight, dim=0, index=x_flat) + dequant = ops.ggml_dequantize(quant, qweight_type, hidden_size, + x_flat.shape[0], dtype) + return dequant.view(*x.shape, hidden_size) + else: + qweight_type = WeightType(qweight_type) + raise NotImplementedError( + f"Unsupported GGUF quantization type: {qweight_type}") + + +def _apply_gguf_embedding_fake( + x: torch.Tensor, + qweight: torch.Tensor, + qweight_type: int, + hidden_size: int, + dtype: Optional[torch.dtype] = None, +) -> torch.Tensor: + return torch.empty(x.shape[0], hidden_size, dtype=dtype, device=x.device) + + +try: + direct_register_custom_op( + op_name="_apply_gguf_embedding", + op_func=_apply_gguf_embedding, + mutates_args=[], + fake_impl=_apply_gguf_embedding_fake, + ) + apply_gguf_embedding = torch.ops.vllm._apply_gguf_embedding + +except AttributeError as error: + raise error + + class GGUFLinearMethod(LinearMethodBase): """Linear method for GGUF. 
@@ -249,26 +358,76 @@ class GGUFLinearMethod(LinearMethodBase): set_weight_attrs(qweight_type, extra_weight_attrs) layer.register_parameter("qweight_type", qweight_type) + def process_weights_after_loading(self, layer: torch.nn.Module): + qweight_type = layer.qweight_type.weight_type + if not (qweight_type in UNQUANTIZED_TYPES + or qweight_type in DEQUANT_TYPES): + qweight_type = WeightType(qweight_type) + raise ValueError( + f"Unsupported GGUF quantization type {qweight_type} in " + f"layer {layer}.") + # For MergedColumnParallelLinear and QKVParallelLinear, we need to + # materialize the padded weight parameter for CUDA Graph compatibility. + self._create_padded_weight_param(layer) + + def _create_padded_weight_param(self, layer: torch.nn.Module): + """Create padded weight parameter for GGUF MergedLinear layer.""" + qweight = layer.qweight + shard_id_map = qweight.shard_id_map + shard_id = qweight.shard_id + if len(data_container := qweight.data_container) > 1: + dtype = {data.dtype for data in data_container} + assert len(dtype) == 1, ValueError( + f"Data container has mixed dtypes: {dtype}") + dtype = next(iter(dtype)) + # concat dim0 and pad dim1 + padded_side = max(x.size(1) for x in data_container) + concat_side = sum(x.size(0) for x in data_container) + # Pad the quantized weights to dense tensor, and create a map + # with the location of each shard in the padded tensor. 
+ padded_data = torch.zeros((concat_side, padded_side), + dtype=dtype, + device=qweight.device) + # (dim0_start, dim0_end, dim1_size) + shard_offset_map = dict[str, tuple[int, int, int]]() + for idx in shard_id: + id_in_container = shard_id_map[idx] + start = sum( + x.size(0) for x in data_container[:id_in_container]) + end = start + data_container[id_in_container].size(0) + size = data_container[id_in_container].size(1) + padded_data[start:end, :size] = data_container[id_in_container] + shard_offset_map[idx] = (start, end, size) + qweight.data_container.clear() + padded_param = Parameter(padded_data, requires_grad=False) + set_weight_attrs(padded_param, vars(qweight)) + set_weight_attrs(padded_param, + {"shard_offset_map": shard_offset_map}) + layer.register_parameter("qweight", padded_param) + def apply(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - shard_id = getattr(layer.qweight, "shard_id", None) + shard_id = layer.qweight.shard_id if shard_id: # dequantize shard weights respectively shard_id = ["q", "k", "v"] if "q" in shard_id else shard_id - qweight = layer.qweight.unbind(0) + qweight = layer.qweight result = [] for idx in shard_id: - q_idx = layer.qweight.shard_id_map[idx] + start, end, offset = layer.qweight.shard_offset_map[idx] qweight_type = layer.qweight_type.shard_weight_type[idx] - result.append(_fuse_mul_mat(x, qweight[q_idx], qweight_type)) + result.append( + fused_mul_mat_gguf( + x, qweight[start:end, :offset].contiguous(), + qweight_type)) out = torch.cat(result, axis=1) else: qweight = layer.qweight qweight_type = layer.qweight_type.weight_type - out = _fuse_mul_mat(x, qweight, qweight_type) + out = fused_mul_mat_gguf(x, qweight, qweight_type) if bias is not None: out.add_(bias) return out @@ -338,7 +497,6 @@ class GGUFMoEMethod(FusedMoEMethodBase): set_weight_attrs(w2_qweight_type, extra_weight_attrs) layer.register_parameter("w2_qweight_type", w2_qweight_type) - self.act = SiluAndMul() 
def apply( self, @@ -375,10 +533,10 @@ class GGUFMoEMethod(FusedMoEMethodBase): custom_routing_function=custom_routing_function, scoring_func=scoring_func, e_score_correction_bias=e_score_correction_bias) - return _fused_moe_gguf(x, layer.w13_qweight, layer.w2_qweight, - topk_weights, topk_ids, - layer.w13_qweight_type.weight_type, - layer.w2_qweight_type.weight_type, self.act) + return fused_moe_gguf(x, layer.w13_qweight, layer.w2_qweight, + topk_weights, topk_ids, + layer.w13_qweight_type.weight_type, + layer.w2_qweight_type.weight_type, activation) class GGUFEmbeddingMethod(GGUFLinearMethod): @@ -392,34 +550,15 @@ class GGUFEmbeddingMethod(GGUFLinearMethod): x: torch.Tensor) -> torch.Tensor: qweight = layer.qweight qweight_type = layer.qweight_type.weight_type + hidden_size = qweight.tensor_shape[1] - block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type] - hidden_size = qweight.shape[1] // type_size * block_size - if qweight_type < 2: - return torch.embedding(qweight, x) - x_flat = x.flatten() - quant = torch.index_select(qweight, dim=0, index=x_flat) - dequant = ops.ggml_dequantize(quant, qweight_type, hidden_size, - x_flat.shape[0], self.params_dtype) - return dequant.view(*x.shape, hidden_size) + return apply_gguf_embedding(x, + qweight, + qweight_type, + hidden_size, + dtype=self.params_dtype) class GGUFUninitializedParameter(UninitializedParameter): cls_to_become = Parameter data_container: list[torch.Tensor] - - def materialize_nested(self) -> Parameter: - dtype = {data.dtype for data in self.data_container} - assert len(dtype) == 1, ValueError( - f"Data container has mixed dtypes: {dtype}") - dtype = next(iter(dtype)) - nested_data = torch.nested.nested_tensor(self.data_container, - device=self.device, - dtype=dtype) - self.data_container.clear() - param = torch.Tensor._make_subclass(self.cls_to_become, - nested_data, - require_grad=False) - for k, v in self.__dict__.items(): - setattr(param, k, v) - return param diff --git 
a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index b7baa3d3363bf..8108c797637d4 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -14,7 +14,7 @@ from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod from vllm.platforms import current_platform -MIN_IPEX_VERSION = "2.5.0" +MIN_IPEX_VERSION = "2.7.0" class IPEXConfig(QuantizationConfig): diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index b108b02a43e20..2abe16a08a265 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -228,7 +228,7 @@ class ModelOptNvFp4Config(QuantizationConfig): exclude_modules, group_size) def is_layer_excluded(self, prefix: str, exclude_modules: list): - import re + import regex as re for pattern in exclude_modules: regex_str = pattern.replace('.', r'\.').replace('*', r'.*') if re.fullmatch(regex_str, prefix): @@ -585,9 +585,11 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # GEMM 1 - assert torch.allclose( - layer.w13_weight_scale_2[:, 0], layer.w13_weight_scale_2[:, 1]), ( - "w1_weight_scale_2 must match w3_weight_scale_2") + if not torch.allclose(layer.w13_weight_scale_2[:, 0], + layer.w13_weight_scale_2[:, 1]): + logger.warning_once( + "w1_weight_scale_2 must match w3_weight_scale_2. 
" + "Accuracy may be affected.") w13_weight_scale_2 = layer.w13_weight_scale_2[:, 0] layer.w13_weight_scale_2 = Parameter(w13_weight_scale_2, diff --git a/vllm/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py index 38b374feea81d..b2d6bf5dbf9cc 100644 --- a/vllm/model_executor/layers/quantization/neuron_quant.py +++ b/vllm/model_executor/layers/quantization/neuron_quant.py @@ -13,6 +13,12 @@ from vllm.model_executor.layers.quantization.base_config import ( SUPPORTED_QUANT_DTYPE_LIST = ['s8', 'f8e4m3fn'] +class AlwaysSupportedDtypes(list): + + def __contains__(self, item): + return True + + class NeuronQuantConfig(QuantizationConfig): """Int8 Quantization Config class for Neuron Backend.""" @@ -35,7 +41,8 @@ class NeuronQuantConfig(QuantizationConfig): return "neuron_quant" def get_supported_act_dtypes(self) -> list[str]: - return SUPPORTED_QUANT_DTYPE_LIST + # Neuron implements custom handling logic for quantization support + return AlwaysSupportedDtypes() @classmethod def get_min_capability(cls) -> int: diff --git a/vllm/model_executor/layers/quantization/quark/utils.py b/vllm/model_executor/layers/quantization/quark/utils.py index d1d293b017914..5e56bcb7564cd 100644 --- a/vllm/model_executor/layers/quantization/quark/utils.py +++ b/vllm/model_executor/layers/quantization/quark/utils.py @@ -1,10 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 -import re from collections.abc import Iterable, Mapping from types import MappingProxyType from typing import Any, Optional +import regex as re + def deep_compare(dict1: Any, dict2: Any) -> bool: if type(dict1) is not type(dict2): diff --git a/vllm/model_executor/layers/quantization/utils/gptq_utils.py b/vllm/model_executor/layers/quantization/utils/gptq_utils.py index ff7a8169e6fbc..36161d13b24f8 100644 --- a/vllm/model_executor/layers/quantization/utils/gptq_utils.py +++ b/vllm/model_executor/layers/quantization/utils/gptq_utils.py @@ -1,8 +1,8 @@ # 
SPDX-License-Identifier: Apache-2.0 -import re from copy import deepcopy from typing import Optional, Union +import regex as re import torch from vllm.config import QuantizationConfig diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py index 15177af58ae6e..13dcdc00a2156 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py @@ -22,7 +22,12 @@ def is_fp4_marlin_supported(): def fp4_marlin_process_scales(marlin_scales): - assert (marlin_scales >= 0).all() + if not (marlin_scales >= 0).all(): + logger.warning_once( + "NVFP4 Marlin assumes the scales to be >=0, but has encountered " + "negative scales. Accuracy will likely be degraded. This is " + "because it changes the scales from FP8-S1E4M3 to a special " + "FP8-S0E5M3 format to speedup the dequantization.") # convert to half first, we would convert to fp8 later marlin_scales = marlin_scales.to(torch.half) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 4b041cff2eccb..eed8998fe3da5 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -155,8 +155,8 @@ def rocm_per_tensor_w8a8_scaled_mm(*, qinput: torch.Tensor, scale_b: torch.Tensor, bias: torch.Tensor, input_2d: torch.Tensor, output_shape: list) -> torch.Tensor: - from vllm.platforms.rocm import on_mi250_mi300 - if envs.VLLM_ROCM_USE_SKINNY_GEMM and on_mi250_mi300( + from vllm.platforms.rocm import on_mi3xx + if envs.VLLM_ROCM_USE_SKINNY_GEMM and on_mi3xx( ) and qinput.shape[0] == 1 and qinput.shape[1] % 16 == 0: output = ops.wvSplitKQ(weight.t(), qinput, out_dtype, scale_a, scale_b, current_platform.get_cu_count()) diff --git a/vllm/model_executor/layers/rejection_sampler.py 
b/vllm/model_executor/layers/rejection_sampler.py index af82b9dc93b70..3db73495827c6 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -262,16 +262,16 @@ class RejectionSampler(SpecDecodeStochasticBaseSampler): True, then a token can be accepted, else it should be rejected. - Given {math}`q(\hat{x}_{n+1}|x_1, \dots, x_n)`, the probability of - {math}`\hat{x}_{n+1}` given context {math}`x_1, \dots, x_n` according - to the target model, and {math}`p(\hat{x}_{n+1}|x_1, \dots, x_n)`, the + Given $q(\hat{x}_{n+1}|x_1, \dots, x_n)$, the probability of + $\hat{x}_{n+1}$ given context $x_1, \dots, x_n$ according + to the target model, and $p(\hat{x}_{n+1}|x_1, \dots, x_n)$, the same conditional probability according to the draft model, the token is accepted with probability: - :::{math} + $$ \min\left(1, \frac{q(\hat{x}_{n+1}|x_1, \dots, x_n)} {p(\hat{x}_{n+1}|x_1, \dots, x_n)}\right) - ::: + $$ This implementation does not apply causality. When using the output, if a token is rejected, subsequent tokens should not be used. @@ -314,30 +314,31 @@ class RejectionSampler(SpecDecodeStochasticBaseSampler): target model is recovered (within hardware numerics). The probability distribution used in this rejection case is constructed - as follows. Given {math}`q(x|x_1, \dots, x_n)`, the probability of - {math}`x` given context {math}`x_1, \dots, x_n` according to the target - model and {math}`p(x|x_1, \dots, x_n)`, the same conditional probability + as follows. 
Given $q(x|x_1, \dots, x_n)$, the probability of + $x$ given context $x_1, \dots, x_n$ according to the target + model and $p(x|x_1, \dots, x_n)$, the same conditional probability according to the draft model: - :::{math} + $$ x_{n+1} \sim (q(x|x_1, \dots, x_n) - p(x|x_1, \dots, x_n))_+ - ::: + $$ - where {math}`(f(x))_+` is defined as: + where $(f(x))_+$ is defined as: - :::{math} + $$ (f(x))_+ = \frac{\max(0, f(x))}{\sum_x \max(0, f(x))} - ::: + $$ See https://github.com/vllm-project/vllm/pull/2336 for a visualization of the draft, target, and recovered probability distributions. Returns a tensor of shape [batch_size, k, vocab_size]. - Note: This batches operations on GPU and thus constructs the recovered - distribution for all tokens, even if they are accepted. This causes - division-by-zero errors, so we use self._smallest_positive_value to - avoid that. This introduces some drift to the distribution. + Note: + This batches operations on GPU and thus constructs the recovered + distribution for all tokens, even if they are accepted. This causes + division-by-zero errors, so we use self._smallest_positive_value to + avoid that. This introduces some drift to the distribution. 
""" _, k, _ = draft_probs.shape diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 70463ecd90ae7..afc0597197962 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -96,7 +96,7 @@ class RotaryEmbedding(CustomOp): head_size: int, rotary_dim: int, max_position_embeddings: int, - base: int, + base: float, is_neox_style: bool, dtype: torch.dtype, ) -> None: @@ -113,7 +113,7 @@ class RotaryEmbedding(CustomOp): self.cos_sin_cache: torch.Tensor self.register_buffer("cos_sin_cache", cache, persistent=False) - def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor: + def _compute_inv_freq(self, base: float) -> torch.Tensor: """Compute the inverse frequency.""" # NOTE(woosuk): To exactly match the HF implementation, we need to # use CPU to compute the cache and then move it to GPU. However, we @@ -404,7 +404,7 @@ class LinearScalingRotaryEmbedding(RotaryEmbedding): head_size: int, rotary_dim: int, max_position_embeddings: int, - base: int, + base: float, is_neox_style: bool, scaling_factors: Union[list[float], float], dtype: torch.dtype, @@ -464,7 +464,7 @@ class NTKScalingRotaryEmbedding(RotaryEmbedding): head_size: int, rotary_dim: int, max_position_embeddings: int, - base: int, + base: float, is_neox_style: bool, scaling_factor: float, dtype: torch.dtype, @@ -474,7 +474,7 @@ class NTKScalingRotaryEmbedding(RotaryEmbedding): super().__init__(head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype) - def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor: + def _compute_inv_freq(self, base: float) -> torch.Tensor: base = self.base * (self.scaling_factor if self.mixed_b is None else 1) inv_freq = super()._compute_inv_freq(base) @@ -501,7 +501,7 @@ class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding): head_size: int, rotary_dim: int, max_position_embeddings: int, - base: int, + base: float, 
is_neox_style: bool, scaling_factor: float, dtype: torch.dtype, @@ -582,7 +582,7 @@ class YaRNScalingRotaryEmbedding(RotaryEmbedding): head_size: int, rotary_dim: int, max_position_embeddings: int, - base: int, + base: float, is_neox_style: bool, scaling_factor: float, dtype: torch.dtype, @@ -644,7 +644,7 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module): rotary_dim: int, max_position_embeddings: int, original_max_position_embeddings: int, - base: int, + base: float, is_neox_style: bool, dtype: torch.dtype, short_factor: list[float], @@ -769,7 +769,7 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding): head_size: int, rotary_dim: int, max_position_embeddings: int, - base: int, + base: float, is_neox_style: bool, scaling_factor: float, dtype: torch.dtype, @@ -877,7 +877,7 @@ class Llama3RotaryEmbedding(RotaryEmbedding): head_size: int, rotary_dim: int, max_position_embeddings: int, - base: int, + base: float, is_neox_style: bool, dtype: torch.dtype, scaling_factor: float, @@ -892,7 +892,7 @@ class Llama3RotaryEmbedding(RotaryEmbedding): super().__init__(head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype) - def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor: + def _compute_inv_freq(self, base: float) -> torch.Tensor: inv_freqs = super()._compute_inv_freq(base) low_freq_wavelen = self.orig_max_position / self.low_freq_factor high_freq_wavelen = self.orig_max_position / self.high_freq_factor @@ -923,14 +923,14 @@ class Llama4VisionRotaryEmbedding(RotaryEmbedding): head_size: int, rotary_dim: int, max_position_embeddings: int, - base: int, + base: float, is_neox_style: bool, dtype: torch.dtype, ): super().__init__(head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype) - def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor: + def _compute_inv_freq(self, base: float) -> torch.Tensor: inv_freqs = super()._compute_inv_freq(base) inv_freqs = inv_freqs[:(self.rotary_dim // 2)] return 
inv_freqs @@ -989,7 +989,7 @@ class MRotaryEmbedding(RotaryEmbedding): head_size: int, rotary_dim: int, max_position_embeddings: int, - base: int, + base: float, is_neox_style: bool, dtype: torch.dtype, mrope_section: Optional[list[int]] = None, @@ -1529,7 +1529,7 @@ class DualChunkRotaryEmbedding(CustomOp): head_size: int, rotary_dim: int, max_position_embeddings: int, - base: int, + base: float, is_neox_style: bool, dtype: torch.dtype, chunk_size: int, @@ -1558,7 +1558,7 @@ class DualChunkRotaryEmbedding(CustomOp): q_inter_cache, persistent=False) - def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor: + def _compute_inv_freq(self, base: float) -> torch.Tensor: """Compute the inverse frequency.""" # NOTE(woosuk): The HF implementation uses `torch.arange(...).float()`. # However, we use `torch.arange(..., dtype=torch.float)` instead to @@ -1705,7 +1705,7 @@ def get_rope( head_size: int, rotary_dim: int, max_position: int, - base: int, + base: float, is_neox_style: bool = True, rope_scaling: Optional[dict[str, Any]] = None, dtype: Optional[torch.dtype] = None, diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index d6b910e4b75a0..32375db0c8f1a 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -228,17 +228,19 @@ class Sampler(nn.Module): ) -> Optional[SamplerOutput]: """ Single-step scheduling: - * Perform GPU-side sampling computation & compute - GPU-side logprobs tensor - * Pythonize sampling result & logprobs tensor + * Perform GPU-side sampling computation & compute + GPU-side logprobs tensor + * Pythonize sampling result & logprobs tensor Multi-step scheduling: - * Perform GPU-side sampling computation & compute - GPU-side logprobs tensor - * Defer Pythonization of sampling result & logprobs - tensor - * Encapsulate arguments required for deferred Pythonization - in the {class}`SamplerOutput` structure + * Perform GPU-side sampling computation & compute + 
GPU-side logprobs tensor + * Defer Pythonization of sampling result & logprobs + tensor + * Encapsulate arguments required for deferred Pythonization + in the + [`SamplerOutput`][vllm.model_executor.layers.sampler.SamplerOutput] + structure Args: logits: (num_tokens, vocab_size). diff --git a/vllm/model_executor/layers/typical_acceptance_sampler.py b/vllm/model_executor/layers/typical_acceptance_sampler.py index 527a301cd8e26..a14c86148e730 100644 --- a/vllm/model_executor/layers/typical_acceptance_sampler.py +++ b/vllm/model_executor/layers/typical_acceptance_sampler.py @@ -93,29 +93,27 @@ class TypicalAcceptanceSampler(SpecDecodeDeterministicBaseSampler): Evaluates and returns a mask of accepted tokens based on the posterior probabilities. - Parameters: - ---------- - target_probs : torch.Tensor - A tensor of shape (batch_size, k, vocab_size) representing - the probabilities of each token in the vocabulary for each - position in the proposed sequence. This is the distribution - generated by the target model. - draft_token_ids : torch.Tensor - A tensor of shape (batch_size, k) representing the proposed - token ids. + Args: + target_probs (torch.Tensor): A tensor of shape + (batch_size, k, vocab_size) representing the probabilities of + each token in the vocabulary for each position in the proposed + sequence. This is the distribution generated by the target + model. + draft_token_ids (torch.Tensor): A tensor of shape (batch_size, k) + representing the proposed token ids. 
A draft token_id x_{n+k} is accepted if it satisfies the following condition - :::{math} + $$ p_{\text{original}}(x_{n+k} | x_1, x_2, \dots, x_{n+k-1}) > \min \left( \epsilon, \delta * \exp \left( -H(p_{\text{original}}( \cdot | x_1, x_2, \ldots, x_{n+k-1})) \right) \right) - ::: + $$ - where {math}`p_{\text{original}}` corresponds to target_probs - and {math}`\epsilon` and {math}`\delta` correspond to hyperparameters + where $p_{\text{original}}$ corresponds to target_probs + and $\epsilon$ and $\delta$ correspond to hyperparameters specified using self._posterior_threshold and self._posterior_alpha This method computes the posterior probabilities for the given @@ -126,13 +124,10 @@ class TypicalAcceptanceSampler(SpecDecodeDeterministicBaseSampler): returns a boolean mask indicating which tokens can be accepted. Returns: - ------- - torch.Tensor - A boolean tensor of shape (batch_size, k) where each element - indicates whether the corresponding draft token has been accepted - or rejected. True indicates acceptance and false indicates - rejection. - + torch.Tensor: A boolean tensor of shape (batch_size, k) where each + element indicates whether the corresponding draft token has + been accepted or rejected. True indicates acceptance and false + indicates rejection. """ device = target_probs.device candidates_prob = torch.gather( @@ -156,17 +151,14 @@ class TypicalAcceptanceSampler(SpecDecodeDeterministicBaseSampler): The recovered token ids will fill the first unmatched token by the target token. - Parameters - ---------- - target_probs : torch.Tensor - A tensor of shape (batch_size, k, vocab_size) containing - the target probability distribution + Args: + target_probs (torch.Tensor): A tensor of shape + (batch_size, k, vocab_size) containing the target probability + distribution. - Returns - ------- - torch.Tensor - A tensor of shape (batch_size, k) with the recovered token - ids which are selected from target probs. 
+ Returns: + torch.Tensor: A tensor of shape (batch_size, k) with the recovered + token ids which are selected from target probs. """ max_indices = torch.argmax(target_probs, dim=-1) diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index 18783d0d77856..001e6aaf0cc7f 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -70,9 +70,9 @@ def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor, def rocm_unquantized_gemm(x: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None): - from vllm.platforms.rocm import on_mi250_mi300 + from vllm.platforms.rocm import on_gfx9 k = weight.shape[1] - use_skinny = (envs.VLLM_ROCM_USE_SKINNY_GEMM and on_mi250_mi300() and \ + use_skinny = (envs.VLLM_ROCM_USE_SKINNY_GEMM and on_gfx9() and \ x.dtype in [torch.float16, torch.bfloat16] \ and k % 8 == 0 and bias is None) diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py index 92a0b0923b6e0..a443a652d8a3f 100644 --- a/vllm/model_executor/model_loader/__init__.py +++ b/vllm/model_executor/model_loader/__init__.py @@ -1,8 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 +from typing import Optional + from torch import nn -from vllm.config import LoadConfig, LoadFormat, VllmConfig +from vllm.config import LoadConfig, LoadFormat, ModelConfig, VllmConfig from vllm.model_executor.model_loader.base_loader import BaseModelLoader from vllm.model_executor.model_loader.bitsandbytes_loader import ( BitsAndBytesModelLoader) @@ -47,9 +49,14 @@ def get_model_loader(load_config: LoadConfig) -> BaseModelLoader: return DefaultModelLoader(load_config) -def get_model(*, vllm_config: VllmConfig) -> nn.Module: +def get_model(*, + vllm_config: VllmConfig, + model_config: Optional[ModelConfig] = None) -> nn.Module: loader = get_model_loader(vllm_config.load_config) - return loader.load_model(vllm_config=vllm_config) + if model_config is None: + 
model_config = vllm_config.model_config + return loader.load_model(vllm_config=vllm_config, + model_config=model_config) __all__ = [ diff --git a/vllm/model_executor/model_loader/base_loader.py b/vllm/model_executor/model_loader/base_loader.py index f17cab05c25d3..d619d9f25e087 100644 --- a/vllm/model_executor/model_loader/base_loader.py +++ b/vllm/model_executor/model_loader/base_loader.py @@ -1,9 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 from abc import ABC, abstractmethod +import torch import torch.nn as nn from vllm.config import LoadConfig, ModelConfig, VllmConfig +from vllm.model_executor.model_loader.utils import ( + initialize_model, process_weights_after_loading, set_default_torch_dtype) class BaseModelLoader(ABC): @@ -18,6 +21,22 @@ class BaseModelLoader(ABC): raise NotImplementedError @abstractmethod - def load_model(self, *, vllm_config: VllmConfig) -> nn.Module: - """Load a model with the given configurations.""" + def load_weights(self, model: nn.Module, + model_config: ModelConfig) -> None: + """Load weights into a model. 
This standalone API allows + inplace weights loading for an already-initialized model""" raise NotImplementedError + + def load_model(self, vllm_config: VllmConfig, + model_config: ModelConfig) -> nn.Module: + """Load a model with the given configurations.""" + device_config = vllm_config.device_config + target_device = torch.device(device_config.device) + with set_default_torch_dtype(model_config.dtype): + with target_device: + model = initialize_model(vllm_config=vllm_config, + model_config=model_config) + # Quantization does not happen in `load_weights` but after it + self.load_weights(model, model_config) + process_weights_after_loading(model, model_config, target_device) + return model.eval() diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index 6771c128c5a1b..3df835a938968 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # ruff: noqa: SIM117 -import copy import fnmatch import glob import itertools @@ -15,7 +14,7 @@ from huggingface_hub import HfApi from torch import nn from transformers.utils import SAFE_WEIGHTS_INDEX_NAME -from vllm.config import LoadConfig, ModelConfig, VllmConfig +from vllm.config import LoadConfig, ModelConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) # yapf: enable @@ -29,14 +28,14 @@ from vllm.model_executor.layers.linear import (LinearBase, RowParallelLinear) from vllm.model_executor.model_loader.base_loader import BaseModelLoader from vllm.model_executor.model_loader.utils import (ParamMapping, - initialize_model, set_default_torch_dtype) from vllm.model_executor.model_loader.weight_utils import ( download_safetensors_index_file_from_hf, download_weights_from_hf, filter_duplicate_safetensors_files, filter_files_not_needed_for_inference, pt_weights_iterator, 
safetensors_weights_iterator) from vllm.model_executor.models import is_pooling_model -from vllm.model_executor.utils import set_weight_attrs +from vllm.model_executor.utils import (get_packed_modules_mapping, + set_weight_attrs) from vllm.platforms import current_platform logger = init_logger(__name__) @@ -408,8 +407,7 @@ class BitsAndBytesModelLoader(BaseModelLoader): ), "vllm currently does not support BNB quantization for" f" {type(model).__name__}" - def _load_weights(self, model_config: ModelConfig, - model: nn.Module) -> None: + def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None: if not hasattr(model, "load_weights"): raise AttributeError( "The required method 'load_weights' is not defined in class" @@ -420,8 +418,8 @@ class BitsAndBytesModelLoader(BaseModelLoader): f"Model {type(model).__name__} does not support BitsAndBytes " "quantization yet. No 'packed_modules_mapping' found.") self.is_pool_model=is_pooling_model(model) - self.modules_mapping = ParamMapping( - copy.deepcopy(model.packed_modules_mapping)) + + self.modules_mapping = ParamMapping(get_packed_modules_mapping(model)) # For some models like Molmo, we need to use hf_to_vllm_mapper # to ensure correct loading of weights. 
@@ -568,16 +566,3 @@ class BitsAndBytesModelLoader(BaseModelLoader): def download_model(self, model_config: ModelConfig) -> None: self._prepare_weights(model_config.model, model_config.revision) - - def load_model(self, vllm_config: VllmConfig) -> nn.Module: - device_config = vllm_config.device_config - model_config = vllm_config.model_config - - with set_default_torch_dtype(model_config.dtype): - with torch.device(device_config.device): - - model = initialize_model(vllm_config=vllm_config) - - self._load_weights(model_config, model) - - return model.eval() diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index 21eb7d8a75fbf..6946627a54d24 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -11,12 +11,10 @@ import torch from torch import nn from transformers.utils import SAFE_WEIGHTS_INDEX_NAME -from vllm.config import LoadConfig, LoadFormat, ModelConfig, VllmConfig -from vllm.envs import VLLM_USE_MODELSCOPE +from vllm import envs +from vllm.config import LoadConfig, LoadFormat, ModelConfig from vllm.logger import init_logger from vllm.model_executor.model_loader.base_loader import BaseModelLoader -from vllm.model_executor.model_loader.utils import ( - initialize_model, process_weights_after_loading, set_default_torch_dtype) from vllm.model_executor.model_loader.weight_utils import ( download_safetensors_index_file_from_hf, download_weights_from_hf, fastsafetensors_weights_iterator, filter_duplicate_safetensors_files, @@ -64,7 +62,7 @@ class DefaultModelLoader(BaseModelLoader): Returns the path to the downloaded model, or None if the model is not downloaded from ModelScope.""" - if VLLM_USE_MODELSCOPE: + if envs.VLLM_USE_MODELSCOPE: # download model from ModelScope hub, # lazy import so that modelscope is not required for normal use. # pylint: disable=C. 
@@ -264,31 +262,20 @@ class DefaultModelLoader(BaseModelLoader): fall_back_to_pt=True, allow_patterns_overrides=None) - def load_model(self, vllm_config: VllmConfig) -> nn.Module: - device_config = vllm_config.device_config - model_config = vllm_config.model_config - target_device = torch.device(device_config.device) - with set_default_torch_dtype(model_config.dtype): - with target_device: - model = initialize_model(vllm_config=vllm_config) - - weights_to_load = {name for name, _ in model.named_parameters()} - loaded_weights = model.load_weights( - self.get_all_weights(model_config, model)) - self.counter_after_loading_weights = time.perf_counter() - logger.info( - "Loading weights took %.2f seconds", - self.counter_after_loading_weights - - self.counter_before_loading_weights) - # We only enable strict check for non-quantized models - # that have loaded weights tracking currently. - if model_config.quantization is None and loaded_weights is not None: - weights_not_loaded = weights_to_load - loaded_weights - if weights_not_loaded: - raise ValueError( - "Following weights were not initialized from " - f"checkpoint: {weights_not_loaded}") - - process_weights_after_loading(model, model_config, target_device) - - return model.eval() + def load_weights(self, model: nn.Module, + model_config: ModelConfig) -> None: + weights_to_load = {name for name, _ in model.named_parameters()} + loaded_weights = model.load_weights( + self.get_all_weights(model_config, model)) + self.counter_after_loading_weights = time.perf_counter() + logger.info( + "Loading weights took %.2f seconds", + self.counter_after_loading_weights - + self.counter_before_loading_weights) + # We only enable strict check for non-quantized models + # that have loaded weights tracking currently. 
+ if model_config.quantization is None and loaded_weights is not None: + weights_not_loaded = weights_to_load - loaded_weights + if weights_not_loaded: + raise ValueError("Following weights were not initialized from " + f"checkpoint: {weights_not_loaded}") diff --git a/vllm/model_executor/model_loader/dummy_loader.py b/vllm/model_executor/model_loader/dummy_loader.py index 5047a161f3f97..64fa2be76d08b 100644 --- a/vllm/model_executor/model_loader/dummy_loader.py +++ b/vllm/model_executor/model_loader/dummy_loader.py @@ -1,11 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 -import torch import torch.nn as nn -from vllm.config import LoadConfig, ModelConfig, VllmConfig +from vllm.config import LoadConfig, ModelConfig from vllm.model_executor.model_loader.base_loader import BaseModelLoader -from vllm.model_executor.model_loader.utils import ( - initialize_model, process_weights_after_loading, set_default_torch_dtype) from vllm.model_executor.model_loader.weight_utils import ( initialize_dummy_weights) @@ -22,16 +19,8 @@ class DummyModelLoader(BaseModelLoader): def download_model(self, model_config: ModelConfig) -> None: pass # Nothing to download - def load_model(self, vllm_config: VllmConfig) -> nn.Module: - device_config = vllm_config.device_config - model_config = vllm_config.model_config - target_device = torch.device(device_config.device) - with set_default_torch_dtype(model_config.dtype): - with target_device: - model = initialize_model(vllm_config=vllm_config) - # NOTE(woosuk): For accurate performance evaluation, we assign - # random values to the weights. - initialize_dummy_weights(model) - - process_weights_after_loading(model, model_config, target_device) - return model.eval() + def load_weights(self, model: nn.Module, + model_config: ModelConfig) -> None: + # NOTE(woosuk): For accurate performance evaluation, we assign + # random values to the weights. 
+ initialize_dummy_weights(model) diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py index 2766c9787b83e..1eac504227e25 100644 --- a/vllm/model_executor/model_loader/gguf_loader.py +++ b/vllm/model_executor/model_loader/gguf_loader.py @@ -92,9 +92,16 @@ class GGUFModelLoader(BaseModelLoader): def download_model(self, model_config: ModelConfig) -> None: self._prepare_weights(model_config.model) - def load_model(self, vllm_config: VllmConfig) -> nn.Module: + def load_weights(self, model: nn.Module, + model_config: ModelConfig) -> None: + local_model_path = self._prepare_weights(model_config.model) + gguf_weights_map = self._get_gguf_weights_map(model_config) + model.load_weights( + self._get_weights_iterator(local_model_path, gguf_weights_map)) + + def load_model(self, vllm_config: VllmConfig, + model_config: ModelConfig) -> nn.Module: device_config = vllm_config.device_config - model_config = vllm_config.model_config local_model_path = self._prepare_weights(model_config.model) gguf_weights_map = self._get_gguf_weights_map(model_config) # we can only know if tie word embeddings after mapping weights @@ -106,8 +113,7 @@ class GGUFModelLoader(BaseModelLoader): with set_default_torch_dtype(model_config.dtype): with target_device: model = initialize_model(vllm_config=vllm_config) - model.load_weights( - self._get_weights_iterator(local_model_path, gguf_weights_map)) + self.load_weights(model, model_config) process_weights_after_loading(model, model_config, target_device) return model diff --git a/vllm/model_executor/model_loader/neuronx_distributed.py b/vllm/model_executor/model_loader/neuronx_distributed.py index b98cea7fe6e16..72ad4da296ac6 100644 --- a/vllm/model_executor/model_loader/neuronx_distributed.py +++ b/vllm/model_executor/model_loader/neuronx_distributed.py @@ -17,6 +17,8 @@ from neuronx_distributed_inference.models.config import ( FusedSpecNeuronConfig, OnDeviceSamplingConfig) from 
neuronx_distributed_inference.models.mllama.utils import ( create_vision_mask) +from neuronx_distributed_inference.modules.lora_serving import ( + LoraServingConfig) from neuronx_distributed_inference.utils.hf_adapter import ( load_pretrained_config) from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig @@ -80,23 +82,37 @@ class NeuronCausalLM(nn.Module): # Lazy initialized self.model: nn.Module - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - input_block_ids: torch.Tensor, - sampling_params: torch.Tensor, - ) -> torch.Tensor: + def forward(self, + input_ids: torch.Tensor, + positions: torch.Tensor, + input_block_ids: torch.Tensor, + sampling_params: torch.Tensor, + prev_hidden: Optional[torch.Tensor] = None, + adapter_ids: Optional[torch.Tensor] = None) -> torch.Tensor: + # sort block ids sequentially for perf/neuron support reasons + sorted_input_block_ids, sorted_indices = torch.sort(input_block_ids) + input_ids = torch.index_select(input_ids, 0, sorted_indices) + positions = torch.index_select(positions, 0, sorted_indices) + sampling_params = torch.index_select(sampling_params, 0, + sorted_indices) output = self.model(input_ids, attention_mask=None, position_ids=positions, - seq_ids=input_block_ids, - sampling_params=sampling_params) + seq_ids=sorted_input_block_ids, + sampling_params=sampling_params, + prev_hidden=prev_hidden, + adapter_ids=adapter_ids) # on-device sampling if self.config.neuron_config.on_device_sampling_config: - return output.hidden_states + output = output.hidden_states else: - return output.logits[:, -1, :] + output = output.logits[:, -1, :] + + restored_indices = torch.argsort(sorted_indices) + if input_block_ids.shape[0] != 1: + output = torch.index_select(output, 0, restored_indices) + + return output def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata) -> torch.Tensor: @@ -188,6 +204,11 @@ class NeuronMllamaForCausalLM(nn.Module): config: 
PretrainedConfig, on_device_sampling_disabled: bool = False) -> None: super().__init__() + # has_image is the only multimodal input that is used in + # token-generation + # This is a cache (on CPU) that saves has_image data per sequence id + # The number of entries in this cache is <= Batch-Size + self.has_image_cache: dict[int, torch.Tensor] = {} self.config = config self.logits_processor = LogitsProcessor( config.get_text_config().vocab_size, logits_as_input=True) @@ -199,11 +220,57 @@ class NeuronMllamaForCausalLM(nn.Module): # Lazy initialized self.model: nn.Module + self.is_reorder_needed: bool = True + + def read_from_has_image_cache(self, seq_ids: torch.Tensor): + has_image_list = [] + for index in range(len(seq_ids)): + seq_id = seq_ids[index].item() + if seq_id in self.has_image_cache: + has_image_list.append(self.has_image_cache[seq_id]) + else: + has_image_list.append(torch.tensor([0])) + return torch.tensor(has_image_list) + + def write_to_has_image_cache(self, seq_ids: torch.Tensor, + has_image: torch.Tensor): + for index in range(len(seq_ids)): + seq_id = seq_ids[index].item() + if index < len(has_image): + self.has_image_cache[seq_id] = has_image[index] + else: + self.has_image_cache[seq_id] = torch.zeros(1) def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, seq_ids: torch.Tensor, pixel_values: torch.Tensor, aspect_ratios: torch.Tensor, num_chunks: torch.Tensor, has_image: torch.Tensor, sampling_params) -> torch.Tensor: + + # We update the has_image cache during prefill + # and read the has_image cache during decode + if input_ids.shape[-1] > 1: # prefill + self.write_to_has_image_cache(seq_ids, has_image) + else: + has_image = self.read_from_has_image_cache(seq_ids) + bs = input_ids.shape[0] + num_chunks = torch.zeros((bs, 1)) + aspect_ratios = torch.zeros((bs, 1, 2)) + + input_block_ids = seq_ids + origin_input_block_ids = seq_ids + if self.is_reorder_needed: + # sort block ids sequentially for perf/neuron support reasons + 
input_block_ids, sorted_indices = torch.sort(input_block_ids) + input_ids = torch.index_select(input_ids, 0, sorted_indices) + positions = torch.index_select(positions, 0, sorted_indices) + sampling_params = torch.index_select(sampling_params, 0, + sorted_indices) + pixel_values = torch.index_select(pixel_values, 0, sorted_indices) + aspect_ratios = torch.index_select(aspect_ratios, 0, + sorted_indices) + num_chunks = torch.index_select(num_chunks, 0, sorted_indices) + has_image = torch.index_select(has_image, 0, sorted_indices) + self.vision_mask = create_vision_mask(input_ids, self.vision_token_id) output = self.model( input_ids.to(torch.int32), @@ -219,8 +286,14 @@ class NeuronMllamaForCausalLM(nn.Module): has_image=has_image.to(torch.int32), ) if self.config.neuron_config.on_device_sampling_config: - return output.hidden_states - return output.logits[:, -1, :] + output = output.hidden_states + else: + output = output.logits[:, -1, :] + + if self.is_reorder_needed and origin_input_block_ids.shape[0] != 1: + restored_indices = torch.argsort(sorted_indices) + output = torch.index_select(output, 0, restored_indices) + return output def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata) -> torch.Tensor: @@ -283,7 +356,7 @@ class NeuronMllamaForCausalLM(nn.Module): self.model = neuronx_model_cls(compiled_model_path) tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) self.vision_token_id = tokenizer( - "<|image|>", add_special_tokens=False).input_ids + "<|image|>", add_special_tokens=False).input_ids[0] self.model.load(compiled_model_path) return except (FileNotFoundError, ValueError): @@ -310,7 +383,7 @@ class NeuronMllamaForCausalLM(nn.Module): # Read "<|image|>" token_id from the tokenizer self.vision_token_id = tokenizer("<|image|>", - add_special_tokens=False).input_ids + add_special_tokens=False).input_ids[0] logger.info("\nLoading model from compiled checkpoint...") self.model.load(compiled_model_path) @@ -340,14 
+413,26 @@ class NeuronSpeculationCausalLM(nn.Module): input_block_ids: torch.Tensor, sampling_params: torch.Tensor, ) -> torch.Tensor: + # sort block ids sequentially for perf/neuron support reasons + sorted_input_block_ids, sorted_indices = torch.sort(input_block_ids) + input_ids = torch.index_select(input_ids, 0, sorted_indices) + positions = torch.index_select(positions, 0, sorted_indices) + sampling_params = torch.index_select(sampling_params, 0, + sorted_indices) + output = self.model(input_ids, attention_mask=None, position_ids=positions, - seq_ids=input_block_ids, + seq_ids=sorted_input_block_ids, sampling_params=sampling_params) + restored_indices = torch.argsort(sorted_indices) + # CTX encoding if (positions[:, 0]).sum().item() == 0: - return output.fused_outputs[0][:, 0:1] + output = output.fused_outputs[0][:, 0:1] + if input_block_ids.shape[0] != 1: + output = torch.index_select(output, 0, restored_indices) + return output # Fused Spec (Generation) accepted_tokens_with_padding = output.fused_outputs[0] @@ -362,6 +447,10 @@ class NeuronSpeculationCausalLM(nn.Module): -1) >= generated_token_counts accepted_tokens_with_padding[mask] = -1 + if input_block_ids.shape[0] != 1: + accepted_tokens_with_padding = torch.index_select( + accepted_tokens_with_padding, 0, restored_indices) + return accepted_tokens_with_padding def sample( @@ -416,6 +505,10 @@ class NeuronSpeculationCausalLM(nn.Module): draft_neuron_config.speculation_length = 0 draft_neuron_config.trace_tokengen_model = True draft_neuron_config.enable_fused_speculation = False + if getattr(config.neuron_config, "draft_model_modules_to_not_convert", + None): + draft_neuron_config.modules_to_not_convert = ( + draft_neuron_config.draft_model_modules_to_not_convert) if config.neuron_config.enable_eagle_speculation: draft_neuron_config.is_eagle_draft = True draft_neuron_config.sequence_parallel_enabled = False @@ -489,7 +582,8 @@ def _get_model_architecture(config: PretrainedConfig) -> str: def 
_get_default_neuron_config(model_config: ModelConfig, parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig): + scheduler_config: SchedulerConfig, + lora_serving_config: LoraServingConfig): """Generate a neuron config based on vllm config args.""" on_device_sampling_config = OnDeviceSamplingConfig(dynamic=True, deterministic=False) @@ -502,13 +596,13 @@ def _get_default_neuron_config(model_config: ModelConfig, max_context_length=scheduler_config.max_model_len, seq_len=scheduler_config.max_model_len, enable_bucketing=True, - is_continuous_batching=(batch_size > 1), + is_continuous_batching=True, quantized=False, torch_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype], padding_side="right", on_device_sampling_config=on_device_sampling_config, sequence_parallel_enabled=True, - ) + lora_serving_config=lora_serving_config) return neuron_config @@ -520,6 +614,7 @@ def _get_default_speculation_config(model_config: ModelConfig, args.""" neuron_config = dict( tp_degree=parallel_config.tensor_parallel_size, + ctx_batch_size=1, batch_size=scheduler_config.max_num_seqs, max_context_length=scheduler_config.max_model_len, seq_len=scheduler_config.max_model_len, @@ -527,6 +622,7 @@ def _get_default_speculation_config(model_config: ModelConfig, trace_tokengen_model=False, enable_fused_speculation=True, enable_bucketing=True, + is_continuous_batching=True, quantized=False, torch_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype], on_device_sampling_config=dict( @@ -546,7 +642,8 @@ def _get_neuron_config_after_override(default_neuron_config, def get_neuron_model(model_config: ModelConfig, parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig) -> nn.Module: + scheduler_config: SchedulerConfig, + lora_serving_config: LoraServingConfig) -> nn.Module: """Initializes a neuron-optimized model for inference.""" model_arch = _get_model_architecture(model_config.hf_config) if model_arch == "MllamaForConditionalGeneration": @@ -554,7 +651,7 @@ def 
get_neuron_model(model_config: ModelConfig, else: model = NeuronCausalLM(model_config.hf_config) default_neuron_config_args = _get_default_neuron_config( - model_config, parallel_config, scheduler_config) + model_config, parallel_config, scheduler_config, lora_serving_config) neuron_config = _get_neuron_config_after_override( default_neuron_config_args, model_config.override_neuron_config) diff --git a/vllm/model_executor/model_loader/runai_streamer_loader.py b/vllm/model_executor/model_loader/runai_streamer_loader.py index a695ba03bd1db..a39e26c6da50d 100644 --- a/vllm/model_executor/model_loader/runai_streamer_loader.py +++ b/vllm/model_executor/model_loader/runai_streamer_loader.py @@ -9,10 +9,8 @@ import torch from torch import nn from transformers.utils import SAFE_WEIGHTS_INDEX_NAME -from vllm.config import LoadConfig, ModelConfig, VllmConfig +from vllm.config import LoadConfig, ModelConfig from vllm.model_executor.model_loader.base_loader import BaseModelLoader -from vllm.model_executor.model_loader.utils import ( - initialize_model, process_weights_after_loading, set_default_torch_dtype) from vllm.model_executor.model_loader.weight_utils import ( download_safetensors_index_file_from_hf, download_weights_from_hf, runai_safetensors_weights_iterator) @@ -100,22 +98,11 @@ class RunaiModelStreamerLoader(BaseModelLoader): """Download model if necessary""" self._prepare_weights(model_config.model, model_config.revision) - def load_model(self, vllm_config: VllmConfig) -> nn.Module: - """Perform streaming of the model to destination""" - device_config = vllm_config.device_config - model_config = vllm_config.model_config - - target_device = torch.device(device_config.device) - with set_default_torch_dtype(model_config.dtype): - with target_device: - model = initialize_model(vllm_config=vllm_config) - - model_weights = model_config.model - if hasattr(model_config, "model_weights"): - model_weights = model_config.model_weights - model.load_weights( - 
self._get_weights_iterator(model_weights, - model_config.revision)) - - process_weights_after_loading(model, model_config, target_device) - return model.eval() + def load_weights(self, model: nn.Module, + model_config: ModelConfig) -> None: + """Load weights into a model.""" + model_weights = model_config.model + if hasattr(model_config, "model_weights"): + model_weights = model_config.model_weights + model.load_weights( + self._get_weights_iterator(model_weights, model_config.revision)) diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py index 913bda7e007a7..b5a5031bb6f91 100644 --- a/vllm/model_executor/model_loader/sharded_state_loader.py +++ b/vllm/model_executor/model_loader/sharded_state_loader.py @@ -9,11 +9,9 @@ from typing import Any, Optional import torch from torch import nn -from vllm.config import LoadConfig, ModelConfig, VllmConfig +from vllm.config import LoadConfig, ModelConfig from vllm.logger import init_logger from vllm.model_executor.model_loader.base_loader import BaseModelLoader -from vllm.model_executor.model_loader.utils import ( - initialize_model, process_weights_after_loading, set_default_torch_dtype) from vllm.model_executor.model_loader.weight_utils import ( download_weights_from_hf, runai_safetensors_weights_iterator) from vllm.transformers_utils.s3_utils import glob as s3_glob @@ -100,11 +98,8 @@ class ShardedStateLoader(BaseModelLoader): def download_model(self, model_config: ModelConfig) -> None: self._prepare_weights(model_config.model, model_config.revision) - def load_model(self, vllm_config: VllmConfig) -> nn.Module: - device_config = vllm_config.device_config - model_config = vllm_config.model_config - target_device = torch.device(device_config.device) - + def load_weights(self, model: nn.Module, + model_config: ModelConfig) -> None: from vllm.distributed import get_tensor_model_parallel_rank model_weights = model_config.model @@ -112,53 +107,47 @@ 
class ShardedStateLoader(BaseModelLoader): model_weights = model_config.model_weights local_model_path = model_weights - with set_default_torch_dtype(model_config.dtype): - with target_device: - model = initialize_model(vllm_config=vllm_config) - process_weights_after_loading(model, model_config, - target_device) - rank = get_tensor_model_parallel_rank() - pattern = os.path.join( - local_model_path, - self.pattern.format(rank=rank, part="*"), - ) + rank = get_tensor_model_parallel_rank() + pattern = os.path.join( + local_model_path, + self.pattern.format(rank=rank, part="*"), + ) - filepaths = [] - if is_s3(local_model_path): - file_pattern = f"*{self.pattern.format(rank=rank, part=' * ')}" - filepaths = s3_glob(path=local_model_path, - allow_pattern=[file_pattern]) - else: - filepaths = glob.glob(pattern) - if not filepaths: - # TODO: support un-sharded checkpoints too - raise ValueError( - f"Could not find checkpoint files '{pattern}', only " - f"pre-sharded checkpoints are currently supported!") - state_dict = self._filter_subtensors(model.state_dict()) - for key, tensor in self.iterate_over_files(filepaths): - # If loading with LoRA enabled, additional padding may - # be added to certain parameters. We only load into a - # narrowed view of the parameter data. 
- param_data = state_dict[key].data - param_shape = state_dict[key].shape - for dim, size in enumerate(tensor.shape): - if size < param_shape[dim]: - param_data = param_data.narrow(dim, 0, size) - if tensor.shape != param_shape: - logger.warning( - "loading tensor of shape %s into " - "parameter '%s' of shape %s", - tensor.shape, - key, - param_shape, - ) - param_data.copy_(tensor) - state_dict.pop(key) - if state_dict: - raise ValueError( - f"Missing keys {tuple(state_dict)} in loaded state!") - return model.eval() + filepaths = [] + if is_s3(local_model_path): + file_pattern = f"*{self.pattern.format(rank=rank, part=' * ')}" + filepaths = s3_glob(path=local_model_path, + allow_pattern=[file_pattern]) + else: + filepaths = glob.glob(pattern) + if not filepaths: + # TODO: support un-sharded checkpoints too + raise ValueError( + f"Could not find checkpoint files '{pattern}', only " + f"pre-sharded checkpoints are currently supported!") + state_dict = self._filter_subtensors(model.state_dict()) + for key, tensor in self.iterate_over_files(filepaths): + # If loading with LoRA enabled, additional padding may + # be added to certain parameters. We only load into a + # narrowed view of the parameter data. 
+ param_data = state_dict[key].data + param_shape = state_dict[key].shape + for dim, size in enumerate(tensor.shape): + if size < param_shape[dim]: + param_data = param_data.narrow(dim, 0, size) + if tensor.shape != param_shape: + logger.warning( + "loading tensor of shape %s into " + "parameter '%s' of shape %s", + tensor.shape, + key, + param_shape, + ) + param_data.copy_(tensor) + state_dict.pop(key) + if state_dict: + raise ValueError( + f"Missing keys {tuple(state_dict)} in loaded state!") def iterate_over_files( self, paths) -> Generator[tuple[str, torch.Tensor], None, None]: diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 459c4b4392e3f..90c0bdf08ef88 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -1,24 +1,29 @@ # SPDX-License-Identifier: Apache-2.0 import argparse +import contextlib +import contextvars import dataclasses import io +import json import os -import re +import threading import time from collections.abc import Generator from dataclasses import dataclass from functools import partial -from typing import BinaryIO, Optional, Union +from typing import Any, BinaryIO, Optional, Union +import regex as re import torch from torch import nn +from torch.utils._python_dispatch import TorchDispatchMode from transformers import PretrainedConfig import vllm.envs as envs -from vllm.config import ModelConfig, ParallelConfig, set_current_vllm_config +from vllm.config import (ModelConfig, ParallelConfig, VllmConfig, + set_current_vllm_config) from vllm.engine.arg_utils import EngineArgs -from vllm.engine.llm_engine import LLMEngine from vllm.logger import init_logger from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) @@ -58,9 +63,79 @@ __all__ = [ logger = init_logger(__name__) +class MetaTensorMode(TorchDispatchMode): + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + 
kwargs = kwargs or {} + + if func._schema.name == "aten::empty" and "device" not in kwargs: + kwargs["device"] = "meta" + + return func(*args, **kwargs) + + +def meta_tensor_mode(loading_code=None, ): + + if loading_code is None: + return _NoInitOrTensorImpl.context_manager() + elif callable(loading_code): + with _NoInitOrTensorImpl.context_manager(): + return loading_code() + else: + raise TypeError( + "expected a callable to evaluate," + " or None if being used as a context manager;" + f' got an object of type "{type(loading_code).__name__}" instead.') + + +class _NoInitOrTensorImpl: + _MODULES = (torch.nn.Linear, torch.nn.Embedding, torch.nn.LayerNorm) + _MODULE_ORIGINALS = tuple((m, m.reset_parameters) for m in _MODULES) + + is_active = contextvars.ContextVar("_NoInitOrTensorImpl.is_active", + default=False) + _count_active: int = 0 + _count_active_lock = threading.Lock() + + @classmethod + @contextlib.contextmanager + def context_manager(cls): + if cls.is_active.get(): + yield + return + + with cls._count_active_lock: + cls._count_active += 1 + if cls._count_active == 1: + for mod in cls._MODULES: + mod.reset_parameters = cls._disable(mod.reset_parameters) + + reset_token = cls.is_active.set(True) + + try: + with MetaTensorMode(): + yield + finally: + cls.is_active.reset(reset_token) + with cls._count_active_lock: + cls._count_active -= 1 + if cls._count_active == 0: + for mod, original in cls._MODULE_ORIGINALS: + mod.reset_parameters = original + + @staticmethod + def _disable(func): + + def wrapper(*args, **kwargs): + if not _NoInitOrTensorImpl.is_active.get(): + return func(*args, **kwargs) + + return wrapper + + @dataclass class TensorizerConfig: - tensorizer_uri: str + tensorizer_uri: Union[str, None] = None vllm_tensorized: Optional[bool] = False verify_hash: Optional[bool] = False num_readers: Optional[int] = None @@ -71,12 +146,29 @@ class TensorizerConfig: model_class: Optional[type[torch.nn.Module]] = None hf_config: Optional[PretrainedConfig] = None 
dtype: Optional[Union[str, torch.dtype]] = None + lora_dir: Optional[str] = None _is_sharded: bool = False def __post_init__(self): # check if the configuration is for a sharded vLLM model self._is_sharded = isinstance(self.tensorizer_uri, str) \ and re.search(r'%0\dd', self.tensorizer_uri) is not None + if not self.tensorizer_uri and not self.lora_dir: + raise ValueError("tensorizer_uri must be provided.") + if not self.tensorizer_uri and self.lora_dir: + self.tensorizer_uri = f"{self.lora_dir}/adapter_model.tensors" + assert self.tensorizer_uri is not None, ("tensorizer_uri must be " + "provided.") + self.tensorizer_dir = os.path.dirname(self.tensorizer_uri) + self.lora_dir = self.tensorizer_dir + + @classmethod + def as_dict(cls, *args, **kwargs) -> dict[str, Any]: + cfg = TensorizerConfig(*args, **kwargs) + return dataclasses.asdict(cfg) + + def to_dict(self) -> dict[str, Any]: + return dataclasses.asdict(self) def _construct_tensorizer_args(self) -> "TensorizerArgs": tensorizer_args = { @@ -117,12 +209,6 @@ class TensorizerConfig: **tensorizer_args.stream_params) -def load_with_tensorizer(tensorizer_config: TensorizerConfig, - **extra_kwargs) -> nn.Module: - tensorizer = TensorizerAgent(tensorizer_config, **extra_kwargs) - return tensorizer.deserialize() - - @dataclass class TensorizerArgs: tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, BinaryIO, str, @@ -140,7 +226,9 @@ class TensorizerArgs: Args: tensorizer_uri: Path to serialized model tensors. Can be a local file - path or a S3 URI. + path or a S3 URI. This is a required field unless lora_dir is + provided and the config is meant to be used for the + `tensorize_lora_adapter` function. vllm_tensorized: If True, indicates that the serialized model is a vLLM model. This is used to determine the behavior of the TensorDeserializer when loading tensors from a serialized model. 
@@ -158,7 +246,7 @@ class TensorizerArgs: encryption_keyfile: File path to a binary file containing a binary key to use for decryption. `None` (the default) means no decryption. See the example script in - examples/other/tensorize_vllm_model.py. + examples/others/tensorize_vllm_model.py. s3_access_key_id: The access key for the S3 bucket. Can also be set via the S3_ACCESS_KEY_ID environment variable. s3_secret_access_key: The secret access key for the S3 bucket. Can also @@ -273,100 +361,72 @@ class TensorizerArgs: return tensorizer_args -class TensorizerAgent: - """ - A class for performing tensorizer deserializations specifically for - vLLM models using plaid_mode. Uses TensorizerArgs to configure the - behavior of the TensorDeserializer when loading tensors from a serialized - model. For deserializations of HuggingFace models, TensorDeserializer is - instead used as an iterator directly in the func hf_model_weights_iterator - in vllm/model_executor/model_loader/weight_utils.py - """ +def _check_tensors_on_meta_device(model: nn.Module) -> None: + for tensor in model.state_dict().values(): + if tensor.device.type == 'meta': + raise ValueError( + "The serialized model contains tensors on the meta device," + " indicating that some tensors were not loaded properly." + " Please check that the parameters of the model being" + " specified match that of the serialized model, such as" + " its quantization.") - def __init__(self, tensorizer_config: TensorizerConfig, vllm_config): - self.tensorizer_config = tensorizer_config - self.tensorizer_args = ( - self.tensorizer_config._construct_tensorizer_args()) - self.vllm_config = vllm_config - self.model = self._init_model() - def _init_model(self): - assert self.tensorizer_config.hf_config is not None - model_args = self.tensorizer_config.hf_config - model_args.torch_dtype = self.tensorizer_config.dtype - assert self.tensorizer_config.model_class is not None - # TODO: Do we need to consider old-style model class? 
- with no_init_or_tensor(), set_current_vllm_config(self.vllm_config, - check_compile=True): - return self.tensorizer_config.model_class( - vllm_config=self.vllm_config, ) +def _resize_lora_embeddings(model: nn.Module): + """Modify LoRA embedding layers to use bigger tensors + to allow for adapter added tokens.""" + for child in model.modules(): + if (isinstance(child, VocabParallelEmbedding) and child.weight.shape[0] + < child.num_embeddings_per_partition): + new_weight = torch.empty(child.num_embeddings_per_partition, + child.embedding_dim, + dtype=child.weight.dtype, + device=child.weight.device) + new_weight[:child.weight.shape[0]].copy_(child.weight.data) + new_weight[child.weight.shape[0]:].fill_(0) + child.weight.data = new_weight - def _resize_lora_embeddings(self): - """Modify LoRA embedding layers to use bigger tensors - to allow for adapter added tokens.""" - for child in self.model.modules(): - if (isinstance(child, VocabParallelEmbedding) - and child.weight.shape[0] - < child.num_embeddings_per_partition): - new_weight = torch.empty(child.num_embeddings_per_partition, - child.embedding_dim, - dtype=child.weight.dtype, - device=child.weight.device) - new_weight[:child.weight.shape[0]].copy_(child.weight.data) - new_weight[child.weight.shape[0]:].fill_(0) - child.weight.data = new_weight - def _check_tensors_on_meta_device(self): - for tensor in self.model.state_dict().values(): - if tensor.device.type == 'meta': - raise ValueError( - "The serialized model contains tensors on the meta device," - " indicating that some tensors were not loaded properly." 
- " Please check that the parameters of the model being" - " specified match that of the serialized model, such as" - " its quantization.") +def init_tensorizer_model(tensorizer_config: TensorizerConfig, + vllm_config: VllmConfig) -> nn.Module: + assert tensorizer_config.hf_config is not None + model_args = tensorizer_config.hf_config + model_args.torch_dtype = tensorizer_config.dtype + assert tensorizer_config.model_class is not None + # TODO: Do we need to consider old-style model class? + with meta_tensor_mode(), set_current_vllm_config(vllm_config, + check_compile=True): + return tensorizer_config.model_class(vllm_config=vllm_config) - def deserialize(self): - """ - Deserialize the model using the TensorDeserializer. This method is - specifically for vLLM models using tensorizer's plaid_mode. - The deserializer makes use of tensorizer_args.stream_params - to configure the behavior of the stream when loading tensors from a - serialized model. The deserializer_params are used to configure the - behavior of the TensorDeserializer when loading tensors themselves. - Documentation on these params can be found in TensorizerArgs - - Returns: - nn.Module: The deserialized model. 
- """ - before_mem = get_mem_usage() - start = time.perf_counter() - with _read_stream( - self.tensorizer_config.tensorizer_uri, - **self.tensorizer_args.stream_params - ) as stream, TensorDeserializer( +def deserialize_tensorizer_model(model: nn.Module, + tensorizer_config: TensorizerConfig) -> None: + tensorizer_args = tensorizer_config._construct_tensorizer_args() + before_mem = get_mem_usage() + start = time.perf_counter() + with _read_stream( + tensorizer_config.tensorizer_uri, + **tensorizer_args.stream_params) as stream, TensorDeserializer( stream, - dtype=self.tensorizer_config.dtype, + dtype=tensorizer_config.dtype, device=f'cuda:{torch.cuda.current_device()}', - **self.tensorizer_args.deserializer_params) as deserializer: - deserializer.load_into_module(self.model) - end = time.perf_counter() + **tensorizer_args.deserializer_params) as deserializer: + deserializer.load_into_module(model) + end = time.perf_counter() - total_bytes_str = convert_bytes(deserializer.total_tensor_bytes) - duration = end - start - per_second = convert_bytes(deserializer.total_tensor_bytes / duration) - after_mem = get_mem_usage() - deserializer.close() - logger.info("Deserialized %s in %0.2fs, %s/s", total_bytes_str, - end - start, per_second) - logger.info("Memory usage before: %s", before_mem) - logger.info("Memory usage after: %s", after_mem) + total_bytes_str = convert_bytes(deserializer.total_tensor_bytes) + duration = end - start + per_second = convert_bytes(deserializer.total_tensor_bytes / duration) + after_mem = get_mem_usage() + deserializer.close() + logger.info("Deserialized %s in %0.2fs, %s/s", total_bytes_str, + end - start, per_second) + logger.info("Memory usage before: %s", before_mem) + logger.info("Memory usage after: %s", after_mem) - self._check_tensors_on_meta_device() - self._resize_lora_embeddings() - del self.model.vllm_tensorized_marker - return self.model.eval() + _check_tensors_on_meta_device(model) + _resize_lora_embeddings(model) + del 
model.vllm_tensorized_marker def tensorizer_weights_iterator( @@ -376,7 +436,7 @@ def tensorizer_weights_iterator( "loading on vLLM, as tensorizer is forced to load to CPU. " "Consider deserializing a vLLM model instead for faster " "load times. See the " - "examples/other/tensorize_vllm_model.py example script " + "examples/others/tensorize_vllm_model.py example script " "for serializing vLLM models.") deserializer_args = tensorizer_args.deserializer_params @@ -467,8 +527,73 @@ def tensorize_vllm_model(engine_args: EngineArgs, ) as stream: stream.write(encryption_params.key) - engine = LLMEngine.from_engine_args(engine_args) - engine.model_executor.collective_rpc( - "save_tensorized_model", - kwargs=dict(tensorizer_config=tensorizer_config), - ) + from vllm import LLMEngine + from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine + + if not envs.VLLM_USE_V1: + engine = LLMEngine.from_engine_args(engine_args) + engine.model_executor.collective_rpc( + "save_tensorized_model", + kwargs=dict(tensorizer_config=tensorizer_config), + ) + else: + engine = V1LLMEngine.from_vllm_config(engine_config) + engine.collective_rpc( + "save_tensorized_model", + kwargs=dict(tensorizer_config=tensorizer_config), + ) + + +def tensorize_lora_adapter(lora_path: str, + tensorizer_config: TensorizerConfig): + """ + Uses tensorizer to serialize a LoRA adapter. Assumes that the files + needed to load a LoRA adapter are a safetensors-format file called + adapter_model.safetensors and a json config file called adapter_config.json. 
+ + Serializes the files in the tensorizer_config.lora_dir + """ + import safetensors + + from vllm.lora.utils import get_adapter_absolute_path + + lora_dir = get_adapter_absolute_path(lora_path) + + tensor_path = config_path = "" + + for file in os.listdir(lora_dir): + if file.startswith("adapter_model"): + tensor_path = lora_dir + "/" + file + if file.startswith("adapter_config"): + config_path = lora_dir + "/" + file + if tensor_path and config_path: + break + + if tensor_path.endswith(".safetensors"): + tensors = safetensors.torch.load_file(tensor_path) + elif tensor_path.endswith(".bin"): + tensors = torch.load(tensor_path) + else: + raise ValueError(f"Unsupported file: {tensor_path}") + + with open(config_path) as f: + config = json.load(f) + + tensorizer_args = tensorizer_config._construct_tensorizer_args() + + with open_stream(f"{tensorizer_config.lora_dir}/adapter_config.json", + mode="wb+", + **tensorizer_args.stream_params) as f: + + f.write(json.dumps(config).encode("utf-8")) + + lora_uri = (f"{tensorizer_config.lora_dir}" + f"/adapter_model.tensors") + with open_stream(lora_uri, mode="wb+", + **tensorizer_args.stream_params) as f: + serializer = TensorSerializer(f) + serializer.write_state_dict(tensors) + serializer.close() + + logger.info("Successfully serialized LoRA files to %s", + str(tensorizer_config.lora_dir)) diff --git a/vllm/model_executor/model_loader/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer_loader.py index 4107e741fd8fe..1923e040af381 100644 --- a/vllm/model_executor/model_loader/tensorizer_loader.py +++ b/vllm/model_executor/model_loader/tensorizer_loader.py @@ -2,6 +2,7 @@ # ruff: noqa: SIM117 import copy from collections.abc import Generator +from typing import Union import torch from torch import nn @@ -10,8 +11,8 @@ from vllm.config import LoadConfig, ModelConfig, ParallelConfig, VllmConfig from vllm.logger import init_logger from vllm.model_executor.model_loader.base_loader import BaseModelLoader from 
vllm.model_executor.model_loader.tensorizer import ( - TensorizerConfig, is_vllm_tensorized, load_with_tensorizer, - serialize_vllm_model, tensorizer_weights_iterator) + TensorizerConfig, deserialize_tensorizer_model, init_tensorizer_model, + is_vllm_tensorized, serialize_vllm_model, tensorizer_weights_iterator) from vllm.model_executor.model_loader.utils import (get_model_architecture, initialize_model, set_default_torch_dtype) @@ -47,7 +48,7 @@ class TensorizerLoader(BaseModelLoader): """Load a serialized model with tensorizer to the CPU. This is only necessary when the model isn't vLLM-tensorized (see - examples/other/tensorize_vllm_model.py) This should still + examples/others/tensorize_vllm_model.py) This should still be faster than default HuggingFace loading, but will be slower than loading a vLLM-tensorized model. """ @@ -60,40 +61,36 @@ class TensorizerLoader(BaseModelLoader): model.load_weights(self._get_weights_iterator()) return model.eval() - def _load_model_serialized( - self, - vllm_config: VllmConfig, - ) -> nn.Module: - """Load a serialized model with tensorizer. - - Expects a vLLM-tensorized model. 
See the - examples/other/tensorize_vllm_model.py example script - for serializing vLLM models.""" - - device_config = vllm_config.device_config - model_config = vllm_config.model_config - - with set_default_torch_dtype(model_config.dtype): - with torch.device(device_config.device): - model_class = get_model_architecture(model_config)[0] - - tensorizer_config = copy.copy(self.tensorizer_config) - tensorizer_config.model_class = model_class - tensorizer_config.hf_config = model_config.hf_config - tensorizer_config.dtype = model_config.dtype - - model = load_with_tensorizer(tensorizer_config, - vllm_config=vllm_config) - return model.eval() - def download_model(self, model_config: ModelConfig) -> None: self.tensorizer_config.verify_with_model_config(model_config) with self.tensorizer_config.open_stream(): pass - def load_model(self, vllm_config: VllmConfig) -> nn.Module: - model_config = vllm_config.model_config + def _patch_tensorizer_config( + self, model_config: ModelConfig) -> TensorizerConfig: + model_class = get_model_architecture(model_config)[0] + tensorizer_config = copy.copy(self.tensorizer_config) + tensorizer_config.model_class = model_class + tensorizer_config.hf_config = model_config.hf_config + tensorizer_config.dtype = model_config.dtype + return tensorizer_config + + def load_weights(self, model: nn.Module, + model_config: ModelConfig) -> None: + """Load serialized model weights with tensorizer. + + Expects a vLLM-tensorized model. 
See the + examples/others/tensorize_vllm_model.py example script + for serializing vLLM models.""" + if is_vllm_tensorized(self.tensorizer_config): + tensorizer_config = self._patch_tensorizer_config(model_config) + deserialize_tensorizer_model(model, tensorizer_config) + else: + model.load_weights(self._get_weights_iterator()) + + def load_model(self, vllm_config: VllmConfig, + model_config: ModelConfig) -> nn.Module: parallel_config = vllm_config.parallel_config self._verify_config(model_config, parallel_config) @@ -105,14 +102,20 @@ class TensorizerLoader(BaseModelLoader): get_tensor_model_parallel_rank()) if is_vllm_tensorized(self.tensorizer_config): - return self._load_model_serialized(vllm_config=vllm_config) + tensorizer_config = self._patch_tensorizer_config(model_config) + model = init_tensorizer_model(tensorizer_config=tensorizer_config, + vllm_config=vllm_config) + self.load_weights(model, model_config) + return model return self._load_model_serialized_cpu(vllm_config=vllm_config) @staticmethod def save_model( model: torch.nn.Module, - tensorizer_config: TensorizerConfig, + tensorizer_config: Union[TensorizerConfig, dict], ) -> None: + if isinstance(tensorizer_config, dict): + tensorizer_config = TensorizerConfig(**tensorizer_config) serialize_vllm_model( model=model, tensorizer_config=tensorizer_config, diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 68b1f1ad74d32..9c8d647a24fea 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -42,9 +42,11 @@ def initialize_model( *, prefix: str = "", model_class: Optional[type[nn.Module]] = None, + model_config: Optional[ModelConfig] = None, ) -> nn.Module: """Initialize a model with the given configurations.""" - model_config = vllm_config.model_config + if model_config is None: + model_config = vllm_config.model_config if model_class is None: model_class, _ = get_model_architecture(model_config) @@ 
-223,17 +225,16 @@ def get_model_architecture( "fp8", "compressed-tensors", "gptq_marlin", "awq_marlin", "quark" ] - if (model_config.quantization is not None - and model_config.quantization not in mixtral_supported - and "MixtralForCausalLM" in architectures): - architectures = ["QuantMixtralForCausalLM"] - vllm_supported_archs = ModelRegistry.get_supported_archs() vllm_not_supported = not any(arch in vllm_supported_archs for arch in architectures) if (model_config.model_impl == ModelImpl.TRANSFORMERS or model_config.model_impl != ModelImpl.VLLM and vllm_not_supported): architectures = resolve_transformers_arch(model_config, architectures) + elif (model_config.quantization is not None + and model_config.quantization not in mixtral_supported + and "MixtralForCausalLM" in architectures): + architectures = ["QuantMixtralForCausalLM"] model_cls, arch = ModelRegistry.resolve_model_cls(architectures) if model_config.task == "embed": diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index a1cf43328bab1..7a9a68be8805e 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -319,6 +319,7 @@ def download_safetensors_index_file_from_hf( Args: model_name_or_path (str): The model name or path. + index_file (str): The safetensors index file name cache_dir (Optional[str]): The cache directory to store the model weights. If None, will use HF defaults. revision (Optional[str]): The revision of the model. @@ -337,10 +338,10 @@ def download_safetensors_index_file_from_hf( ) # If file not found on remote or locally, we should not fail since # only some models will have index_file. 
- except huggingface_hub.utils.EntryNotFoundError: - logger.info("No %s found in remote.", index_file) except huggingface_hub.utils.LocalEntryNotFoundError: logger.info("No %s found in local cache.", index_file) + except huggingface_hub.utils.EntryNotFoundError: + logger.info("No %s found in remote.", index_file) # For models like Mistral-7B-v0.3, there are both sharded @@ -634,7 +635,7 @@ def row_parallel_weight_loader(param: torch.Tensor, return default_weight_loader(param, loaded_weight) -LoaderFunction = Callable[[torch.Tensor, torch.Tensor], torch.Tensor] +LoaderFunction = Callable[[torch.Tensor, torch.Tensor], None] def sharded_weight_loader(shard_axis: int) -> LoaderFunction: @@ -695,7 +696,7 @@ def initialize_dummy_weights( # Note: We avoid using torch.rank_like as it doesn't currently # support the generator argument. param.copy_((high - low) * - torch.rand(*param.shape, + torch.rand(param.shape, generator=generator, dtype=param.dtype, layout=param.layout, diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py index aefd6c9737552..2e2a18abd03dd 100644 --- a/vllm/model_executor/models/aimv2.py +++ b/vllm/model_executor/models/aimv2.py @@ -2,16 +2,23 @@ # A modified implementation of the AIMv2 Transformer # inserted here also the image tokenizer used by Ovis2 +from collections.abc import Iterable from typing import Optional import torch import torch.nn as nn -from torch.nn import functional as F +from vllm.attention.layer import MultiHeadAttention +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed.utils import divide +from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import ReplicatedLinear +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) from vllm.model_executor.layers.quantization.base_config import ( 
QuantizationConfig) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.transformers_utils.configs.ovis import AIMv2Config @@ -24,29 +31,27 @@ class AIMv2SwiGLUFFN(nn.Module): in_features = config.hidden_size bias = config.use_bias - # TODO(Isotr0py): investigate if we can add TP to visual tokenizer - self.fc1 = ReplicatedLinear(in_features, - hidden_features, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.fc1") - self.fc2 = ReplicatedLinear(hidden_features, - in_features, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.fc2") - self.fc3 = ReplicatedLinear(in_features, - hidden_features, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.fc3") + self.fc13 = MergedColumnParallelLinear( + in_features, + [hidden_features] * 2, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.fc13", + ) + self.fc2 = RowParallelLinear( + input_size=hidden_features, + output_size=in_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.fc2", + ) + self.act_fn = SiluAndMul() def forward(self, x: torch.Tensor) -> torch.Tensor: - x_parallel, _ = self.fc1(x) - gate, _ = self.fc3(x) - x_parallel = F.silu(x_parallel) * gate - out, _ = self.fc2(x_parallel) - return out + x, _ = self.fc13(x) + x = self.act_fn(x) + x, _ = self.fc2(x) + return x class AIMv2PatchEmbed(nn.Module): @@ -90,39 +95,45 @@ class AIMv2Attention(nn.Module): def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig, prefix: str): super().__init__() - dim = config.hidden_size - - # TODO(Isotr0py): investigate if we can add TP to visual tokenizer + self.config = config + self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads - self.qkv = ReplicatedLinear(dim, dim * 3, bias=config.qkv_bias) - # self.qkv = QKVParallelLinear( - # hidden_size=dim, - # head_size=dim // config.num_attention_heads, - # total_num_heads=config.num_attention_heads, - # bias=config.qkv_bias, - # 
quant_config=quant_config, - # prefix=f"{prefix}.qkv") - self.proj = ReplicatedLinear(dim, dim, bias=config.use_bias) - # self.proj = RowParallelLinear(input_size=dim, - # output_size=dim, - # bias = config.use_bias, - # quant_config=quant_config, - # prefix=f"{prefix}.proj") + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + "embed_dim must be divisible by num_heads " + f"(got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads}).") + self.scale = self.head_dim**-0.5 - def forward( # todo might implement multiple attn implementations - self, - x: torch.Tensor, - mask: Optional[torch.Tensor] = None) -> torch.Tensor: - B, N, C = x.shape + self.qkv = QKVParallelLinear( + hidden_size=self.embed_dim, + head_size=self.head_dim, + total_num_heads=self.num_heads, + bias=config.qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv", + ) + + self.proj = RowParallelLinear( + input_size=self.embed_dim, + output_size=self.embed_dim, + bias=config.use_bias, + quant_config=quant_config, + prefix=f"{prefix}.proj", + ) + + self.tp_size = get_tensor_model_parallel_world_size() + self.num_heads_per_partition = divide(self.num_heads, self.tp_size) + + self.attn = MultiHeadAttention(self.num_heads_per_partition, + self.head_dim, self.scale) + + def forward(self, x: torch.Tensor) -> torch.Tensor: qkv, _ = self.qkv(x) + q, k, v = qkv.chunk(3, dim=-1) - qkv = qkv.reshape(B, N, 3, self.num_heads, - C // self.num_heads).permute(2, 0, 3, 1, 4) - - q, k, v = qkv.unbind(0) - - x = F.scaled_dot_product_attention(q, k, v, attn_mask=mask) - x = x.transpose(1, 2).contiguous().reshape(B, N, C) + x = self.attn(q, k, v) x, _ = self.proj(x) return x @@ -141,37 +152,40 @@ class AIMv2Block(nn.Module): prefix=f"{prefix}.mlp") self.norm_2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - def forward(self, - x: torch.Tensor, - mask: Optional[torch.Tensor] = None) -> torch.Tensor: - x = x + 
self.attn(self.norm_1.forward_native(x), mask) + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + self.attn(self.norm_1.forward_native(x)) x = x + self.mlp(self.norm_2.forward_native(x)) return x class AIMv2Transformer(nn.Module): - def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig, - prefix: str): + def __init__( + self, + config: AIMv2Config, + quant_config: QuantizationConfig, + *, + require_post_norm: Optional[bool] = None, + prefix: str = "", + ): super().__init__() self.blocks = nn.ModuleList([ AIMv2Block(config, quant_config, prefix=f"{prefix}.blocks.{i}") for i in range(config.num_hidden_layers) ]) - self.post_trunk_norm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) + if require_post_norm: + self.post_trunk_norm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + else: + self.post_trunk_norm = None - def forward( - self, - tokens: torch.Tensor, - mask: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + def forward(self, tokens: torch.Tensor) -> torch.Tensor: # they take the -1 as the ref embeddings, like a clip skip for block in self.blocks: - tokens = block(tokens, mask) - # NO NORM IN THE OG IMPLEMENTATION - # tokens = self.post_trunk_norm(tokens) + tokens = block(tokens) + if self.post_trunk_norm is not None: + tokens = self.post_trunk_norm(tokens) return tokens @@ -180,20 +194,52 @@ class AIMv2Model(torch.nn.Module): def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig, + *, + require_post_norm: Optional[bool] = None, prefix: str = ""): super().__init__() self.preprocessor = AIMv2ViTPreprocessor(config) self.trunk = AIMv2Transformer(config, quant_config=quant_config, + require_post_norm=require_post_norm, prefix=f"{prefix}.trunk") - def forward( - self, - pixel_values: torch.Tensor, - mask: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: x = self.preprocessor(pixel_values) - x = self.trunk(x, mask) + x = 
self.trunk(x) return x + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".fc13", ".fc1", 0), + (".fc13", ".fc3", 1), + ] + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + + for name, loaded_weight in weights: + # post_trunk_norm is optional in AIMv2Transformer + if (name.startswith("trunk.post_trunk_norm") + and self.trunk.post_trunk_norm is None): + continue + + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 077e36176430a..bcff6eb3fd315 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -42,7 +42,8 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, row_parallel_weight_loader) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors @@ -384,7 +385,7 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP, lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config - + self.tp_size = 
get_tensor_model_parallel_world_size() self.quant_config = quant_config self.model = BaiChuanModel(vllm_config=vllm_config, prefix=prefix, @@ -438,8 +439,10 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP, is_baichuan2 = self.config.vocab_size == 125696 if is_baichuan2: loaded_weight = torch.nn.functional.normalize(loaded_weight) - - default_weight_loader(param, loaded_weight) + if self.tp_size > 1: + row_parallel_weight_loader(param, loaded_weight) + else: + default_weight_loader(param, loaded_weight) class BaichuanForCausalLM(BaiChuanBaseForCausalLM): diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 0c6593bbe3a10..0b1d0f1034083 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -16,7 +16,7 @@ from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import (CrossEncodingPooler, Pooler, +from vllm.model_executor.layers.pooler import (ClassifierPooler, Pooler, PoolingType) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -470,8 +470,8 @@ class BertForSequenceClassification(nn.Module, SupportsCrossEncoding, embedding_class=BertEmbedding, add_pooling_layer=True) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self._pooler = CrossEncodingPooler(config, self.classifier, - self.bert.pooler) + self._pooler = ClassifierPooler(vllm_config.model_config, + self.classifier, self.bert.pooler) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index af6deb3bf072e..8a387d71f1cb0 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -1,5 +1,6 
@@ # SPDX-License-Identifier: Apache-2.0 from collections.abc import Iterable +from copy import deepcopy from typing import Optional import torch @@ -10,6 +11,7 @@ from vllm.attention import Attention, AttentionType from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.logger import init_logger from vllm.model_executor.layers.activation import (get_act_and_mul_fn, get_act_fn) from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -27,6 +29,8 @@ from vllm.model_executor.models.interfaces import SupportsQuant from vllm.model_executor.models.utils import WeightsMapper from vllm.sequence import IntermediateTensors +logger = init_logger(__name__) + class BertWithRopeEmbedding(nn.Module): @@ -513,10 +517,11 @@ class NomicBertModel(BertWithRope): head_dim = config.hidden_size // config.num_attention_heads rotary_emb_dim = head_dim * config.rotary_emb_fraction + max_trained_positions = getattr(config, "max_trained_positions", 2048) config.rotary_kwargs = { "head_size": head_dim, "rotary_dim": rotary_emb_dim, - "max_position": config.max_trained_positions, + "max_position": max_trained_positions, "base": getattr(config, "rope_theta", config.rotary_emb_base), "rope_scaling": getattr(config, "rope_scaling", None) } @@ -525,8 +530,52 @@ class NomicBertModel(BertWithRope): # than max_trained_positions 2048, the results are consistent # with SentenceTransformer. # The context extension uses vllm style rope_theta and rope_scaling. - # See #17785 + # See #17785 #18755 + if (not vllm_config.model_config.hf_overrides + and vllm_config.model_config.original_max_model_len is None): + # Default + # Reset max_model_len to max_trained_positions. + # nomic-embed-text-v2-moe the length is set to 512 + # by sentence_bert_config.json. 
+ max_model_len_before = vllm_config.model_config.max_model_len + max_model_len = min(vllm_config.model_config.max_model_len, + max_trained_positions) + vllm_config.recalculate_max_model_len(max_model_len) + logger.warning( + "Nomic context extension is disabled. " + "Changing max_model_len from %s to %s. " + "To enable context extension, see: " + "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/context_extension.html", + max_model_len_before, vllm_config.model_config.max_model_len) + else: + # We need to re-verify max_model_len to avoid lengths + # greater than position_embedding. + model_config = vllm_config.model_config + hf_text_config = model_config.hf_text_config + + if isinstance(model_config.hf_overrides, dict): + # hf_overrides_kw + max_model_len = model_config.hf_overrides.get( + "max_model_len", vllm_config.model_config.max_model_len) + else: + # hf_overrides_fn + # This might be overridden by sentence_bert_config.json. + max_model_len = vllm_config.model_config.max_model_len + + # reset hf_text_config for recalculate_max_model_len. + if hasattr(hf_text_config, "max_model_len"): + delattr(hf_text_config, "max_model_len") + hf_text_config.max_position_embeddings = max_trained_positions + hf_text_config.rope_scaling = config.rotary_kwargs["rope_scaling"] + + # The priority of sentence_bert_config.json is higher + # than max_position_embeddings + encoder_config = deepcopy(model_config.encoder_config) + encoder_config.pop("max_seq_length", None) + model_config.encoder_config = encoder_config + + vllm_config.recalculate_max_model_len(max_model_len) return config diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 2ff7e394a4163..db0dd2051d527 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -681,9 +681,8 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, batch. pixel_values: The pixels in each input image. 
- :::{seealso} - {class}`Blip2ImageInputs` - ::: + Info: + [Blip2ImageInputs][] """ if intermediate_tensors is not None: diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index e8f3ae2156e02..9fd528fd79779 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -106,7 +106,6 @@ class CLIPAttention(nn.Module): f"(got `embed_dim`: {self.embed_dim} and `num_heads`:" f" {self.num_heads}).") self.scale = self.head_dim**-0.5 - self.dropout = config.attention_dropout self.qkv_proj = QKVParallelLinear( hidden_size=self.embed_dim, @@ -129,10 +128,6 @@ class CLIPAttention(nn.Module): self.attn = MultiHeadAttention(self.num_heads_per_partition, self.head_dim, self.scale) - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, - self.head_dim).transpose(1, 2).contiguous() - def forward( self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index 6d7b52aba5f91..03ef7bed0edcf 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -19,6 +19,7 @@ from vllm.sequence import IntermediateTensors from .deepseek_v2 import (DeepseekV2DecoderLayer, get_spec_layer_idx_from_weight_name) +from .interfaces import SupportsPP from .utils import maybe_prefix @@ -145,7 +146,7 @@ class DeepSeekMultiTokenPredictor(nn.Module): return logits -class DeepSeekMTP(nn.Module): +class DeepSeekMTP(nn.Module, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 164fa40ffebe5..5c8793f59ffbe 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -210,9 +210,7 @@ class DeepseekVL2MultiModalProcessor( dict(prompt=prompt, **mm_data), mm_kwargs, ) - 
target_dtype = self.info.ctx.model_config.dtype - pixel_values = processed_outputs.pop("pixel_values").to( - target_dtype) + pixel_values = processed_outputs["pixel_values"] # split pixel values into patches corresponding to each image images_spatial_crop = processed_outputs["images_spatial_crop"] patches_per_image = [ diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 4ffd06319684c..838560692bcf5 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -127,8 +127,9 @@ class ExaoneAttention(nn.Module): assert tp_size % self.total_num_kv_heads == 0 self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) # MistralConfig has an optional head_dim introduced by Mistral-Nemo - self.head_dim = getattr(config, "head_dim", - self.hidden_size // self.total_num_heads) + self.head_dim = getattr(config, "head_dim", None) + if self.head_dim is None: + self.head_dim = self.hidden_size // self.total_num_heads self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py new file mode 100644 index 0000000000000..1c0e3911fccee --- /dev/null +++ b/vllm/model_executor/models/falcon_h1.py @@ -0,0 +1,684 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Inference-only FalconH1 model.""" +from collections.abc import Iterable +from typing import Optional + +import torch +from torch import nn +from transformers import FalconH1Config + +from vllm.attention.layer import Attention +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import divide, get_tensor_model_parallel_world_size +from vllm.distributed.parallel_state import get_pp_group +from vllm.forward_context import get_forward_context +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from 
vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.mamba.mamba2_metadata import ( + Mamba2Metadata, prepare_mamba2_metadata) +from vllm.model_executor.layers.mamba.mamba_mixer2 import ( + MambaMixer2, extra_groups_for_head_shards) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.mamba_cache import (MambaCacheManager, + MambaCacheParams) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP, + SupportsV0Only) +from .utils import (PPMissingLayer, is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) + + +class FalconH1MLP(nn.Module): + + def __init__( + self, + config: FalconH1Config, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + input_size=config.hidden_size, + output_sizes=[config.intermediate_size] * 2, + bias=bias, + quant_config=quant_config, + ) + self.down_proj = RowParallelLinear( + input_size=config.intermediate_size, + output_size=config.hidden_size, + bias=bias, + quant_config=quant_config, + ) + self.tp_size = get_tensor_model_parallel_world_size() + self.intermediate_size = config.intermediate_size + self.gate_multiplier, self.down_multiplier = config.mlp_multipliers + if config.hidden_act != "silu": + raise ValueError(f"Unsupported activation: {config.hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + x, _ = self.gate_up_proj(x) + x[:, :self.intermediate_size // self.tp_size] *= self.gate_multiplier + x = self.act_fn(x) + x, _ = self.down_proj(x) + x = x * self.down_multiplier + return x + + +class FalconH1SSMDecoderLayer(nn.Module): + + def __init__( + self, + config: FalconH1Config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.config = config + self.tp_size = get_tensor_model_parallel_world_size() + + self.d_ssm = (int(config.mamba_expand * config.hidden_size) + if config.mamba_d_ssm is None else config.mamba_d_ssm) + + self.mamba = MambaMixer2( + hidden_size=config.hidden_size, + ssm_state_size=config.mamba_d_state, + conv_kernel_size=config.mamba_d_conv, + intermediate_size=self.d_ssm, + use_conv_bias=config.mamba_conv_bias, + use_bias=config.mamba_proj_bias, + n_groups=config.mamba_n_groups, + num_heads=config.mamba_n_heads, + head_dim=config.mamba_d_head, + rms_norm_eps=config.rms_norm_eps, + activation=config.hidden_act, + quant_config=quant_config, + use_rms_norm=config.mamba_rms_norm, + ) + # n_groups is overridden later by `MambaMixer2` + self.groups_time_state_size = self.mamba.n_groups * config.mamba_d_state + self.zxbcdt_multipliers = config.ssm_multipliers + self._init_mup_vector() + + def _init_mup_vector(self): + """ + Non learnable per-block scaling vector composed of element-wise + multipliers applied to each separate contiguous block of the output + of the linear projection (in_proj) before further processing + (gating, convolution, SSM): + + - Z block: [0 : d_ssm] → zxbcdt_multipliers[0] + - X block: [d_ssm : 2 * d_ssm] → zxbcdt_multipliers[1] + - B block: [2 * d_ssm : 2 * d_ssm + G * S] → zxbcdt_multipliers[2] + - C block: [2 * d_ssm + G * S : 2 * d_ssm + 2 * G * S] + → zxbcdt_multipliers[3] + - dt block: [2 * d_ssm + 2 * G * S : end] →
zxbcdt_multipliers[4] + + where: + - d_ssm: Dimension of state-space model latent + - G: Number of groups (n_groups) + - S: SSM state size per group + - All indices are divided by tp_size to support tensor parallelism + """ + vector_shape = (2 * self.d_ssm + 2 * self.groups_time_state_size + + self.config.mamba_n_heads) // self.tp_size + mup_vector = torch.ones(1, vector_shape) + # Z vector 0 -> d_ssm + mup_vector[:, :self.d_ssm // + self.tp_size] *= self.zxbcdt_multipliers[0] + # X vector d_ssm -> 2 * d_ssm + mup_vector[:, + (self.d_ssm // + self.tp_size):(2 * self.d_ssm // + self.tp_size)] *= self.zxbcdt_multipliers[1] + # B vector 2 * d_ssm -> 2 * d_ssm + (n_group * d_state) + mup_vector[ + :, + (2 * self.d_ssm) // + self.tp_size:(2 * self.d_ssm + self.groups_time_state_size) // + self.tp_size, + ] *= self.zxbcdt_multipliers[2] + # C vector 2 * d_ssm + (n_group * d_state) + # -> 2 * d_ssm + 2 * (n_group * d_state) + mup_vector[ + :, + (2 * self.d_ssm + self.groups_time_state_size) // + self.tp_size:(2 * self.d_ssm + 2 * self.groups_time_state_size) // + self.tp_size, + ] *= self.zxbcdt_multipliers[3] + # dt vector 2 * d_ssm + 2 * (n_group * d_state) + # -> 2 * d_ssm + 2 * (n_group * d_state) + n_heads + mup_vector[ + :, + (2 * self.d_ssm + 2 * self.groups_time_state_size) // + self.tp_size:, + ] *= self.zxbcdt_multipliers[4] + + self.register_buffer("mup_vector", mup_vector, persistent=False) + + def forward( + self, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + mamba_cache_params: MambaCacheParams, + mamba2_metadata: Mamba2Metadata, + **kwargs, + ): + hidden_states = self.mamba( + hidden_states, + mamba_cache_params, + mamba2_metadata=mamba2_metadata, + mup_vector=self.mup_vector, + ) + return hidden_states, residual + + +class FalconH1AttentionDecoderLayer(nn.Module): + + def __init__( + self, + config: FalconH1Config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + 
) -> None: + super().__init__() + rope_theta = getattr(config, "rope_theta", 1e11) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.hidden_size = config.hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = config.num_attention_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = config.num_key_value_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = (config.hidden_size // self.total_num_heads if getattr( + config, "head_dim", None) is None else config.head_dim) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + if hasattr(config, "partial_rotary_factor"): + rotary_dim = self.head_dim * config.partial_rotary_factor + elif hasattr(config, "attn_rotary_emb"): + rotary_dim = config.attn_rotary_emb # for backward compatibility + else: + rotary_dim = self.head_dim # default + + self.rotary_emb = get_rope( + head_size=self.head_dim, + rotary_dim=rotary_dim, + max_position=max_position_embeddings, + rope_scaling=rope_scaling, + base=rope_theta, + is_neox_style=True, + dtype=None, # see impl of get_rope + ) + + self.qkv_proj = QKVParallelLinear( + config.hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + quant_config=quant_config, + 
prefix=f"{prefix}.qkv_proj", + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + prefix=f"{prefix}.attn", + ) + self.key_multiplier = config.key_multiplier + + def self_attention( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + **kwargs, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + k = k * self.key_multiplier + + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + return output + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + **kwargs, + ): + hidden_states = self.self_attention( + positions=positions, + hidden_states=hidden_states, + ) + return hidden_states, residual + + +class FalconH1ParallelHybrid(nn.Module): + """ + A hybrid decoder layer for FalconH1 where the input is processed + in parallel through both the self-attention branch and the SSM (Mamba) + branch. Their outputs are then summed to produce the final hidden state. + + This layer uses: + - FalconH1AttentionDecoderLayer for the multi-head self-attention branch. + - FalconH1SSMDecoderLayer for the state-space (Mamba) branch. 
+ """ + + def __init__( + self, + config: FalconH1Config, + layer_idx: int, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + # Instantiate the attention branch + self.self_attn = FalconH1AttentionDecoderLayer( + config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + ) + # Instantiate the SSM branch + self.mamba = FalconH1SSMDecoderLayer( + config=config, + cache_config=cache_config, + quant_config=quant_config, + ) + self.ssm_out_multiplier = config.ssm_out_multiplier + self.ssm_in_multiplier = config.ssm_in_multiplier + + self.attention_in_multiplier = config.attention_in_multiplier + self.attn_out_multiplier = config.attention_out_multiplier + + self.feed_forward = FalconH1MLP(config) + + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.pre_ff_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + mamba_cache_params: MambaCacheParams, + mamba2_metadata: Mamba2Metadata, + **kwargs, + ): + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + # Process input through the attention branch. + # FalconH1AttentionDecoderLayer expects positions, hidden_states, + # kv_cache, attn_metadata, and residual. + attn_hidden, _ = self.self_attn( + positions=positions, + hidden_states=hidden_states * self.attention_in_multiplier, + residual=residual, + **kwargs, + ) + + # Process input through the SSM branch. + # FalconH1SSMDecoderLayer expects hidden_states, attn_metadata, + # residual, mamba_cache_params, and sequence_idx. + ssm_hidden, _ = self.mamba( + hidden_states=hidden_states * self.ssm_in_multiplier, + residual=residual, + mamba_cache_params=mamba_cache_params, + mamba2_metadata=mamba2_metadata, + **kwargs, + ) + # Sum the outputs from both branches. 
+ # We assume both branches produce outputs of the same + # dimensionality (config.hidden_size). + hidden_states = (attn_hidden * self.attn_out_multiplier) + ( + ssm_hidden * self.ssm_out_multiplier) + hidden_states = hidden_states + residual + + # feed-forward + residual = hidden_states + hidden_states = self.pre_ff_layernorm(hidden_states) + hidden_states = self.feed_forward(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class FalconH1Model(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config: FalconH1Config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + + self.config = config + lora_vocab = ((lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0) + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + if get_pp_group().is_first_rank: + + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + ) + self.embedding_multiplier = config.embedding_multiplier + else: + self.embed_tokens = PPMissingLayer() + self.embedding_multiplier = 1.0 + + def get_layer(prefix: str): + layer_idx = int(prefix.rsplit(".", 1)[1]) + layer_class = FalconH1ParallelHybrid + return layer_class( + config, + layer_idx, + cache_config, + quant_config=quant_config, + prefix=prefix, + ) + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers") + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) + if get_pp_group().is_last_rank: + self.final_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + else: + self.final_layernorm = PPMissingLayer() + + def 
get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + mamba_cache_params: MambaCacheParams, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + + # pass a sequence index tensor, that is required for + # proper continuous batching computation including + # chunked prefill + attn_metadata = get_forward_context().attn_metadata + mamba2_metadata = prepare_mamba2_metadata( + chunk_size=self.config.mamba_chunk_size, + attn_metadata=attn_metadata, + ) + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds * self.embedding_multiplier + else: + hidden_states = (self.get_input_embeddings(input_ids) * + self.embedding_multiplier) + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + + for i in range(self.start_layer, self.end_layer): + layer = self.layers[i] + layer_mamba_cache_params = mamba_cache_params.at_layer_idx(i) + hidden_states = layer( + positions=positions, + hidden_states=hidden_states, + mamba_cache_params=layer_mamba_cache_params, + mamba2_metadata=mamba2_metadata, + ) + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + }) + hidden_states = self.final_layernorm(hidden_states) + return hidden_states + + +class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, + IsHybrid, SupportsV0Only): + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } + + embedding_modules = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config + self.vllm_config 
= vllm_config + self.model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + lora_config = vllm_config.lora_config + scheduler_config = vllm_config.scheduler_config + assert (not cache_config.enable_prefix_caching + ), "FalconH1 currently does not support prefix caching" + + self.quant_config = vllm_config.quant_config + + super().__init__() + self.config = config + self.scheduler_config = scheduler_config + self.model = FalconH1Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + self.tie_word_embeddings = config.tie_word_embeddings + self.unpadded_vocab_size = config.vocab_size + self.mamba_cache: Optional[MambaCacheManager] = None + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=( + DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else + lora_config.lora_vocab_padding_size), + ) + self.lm_head_multiplier = config.lm_head_multiplier + if self.tie_word_embeddings: + self.lm_head = self.lm_head.tie_weights( + self.model.embed_tokens) + # Used to track and store by the Mamba cache between steps. 
+ + self.logits_processor = LogitsProcessor( + self.unpadded_vocab_size, + config.vocab_size, + scale=config.lm_head_multiplier, + ) + else: + self.lm_head = PPMissingLayer() + + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs, + ): + if self.mamba_cache is None: + self.mamba_cache = MambaCacheManager( + self.vllm_config, + self.lm_head.weight.dtype + if hasattr(self.lm_head, 'weight') else torch.bfloat16, + self.config.num_hidden_layers, + *self._get_mamba_cache_shape(), + ) + mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) + hidden_states = self.model( + input_ids, + positions, + mamba_cache_params, + intermediate_tensors, + inputs_embeds, + ) + + return hidden_states + + def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): + return self.mamba_cache.copy_inputs_before_cuda_graphs( + input_buffers, **kwargs) + + def get_seqlen_agnostic_capture_inputs(self, batch_size: int): + return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) + + def _get_mamba_cache_shape( + self) -> tuple[tuple[int, int], tuple[int, int]]: + world_size = get_tensor_model_parallel_world_size() + hidden_size = self.config.hidden_size + + conv_state_shape, temporal_state_shape = None, None + + intermediate_size = (int(self.config.mamba_expand * + hidden_size) if self.config.mamba_d_ssm + is None else self.config.mamba_d_ssm) + + # if n_groups is not divisible by world_size, need to extend the shards + # to ensure all groups needed by a head is sharded along with it + n_groups = self.config.mamba_n_groups + extra_groups_for_head_shards( + self.config.mamba_n_groups, world_size) + + # - 
heads and n_groups are TP-ed + conv_dim = intermediate_size + 2 * n_groups * self.config.mamba_d_state + conv_state_shape = ( + divide(conv_dim, world_size), + self.config.mamba_d_conv - 1, + ) + + # These are not TP-ed as they depend on A, dt_bias, D + # - they are typically small + # e.g., (h_heads, d_head, d_state) = (128, 64, 128) + temporal_state_shape = ( + divide(self.config.mamba_n_heads, world_size), + self.config.mamba_d_head, + self.config.mamba_d_state, + ) + return conv_state_shape, temporal_state_shape + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + + return logits + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + if "A_log" in name: + name = name.replace("A_log", "A") + + if "mamba" in name: + name = name.replace("mamba", "mamba.mamba") + + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + if self.tie_word_embeddings and "lm_head" in name: + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + + if self.tie_word_embeddings: + loaded_params.add("lm_head.weight") + return loaded_params diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 743542ec8dfad..182cc86d3ca8f 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -504,18 +504,12 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, return next(self.parameters()).dtype def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: - h = w = self.config.vision_config.image_size - expected_dims = (3, h, w) - - def _validate_shape(d: torch.Tensor): - if d.shape != expected_dims: - raise ValueError( - "The expected shape of pixel values per image per batch " - f"is {expected_dims}. You supplied {tuple(d.shape)}.") - - for d in data: - _validate_shape(d) - + image_size = self.config.vision_config.image_size + expected_dims = (3, image_size, image_size) + if data.shape[1:] != expected_dims: + raise ValueError( + "The expected shape of pixel values per image per batch is " + f"{expected_dims}. 
You supplied {tuple(data.shape)}.") return data def _parse_and_validate_image_input( @@ -549,9 +543,7 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, vision_tower: SiglipVisionModel, pixel_values: torch.Tensor, ) -> torch.Tensor: - target_dtype = vision_tower.get_input_embeddings().weight.dtype - image_features = vision_tower(pixel_values.to(dtype=target_dtype)) - return image_features + return vision_tower(pixel_values) def _process_image_input( self, diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 470a7053e1b65..c2c310fca4d94 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -43,7 +43,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP -from .utils import (is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -235,6 +235,35 @@ class GPT2Model(nn.Module): hidden_states = self.ln_f(hidden_states) return hidden_states + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if ".attn.bias" in name or ".attn.masked_bias" in name: + # Skip attention mask. + # NOTE: "c_attn.bias" should not be skipped. + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + # The HF's GPT-2 implementation uses Conv1D instead of Linear. + # Because of this, we need to transpose the weights. + # Note(zhuohan): the logic below might break quantized models. 
+ for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]: + if conv1d_weight_name not in name: + continue + if not name.endswith(".weight"): + continue + loaded_weight = loaded_weight.t() + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + class GPT2LMHeadModel(nn.Module, SupportsPP): @@ -283,32 +312,16 @@ class GPT2LMHeadModel(nn.Module, SupportsPP): def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if ".attn.bias" in name or ".attn.masked_bias" in name: - # Skip attention mask. - # NOTE: "c_attn.bias" should not be skipped. - continue - if not name.startswith("transformer.") and not name.startswith( - "lm_head"): - name = "transformer." + name + loader = AutoWeightsLoader(self) + weights = _add_transformer_prefix(weights) + return loader.load_weights(weights) - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - # The HF's GPT-2 implementation uses Conv1D instead of Linear. - # Because of this, we need to transpose the weights. - # Note(zhuohan): the logic below might break quantized models. - for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]: - if conv1d_weight_name not in name: - continue - if not name.endswith(".weight"): - continue - loaded_weight = loaded_weight.t() - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params +def _add_transformer_prefix( + weights: Iterable[tuple[str, torch.Tensor]] +) -> Iterable[tuple[str, torch.Tensor]]: + for name, tensor in weights: + if not name.startswith('transformer.') and not name.startswith( + "lm_head"): + name = 'transformer.' 
+ name + yield name, tensor diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 6a1d97bd7b69c..c4ae4fc3c0062 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -272,12 +272,6 @@ class GPTBigCodeModel(nn.Module): class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): packed_modules_mapping = {"c_attn": ["c_attn"]} - # LoRA specific attributes - embedding_modules = { - "wte": "input_embeddings", - "lm_head": "output_embeddings", - } - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config @@ -330,8 +324,11 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + skip_prefixes = None + if self.config.tie_word_embeddings: + skip_prefixes = ["lm_head."] loader = AutoWeightsLoader( self, - skip_prefixes=(["lm_head."]), + skip_prefixes=skip_prefixes, ) - return loader.load_weights(weights) \ No newline at end of file + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index c49db653f735a..3524d036db222 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -122,8 +122,9 @@ class GraniteAttention(nn.Module): assert tp_size % self.total_num_kv_heads == 0 self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) # MistralConfig has an optional head_dim introduced by Mistral-Nemo - self.head_dim = getattr(config, "head_dim", - self.hidden_size // self.total_num_heads) + self.head_dim = getattr(config, "head_dim", None) + if self.head_dim is None: + self.head_dim = self.hidden_size // self.total_num_heads self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = config.attention_multiplier diff --git 
a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index 578d31a851a95..bc9e9a3c02064 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -28,7 +28,7 @@ import torch import torch.nn.functional as F from torch import nn -from vllm.attention import Attention, AttentionMetadata +from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size @@ -182,25 +182,20 @@ class Grok1Attention(nn.Module): quant_config=quant_config, logits_soft_cap=attn_logits_soft_cap, prefix=f"{prefix}.attn") + self.attn_multiplier = getattr(self.config, "attn_output_multiplier", + 1.0) if self.config else 1.0 def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: AttentionMetadata, ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - attn_output = self.attn(q, k, v, kv_cache, attn_metadata) + attn_output = self.attn(q, k, v) output, _ = self.o_proj(attn_output) - - # Apply attention output multiplier if specified in config - attn_multiplier = getattr(self.config, "attn_output_multiplier", - None) if self.config else None - if attn_multiplier is not None: - output = output * attn_multiplier + output *= self.attn_multiplier return output @@ -261,8 +256,6 @@ class Grok1DecoderLayer(nn.Module): self, positions: torch.Tensor, hidden_states: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: AttentionMetadata, residual: Optional[torch.Tensor], ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention @@ -276,8 +269,6 @@ class Grok1DecoderLayer(nn.Module): hidden_states = self.attn( positions=positions, hidden_states=hidden_states, - kv_cache=kv_cache, - attn_metadata=attn_metadata, ) # Post 
attention normalization @@ -341,8 +332,6 @@ class Grok1Model(nn.Module): self, input_ids: torch.Tensor, positions: torch.Tensor, - kv_caches: list[torch.Tensor], - attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: @@ -359,9 +348,7 @@ class Grok1Model(nn.Module): for i in range(self.start_layer, self.end_layer): layer = self.layers[i] - hidden_states, residual = layer(positions, hidden_states, - kv_caches[i - self.start_layer], - attn_metadata, residual) + hidden_states, residual = layer(positions, hidden_states, residual) if not get_pp_group().is_last_rank: return IntermediateTensors({ @@ -529,13 +516,10 @@ class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): self, input_ids: torch.Tensor, positions: torch.Tensor, - kv_caches: list[torch.Tensor], - attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors, + hidden_states = self.model(input_ids, positions, intermediate_tensors, inputs_embeds) return hidden_states diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 99c226439ecb8..904f5330c653e 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -25,9 +25,10 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from .intern_vit import InternVisionModel from .internvl import (IMG_CONTEXT, IMG_END, IMG_START, + BaseInternVLDummyInputsBuilder, + BaseInternVLMultiModalProcessor, BaseInternVLProcessingInfo, BaseInternVLProcessor, - InternVLChatModel, InternVLDummyInputsBuilder, - InternVLMultiModalProcessor, build_transform, + InternVLChatModel, build_transform, find_closest_aspect_ratio, get_internvl_target_ratios) @@ 
-430,8 +431,8 @@ class H2OVLProcessingInfo(BaseInternVLProcessingInfo): ) -class H2OVLMultiModalProcessor(InternVLMultiModalProcessor[H2OVLProcessingInfo] - ): +class H2OVLMultiModalProcessor( + BaseInternVLMultiModalProcessor[H2OVLProcessingInfo]): def _get_prompt_updates( self, @@ -514,7 +515,7 @@ class H2OVLMultiModalProcessor(InternVLMultiModalProcessor[H2OVLProcessingInfo] @MULTIMODAL_REGISTRY.register_processor( H2OVLMultiModalProcessor, info=H2OVLProcessingInfo, - dummy_inputs=InternVLDummyInputsBuilder) + dummy_inputs=BaseInternVLDummyInputsBuilder) class H2OVLChatModel(InternVLChatModel): def _init_vision_model( diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 8f33a3e29c60b..8be8841c1f6c9 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -226,9 +226,11 @@ class SupportsPP(Protocol): intermediate_tensors: Optional["IntermediateTensors"], ) -> Union[Tensor, "IntermediateTensors"]: """ - Accept {class}`IntermediateTensors` when PP rank > 0. + Accept [`IntermediateTensors`][vllm.sequence.IntermediateTensors] when + PP rank > 0. - Return {class}`IntermediateTensors` only for the last PP rank. + Return [`IntermediateTensors`][vllm.sequence.IntermediateTensors] only + for the last PP rank. """ ... 
diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index d9d9002bd5baa..538e9de4f78fc 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -415,6 +415,10 @@ class InternVisionEncoder(nn.Module): class InternVisionModel(nn.Module): + packed_modules_mapping = { + "qkv": ["qkv"], + } + def __init__( self, config: PretrainedConfig, diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 66e78fcc4e80c..c37d3afb4e440 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -8,8 +8,9 @@ # -------------------------------------------------------- from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import Literal, Optional, TypedDict, TypeVar, Union +from typing import Any, Literal, Optional, TypedDict, TypeVar, Union +import numpy.typing as npt import torch import torch.nn as nn import torchvision.transforms as T @@ -21,8 +22,10 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.models.intern_vit import (InternVisionModel, InternVisionPatchModel) +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargs, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, @@ -34,7 +37,8 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.transformers_utils.tokenizer import AnyTokenizer -from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP +from 
.interfaces import (MultiModalEmbeddings, SupportsLoRA, + SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -73,11 +77,38 @@ InternVLImageInputs = Union[InternVLImagePixelInputs, InternVLImageEmbeddingInputs] +class InternVLVideoPixelInputs(TypedDict): + type: Literal["pixel_values_videos"] + pixel_values_flat: torch.Tensor + """ + Shape: + `(batch_size * num_video * num_frames, num_channels, height, width)` + """ + + num_patches: torch.Tensor + """Shape: `(batch_size * num_images)`""" + + +class InternVLVideoEmbeddingInputs(TypedDict): + type: Literal["video_embeds"] + data: Union[torch.Tensor, list[torch.Tensor]] + """ + A tensor of shape `(num_videos, total_video_feature_size, hidden_size)` + or a list of tensors of shape `(total_video_feature_size, hidden_size)` + + `hidden_size` must match the hidden size of language model backbone. + """ + + +InternVLVideoInputs = Union[InternVLVideoPixelInputs, + InternVLVideoEmbeddingInputs] + + # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B def build_transform(input_size: int): MEAN, STD = IMAGENET_MEAN, IMAGENET_STD return T.Compose([ - T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), + T.Lambda(lambda img: convert_image_mode(img, 'RGB')), T.Resize((input_size, input_size), interpolation=T.InterpolationMode.BICUBIC), T.ToTensor(), @@ -230,6 +261,33 @@ def image_to_pixel_values_internvl( return pixel_values +# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B +def video_to_pixel_values_internvl( + video: npt.NDArray, + *, + input_size: int, + min_num: int, + max_num: int, + use_thumbnail: bool, +) -> torch.Tensor: + target_ratios = get_internvl_target_ratios(min_num, max_num) + + transform = build_transform(input_size=input_size) + frames_list = list[Image.Image]() + for frame in video: + pil_frame = dynamic_preprocess_internvl( + Image.fromarray(frame, mode="RGB"), + 
target_ratios=target_ratios, + image_size=input_size, + use_thumbnail=use_thumbnail, + ) + assert len(pil_frame) == 1 + frames_list.extend(pil_frame) + + pixel_values = torch.stack([transform(image) for image in frames_list]) + return pixel_values + + class BaseInternVLProcessor(ABC): """ This model doesn't define its own HF processor, @@ -374,24 +432,14 @@ class BaseInternVLProcessor(ABC): ) for image in images ] - def __call__( + def _preprocess_image( self, - text: Optional[Union[str, list[str]]] = None, - images: Optional[Union[Image.Image, list[Image.Image]]] = None, + text: list[str], + images: list[Image.Image], min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - ) -> Mapping[str, NestedTensors]: - if text is None: - text = [] - if not isinstance(text, list): - text = [text] - if images is None: - images = [] - if not isinstance(images, list): - images = [images] - + ) -> tuple[list[str], dict[str, torch.Tensor]]: if len(images) == 0: image_inputs = {} else: @@ -414,6 +462,34 @@ class BaseInternVLProcessor(ABC): image_repl = self.get_image_repl(feature_size, num_patches) text = [t.replace('<image>', image_repl.full, 1) for t in text] + return text, image_inputs + + def _make_batch_input(self, + input_item: Optional[Union[Any, list[Any]]] = None): + if input_item is None: + input_item = [] + if not isinstance(input_item, list): + input_item = [input_item] + return input_item + + def __call__( + self, + text: Optional[Union[str, list[str]]] = None, + images: Optional[Union[Image.Image, list[Image.Image]]] = None, + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + ) -> Mapping[str, NestedTensors]: + text, images = [self._make_batch_input(x) for x in (text, images)] + + text, image_inputs = 
self._preprocess_image( + text=text, + images=images, + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) text_inputs = self.tokenizer(text) @@ -424,11 +500,133 @@ class BaseInternVLProcessor(ABC): class InternVLProcessor(BaseInternVLProcessor): + """ + HF Processor for InternVLChatModel with extended video processing logic. + + Code for video processing is adapted from video example: + https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers + """ + + def __init__( + self, + config: PretrainedConfig, + tokenizer: AnyTokenizer, + *, + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + video_token: Optional[str] = None, + ) -> None: + super().__init__( + config=config, + tokenizer=tokenizer, + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) + # add extra video token for video processing + self.video_token = video_token @property def image_token_id(self) -> int: return self.tokenizer.get_vocab()[IMG_CONTEXT] + @property + def video_token_id(self) -> Optional[int]: + if self.video_token is None: + return None + return self.tokenizer.get_vocab().get(self.video_token, None) + + @property + def supports_video(self) -> bool: + return self.video_token_id is not None + + def _videos_to_pixel_values_lst( + self, + videos: list[npt.NDArray], + dynamic_image_size: Optional[bool] = None, + ) -> list[torch.Tensor]: + min_num, max_num = self.resolve_min_max_num( + min_dynamic_patch=1, + max_dynamic_patch=1, + dynamic_image_size=dynamic_image_size, + use_thumbnail=False, # Applied in image_to_pixel_values + ) + + return [ + video_to_pixel_values_internvl( + video, + input_size=self.image_size, + min_num=min_num, + max_num=max_num, + use_thumbnail=False, + ) for video in videos + ] + + def _preprocess_video( + self, + text: list[str], + videos: 
list[npt.NDArray], + dynamic_image_size: Optional[bool] = None, + ): + if len(videos) == 0 or not self.supports_video: + video_inputs = {} + else: + pixel_values_lst_video = self._videos_to_pixel_values_lst( + videos, + dynamic_image_size=dynamic_image_size, + ) + video_inputs: dict[str, NestedTensors] = { + "pixel_values_flat_video": + torch.cat(pixel_values_lst_video), + "video_num_patches": + torch.tensor([len(item) for item in pixel_values_lst_video]), + } + + for pixel_values in pixel_values_lst_video: + num_patches = pixel_values.shape[0] + + video_repl = self.get_video_repl(self.num_image_token, + num_patches, self.video_token) + text = [t.replace('<video>', video_repl.full, 1) for t in text] + return text, video_inputs + + def __call__( + self, + text: Optional[Union[str, list[str]]] = None, + images: Optional[Union[Image.Image, list[Image.Image]]] = None, + videos: Optional[Union[npt.NDArray, list[npt.NDArray]]] = None, + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + ) -> Mapping[str, NestedTensors]: + text, images, videos = [ + self._make_batch_input(x) for x in (text, images, videos) + ] + + text, image_inputs = self._preprocess_image( + text=text, + images=images, + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) + + text, video_inputs = self._preprocess_video( + text=text, + videos=videos, + dynamic_image_size=dynamic_image_size, + ) + + text_inputs = self.tokenizer(text) + + return { + **BatchEncoding(text_inputs, tensor_type=return_tensors), + **image_inputs, + **video_inputs, + } + def get_image_repl( self, feature_size: int, @@ -439,8 +637,24 @@ class InternVLProcessor(BaseInternVLProcessor): return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT) + def get_video_repl( + self, + feature_size: int, + num_patches: Optional[int] = 
None, + video_context_token: str = IMG_CONTEXT, + ) -> PromptUpdateDetails[str]: + repl_features = video_context_token * self.num_image_token + repl_features_with_sep = IMG_START + repl_features + IMG_END + # num_patches is equal to num_frames + repl_full = ''.join([ + f'Frame{i+1}: {repl_features_with_sep}' for i in range(num_patches) + ]) + + return PromptUpdateDetails.select_text(repl_full, video_context_token) + class BaseInternVLProcessingInfo(BaseProcessingInfo): + """Basic image-only ProcessingInfo for InternVL-style models.""" @abstractmethod def get_hf_processor( @@ -496,11 +710,22 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo): return largest_feature_pinpoint + def get_max_image_tokens(self) -> int: + processor = self.get_hf_processor() + target_width, target_height = self.get_image_size_with_most_features() + + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + processor=processor, + ) + _I = TypeVar("_I", bound=BaseInternVLProcessingInfo) -class InternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]): +class BaseInternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]): + """Basic image-only DummyInputsBuilder for InternVL-style models.""" def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: num_images = mm_counts.get("image", 0) @@ -524,7 +749,8 @@ class InternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]): } -class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]): +class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]): + """ Basic image-only MultiModalProcessor for InternVL-style models.""" def _call_hf_processor( self, @@ -613,6 +839,38 @@ class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]): class InternVLProcessingInfo(BaseInternVLProcessingInfo): + """InternVL ProcessingInfo extended for video processing""" + + @property + def supports_video(self): + return self.get_hf_processor().supports_video + + def get_supported_mm_limits(self): + video_limit = 
{"video": None} if self.supports_video else {} + return {**super().get_supported_mm_limits(), **video_limit} + + def get_video_token(self) -> Optional[str]: + text_model_type = self.get_hf_config().get_text_config().model_type + if text_model_type == "qwen2": + return "<|video_pad|>" + return None + + def get_num_frames_with_most_features( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> int: + max_images = mm_counts.get("image", 0) + max_videos = mm_counts.get("video", 0) + + processor = self.get_hf_processor() + + max_image_tokens = self.get_max_image_tokens() * max_images + max_total_frames = (seq_len - + max_image_tokens) // processor.num_image_token + max_frames_per_video = max_total_frames // max(max_videos, 1) + + return max(max_frames_per_video, 1) def get_hf_processor( self, @@ -629,6 +887,8 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo): if dynamic_image_size is not None: kwargs["dynamic_image_size"] = dynamic_image_size + kwargs["video_token"] = self.get_video_token() + return self.ctx.init_processor( InternVLProcessor, config=self.get_hf_config(), @@ -637,11 +897,127 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo): ) +class InternVLDummyInputsBuilder( + BaseInternVLDummyInputsBuilder[InternVLProcessingInfo]): + """InternVL DummyInputsBuilder extended for video support""" + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_videos = mm_counts.get("video", 0) + + return super().get_dummy_text(mm_counts) + "<video>" * num_videos + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + dummy_image = super().get_dummy_mm_data(seq_len=seq_len, + mm_counts=mm_counts) + if self.info.supports_video: + config = self.info.get_hf_config() + image_size: int = config.vision_config.image_size + target_num_frames = \ + self.info.get_num_frames_with_most_features(seq_len, mm_counts) + num_videos = mm_counts.get("video", 0) + dummy_video = { + "video": + 
self._get_dummy_videos(width=image_size, + height=image_size, + num_frames=target_num_frames, + num_videos=num_videos) + } + else: + dummy_video = {} + return {**dummy_image, **dummy_video} + + +class InternVLMultiModalProcessor( + BaseInternVLMultiModalProcessor[InternVLProcessingInfo]): + """InternVL MultiModalProcessor extended for video support""" + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> Mapping[str, NestedTensors]: + processed_outputs = super()._call_hf_processor(prompt, mm_data, + mm_kwargs) + + hf_processor = self.info.get_hf_processor(**mm_kwargs) + if self.info.supports_video and ( + video_token_id := hf_processor.video_token_id) is not None: + processed_outputs["video_token_id"] = torch.tensor(video_token_id) + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: Mapping[str, NestedTensors], + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + image_fields = super()._get_mm_fields_config(hf_inputs, + hf_processor_mm_kwargs) + if self.info.supports_video: + video_num_patches = hf_inputs.get("video_num_patches", + torch.empty(0)) + num_videos = len(video_num_patches) + video_fields = dict( + pixel_values_flat_video=MultiModalFieldConfig.flat_from_sizes( + "video", video_num_patches), + video_num_patches=MultiModalFieldConfig.batched("video"), + video_token_id=MultiModalFieldConfig.shared( + "video", num_videos), + ) + else: + video_fields = {} + + return image_fields | video_fields + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + prompt_repl: list[PromptUpdate] = super()._get_prompt_updates( + mm_items, hf_processor_mm_kwargs, out_mm_kwargs) + + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + if "video_num_patches" in out_mm_kwargs: + video_num_patches 
= out_mm_kwargs["video_num_patches"] + assert isinstance(video_num_patches, torch.Tensor) + video_num_patches = video_num_patches.tolist() + else: + video_num_patches = [] + + def get_video_replacement_internvl(item_idx: int): + feature_size = hf_processor.num_image_token + num_patches = video_num_patches[item_idx] + if num_patches is not None: + assert isinstance(num_patches, int) + + return hf_processor.get_video_repl( + feature_size, + num_patches, + video_context_token=hf_processor.video_token) + + if self.info.supports_video: + prompt_repl.append( + PromptReplacement( + modality="video", + target="<video>", + replacement=get_video_replacement_internvl, + )) + return prompt_repl + + @MULTIMODAL_REGISTRY.register_processor( InternVLMultiModalProcessor, info=InternVLProcessingInfo, dummy_inputs=InternVLDummyInputsBuilder) -class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): +class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, + SupportsLoRA): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() @@ -680,6 +1056,8 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): self.mlp1 = self._init_mlp1(config) self.img_context_token_id = None + self.video_context_token_id = None + self.visual_token_mask = None self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) @@ -824,10 +1202,55 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): raise AssertionError("This line should be unreachable.") + def _parse_and_validate_video_input( + self, **kwargs: object) -> Optional[InternVLVideoPixelInputs]: + pixel_values_flat_video = kwargs.pop("pixel_values_flat_video", None) + video_num_patches = kwargs.pop("video_num_patches", None) + video_embeds = kwargs.pop("image_embeds", None) + + if pixel_values_flat_video is None and video_embeds is None: + return None + + if video_embeds is not None: + if not isinstance(video_embeds, (torch.Tensor, 
list)): + raise ValueError("Incorrect type of video embeddings. " + f"Got type: {type(video_embeds)}") + + return InternVLImageEmbeddingInputs( + type="video_embeds", + data=flatten_bn(video_embeds), + ) + + video_token_id = kwargs["video_token_id"] + assert isinstance(video_token_id, torch.Tensor) + self.video_context_token_id = video_token_id.flatten().unique().item() + + if pixel_values_flat_video is not None: + if not isinstance(pixel_values_flat_video, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values_flat_video)}") + + if not isinstance(video_num_patches, (torch.Tensor, list)): + raise ValueError("Incorrect type of image_num_patches. " + f"Got type: {type(video_num_patches)}") + + pixel_values_flat_video = flatten_bn(pixel_values_flat_video, + concat=True) + video_num_patches = flatten_bn(video_num_patches, concat=True) + + return InternVLVideoPixelInputs( + type="pixel_values_videos", + pixel_values_flat=self._validate_pixel_values( + pixel_values_flat_video), + num_patches=video_num_patches, + ) + + raise AssertionError("This line should be unreachable.") + def _process_image_input( self, - image_input: InternVLImageInputs, - ) -> Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor, ...]]: + image_input: Union[InternVLImageInputs, InternVLVideoPixelInputs], + ) -> tuple[torch.Tensor, ...]: if image_input["type"] == "image_embeds": return image_input["data"] @@ -839,8 +1262,8 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): # Only one image in the current batch if len(num_patches) == 1: - return image_embeds.view( - -1, self.config.text_config.hidden_size).unsqueeze(0) + return (image_embeds.view(-1, + self.config.text_config.hidden_size), ) # NOTE: Image embeddings are split into separate tensors for each image # by the size of each embedding. 
@@ -852,8 +1275,26 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): ] return image_embeds.split(image_feature_sizes) + def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: + modalities = {} + + # Preserve the order of modalities if there are multiple of them + # from the order of kwargs. + for input_key in kwargs: + if input_key in ("pixel_values_flat", + "image_embeds") and "images" not in modalities: + modalities["images"] = self._parse_and_validate_image_input( + **kwargs) + if input_key in ("pixel_values_flat_video", + ) and "videos" not in modalities: + modalities["videos"] = self._parse_and_validate_video_input( + **kwargs) + + return modalities + def _set_visual_token_mask(self, input_ids: torch.Tensor) -> None: if self.is_mono: + assert self.img_context_token_id is not None self.visual_token_mask = ( input_ids == self.img_context_token_id).reshape(-1, 1) else: @@ -864,11 +1305,28 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): def get_multimodal_embeddings( self, **kwargs: object) -> Optional[MultiModalEmbeddings]: - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input is None: + + modalities = self._parse_and_validate_multimodal_inputs(**kwargs) + if not modalities: return None - return self._process_image_input(image_input) + # The result multimodal_embeddings is tuple of tensors, with each + # tensor correspoending to a multimodal data item (image or video). + multimodal_embeddings: tuple[torch.Tensor, ...] = () + + # NOTE: It is important to iterate over the keys in this dictionary + # to preserve the order of the modalities. 
+ for modality in modalities: + if modality == "images": + image_input = modalities["images"] + vision_embeddings = self._process_image_input(image_input) + multimodal_embeddings += vision_embeddings + if modality == "videos": + video_input = modalities["videos"] + video_embeddings = self._process_image_input(video_input) + multimodal_embeddings += video_embeddings + + return multimodal_embeddings def get_input_embeddings( self, @@ -877,13 +1335,18 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: - assert self.img_context_token_id is not None + context_token_ids = [ + token_id for token_id in (self.img_context_token_id, + self.video_context_token_id) + if token_id is not None + ] + assert len(context_token_ids) >= 1 self._set_visual_token_mask(input_ids) inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, multimodal_embeddings, - self.img_context_token_id, + context_token_ids, ) return inputs_embeds @@ -943,3 +1406,12 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): ] loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) return loader.load_weights(weights) + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="mlp1", + tower_model="vision_model") diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index c15c0213b520c..d36b6466c0bb9 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -162,20 +162,9 @@ class LlamaAttention(nn.Module): prefix=f"{prefix}.o_proj", ) - is_neox_style = True - is_gguf = quant_config and quant_config.get_name() == "gguf" - if is_gguf and config.model_type == "llama": - is_neox_style = False - - self.rotary_emb = get_rope( - 
self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, - is_neox_style=is_neox_style, - partial_rotary_factor=self.partial_rotary_factor, - ) + self._init_rotary_emb(config, + rope_scaling=rope_scaling, + quant_config=quant_config) if hasattr(config, "interleaved_sliding_window"): interleaved_sliding_window = config.interleaved_sliding_window @@ -214,6 +203,24 @@ class LlamaAttention(nn.Module): output, _ = self.o_proj(attn_output) return output + def _init_rotary_emb(self, config: LlamaConfig, + rope_scaling: Optional[dict[str, Any]], + quant_config: Optional[QuantizationConfig]) -> None: + is_neox_style = True + is_gguf = quant_config and quant_config.get_name() == "gguf" + if is_gguf and config.model_type == "llama": + is_neox_style = False + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=self.max_position_embeddings, + base=self.rope_theta, + rope_scaling=rope_scaling, + is_neox_style=is_neox_style, + partial_rotary_factor=self.partial_rotary_factor, + ) + class LlamaDecoderLayer(nn.Module): diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py index 018ecc2a8c0f0..172dc8b5ec06a 100644 --- a/vllm/model_executor/models/llama_eagle.py +++ b/vllm/model_executor/models/llama_eagle.py @@ -130,13 +130,15 @@ class LlamaModel(nn.Module): class EagleLlamaForCausalLM(LlamaForCausalLM): - def __init__(self, *, vllm_config: VllmConfig, start_layer_id: int = 0): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): nn.Module.__init__(self) self.config = vllm_config. 
\ speculative_config.draft_model_config.hf_config + target_layer_num = vllm_config.model_config.get_num_layers( + vllm_config.parallel_config) self.model = LlamaModel(vllm_config=vllm_config, prefix="model", - start_layer_id=start_layer_id) + start_layer_id=target_layer_num) logit_scale = getattr(self.config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.config.vocab_size, diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 2302d1352de64..1e40017fc792a 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -175,13 +175,15 @@ class LlamaModel(nn.Module): class Eagle3LlamaForCausalLM(LlamaForCausalLM): - def __init__(self, *, vllm_config: VllmConfig, start_layer_id: int = 0): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): nn.Module.__init__(self) self.config = vllm_config. \ speculative_config.draft_model_config.hf_config + target_layer_num = vllm_config.model_config.get_num_layers( + vllm_config.parallel_config) self.model = LlamaModel(vllm_config=vllm_config, - start_layer_id=start_layer_id, - prefix="model") + prefix="model", + start_layer_id=target_layer_num) logit_scale = getattr(self.config, "logit_scale", 1.0) self.lm_head = ParallelLMHead( @@ -193,8 +195,7 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM): self.logits_processor = LogitsProcessor(self.config.draft_vocab_size, scale=logit_scale) self.draft_id_to_target_id = nn.Parameter( - torch.zeros((self.config.draft_vocab_size), - dtype=torch.long).type(torch.LongTensor), + torch.zeros(self.config.draft_vocab_size, dtype=torch.long), requires_grad=False, ) @@ -213,6 +214,12 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM): ) -> Optional[torch.Tensor]: logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) + if self.draft_id_to_target_id is None: + assert logits.shape[1] == self.config.vocab_size, \ + "Expected logits to have shape " \ + 
f"(*, {self.config.vocab_size}), but got {logits.shape}" + return logits + base = torch.arange(self.config.draft_vocab_size, device=logits.device) targets = base + self.draft_id_to_target_id logits_new = logits.new_full(( @@ -230,19 +237,22 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM): return self.model.fc(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): - loader = AutoWeightsLoader( - self, - skip_prefixes=None, - ) - model_weights = {} + includes_draft_id_mapping = False for name, loaded_weight in weights: if "t2d" in name: continue if "d2t" in name: name = name.replace("d2t", "draft_id_to_target_id") + includes_draft_id_mapping = True elif "lm_head" not in name: name = "model." + name model_weights[name] = loaded_weight - return loader.load_weights(model_weights.items()) + loader = AutoWeightsLoader( + self, + skip_prefixes=None, + skip_substrs=["draft_id_to_target_id"] \ + if not includes_draft_id_mapping else None, + ) + loader.load_weights(model_weights.items()) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 95c1a0ca0b981..ced71b6dcdebe 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -721,9 +721,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): batch. pixel_values: The pixels in each input image. 
- :::{seealso} - {class}`LlavaImageInputs` - ::: + Info: + [LlavaImageInputs][] """ if intermediate_tensors is not None: inputs_embeds = None diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index e731f1bfdb9ab..2fb79f57a67f1 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -135,11 +135,13 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo): current_aspect_ratio = current_width / current_height if aspect_ratio > current_aspect_ratio: - new_height = (original_height * current_width) // original_width + new_height = int( + round(original_height * (current_width / original_width), 7)) padding = (current_height - new_height) // 2 current_height = current_height - (2 * padding) else: - new_width = (original_width * current_height) // original_height + new_width = int( + round(original_width * (current_height / original_height), 7)) padding = (current_width - new_width) // 2 current_width = current_width - (2 * padding) @@ -538,7 +540,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, Unlike in LLaVA-1.5, the number of image tokens inputted to the language model depends on the original size of the input image. Including the original image token in the input, the required number of image tokens - is given by {func}`get_llava_next_image_feature_size`. + is given by [get_llava_next_image_feature_size][]. This way, the `positions` and `attn_metadata` are consistent with the `input_ids`. @@ -549,9 +551,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, pixel_values: The pixels in each grid patch for each input image. image_sizes: The original `(height, width)` for each input image. 
- :::{seealso} - {class}`LlavaNextImageInputs` - ::: + Info: + [LlavaNextImageInputs][] """ if intermediate_tensors is not None: inputs_embeds = None diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 49f1ecb4be897..7ea759fd59b82 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -116,11 +116,13 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo): current_aspect_ratio = current_width / current_height if aspect_ratio > current_aspect_ratio: - new_height = (original_height * current_width) // original_width + new_height = int( + round(original_height * (current_width / original_width), 7)) padding = (current_height - new_height) // 2 current_height = current_height - (2 * padding) else: - new_width = (original_width * current_height) // original_height + new_width = int( + round(original_width * (current_height / original_height), 7)) padding = (current_width - new_width) // 2 current_width = current_width - (2 * padding) diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index 858a1633befa0..65c6467bcf5fb 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -32,7 +32,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.utils import LayerBlockType -from .utils import (is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -167,6 +167,27 @@ class Mamba2Model(nn.Module): return hidden_states + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "A_log" in name: + name = name.replace("A_log", "A") + + # Skip loading extra bias 
for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsV0Only): @@ -282,21 +303,5 @@ class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree, def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "A_log" in name: - name = name.replace("A_log", "A") - - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index 588bcb628f8ca..95ef1134b1bf9 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -51,10 +51,7 @@ class Medusa(nn.Module): needs to have truncated_vocab_size (=k) as an attribute.""" def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: - if hasattr(vllm_config, 'draft_model_config'): - config = vllm_config.draft_model_config.hf_config - else: - config = vllm_config.model_config.hf_config + config = vllm_config.speculative_config.draft_model_config.hf_config super().__init__() self.config = config self.blocks = nn.ModuleList([ diff --git a/vllm/model_executor/models/mimo_mtp.py b/vllm/model_executor/models/mimo_mtp.py index 
adcfcaa6b1e6a..cbca6a4c8f9d2 100644 --- a/vllm/model_executor/models/mimo_mtp.py +++ b/vllm/model_executor/models/mimo_mtp.py @@ -250,7 +250,7 @@ class MiMoMTP(nn.Module): return loaded_params def map_model_name_to_mtp_param_name(self, name: str) -> str: - import re + import regex as re name_without_prefix = [ "token_layernorm", "hidden_layernorm", "input_proj", "final_layernorm" diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index d99ae81468a9b..f471a86ffba34 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -242,9 +242,7 @@ class MiniCPMAttention(nn.Module): base=rope_theta, rope_scaling=rope_scaling, ) - # set rope as fp32 instead of bf16 - self.rotary_emb.cos_sin_cache = self.rotary_emb._compute_cos_sin_cache( - ) + self.attn = Attention(self.num_heads, self.head_dim, self.scaling, @@ -447,6 +445,7 @@ class MiniCPMModel(nn.Module): for weight_name in ["w1", "w2", "w3"] ] params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: @@ -570,7 +569,7 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, intermediate_tensors, - inputs_embeds) + inputs_embeds) / self.scale_width return hidden_states def compute_logits( @@ -578,7 +577,6 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - hidden_states = hidden_states / self.scale_width logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) return logits diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py new file mode 100644 index 0000000000000..039c3d22d1604 --- /dev/null +++ 
b/vllm/model_executor/models/minicpm_eagle.py @@ -0,0 +1,390 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only EagleMiniCPM model compatible with HuggingFace weights.""" +import math +from collections.abc import Iterable +from typing import Optional, Union + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from .interfaces import SupportsLoRA, SupportsPP +from .minicpm import MiniCPMAttention as EagleMiniCPMAttention +from .minicpm import MiniCPMMLP as EagleMiniCPMMLP +from .minicpm import MiniCPMMoE as EagleMiniCPMMoE +from .utils import (AutoWeightsLoader, is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, maybe_prefix) + + +class EagleMiniCPMDecoderLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.config = config + self.cache_config = cache_config + self.quant_config = quant_config + self.hidden_size = config.hidden_size + self.rope_theta = getattr(config, "rope_theta", 10000) + self.rope_scaling = getattr(config, "rope_scaling", None) + self.max_position_embeddings = getattr(config, + "max_position_embeddings", 8192) + self.prefix = prefix + self._init_attn_block() + self._init_ffn_block() + + def _init_attn_block(self): + self.input_layernorm = RMSNorm(self.config.hidden_size, + 
eps=self.config.rms_norm_eps) + self.self_attn = EagleMiniCPMAttention( + hidden_size=self.hidden_size, + num_heads=self.config.num_attention_heads, + num_kv_heads=self.config.num_key_value_heads, + rope_theta=self.rope_theta, + rope_scaling=self.rope_scaling, + max_position_embeddings=self.max_position_embeddings, + cache_config=self.cache_config, + quant_config=self.quant_config, + prefix=f"{self.prefix}.self_attn", + ) + + def _init_ffn_block(self): + self.post_attention_layernorm = RMSNorm(self.config.hidden_size, + eps=self.config.rms_norm_eps) + self.num_experts = getattr(self.config, "num_experts", 0) + if self.num_experts == 0: + self.mlp = EagleMiniCPMMLP( + hidden_size=self.hidden_size, + intermediate_size=self.config.intermediate_size, + hidden_act=self.config.hidden_act, + hidden_act_param=getattr(self.config, "hidden_act_param", 0.), + quant_config=self.quant_config, + ) + else: + self.mlp = EagleMiniCPMMoE( + num_experts=self.config.num_experts, + top_k=self.config.num_experts_per_tok, + hidden_size=self.config.hidden_size, + intermediate_size=self.config.intermediate_size) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + ) -> tuple[torch.Tensor, torch.Tensor]: + # Self Attention + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + hidden_states = residual + hidden_states * \ + (self.config.scale_depth / math.sqrt(self.config.mup_denominator)) + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states * \ + (self.config.scale_depth / math.sqrt(self.config.mup_denominator)) + + return hidden_states, None + + +@support_torch_compile +class EagleMiniCPMModel(nn.Module): + + def __init__(self, + *, + vllm_config: VllmConfig, + 
prefix: str = "", + start_layer: int = 0): + super().__init__() + + config = vllm_config.speculative_config.draft_model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + + self.config = config + self.cache_config = cache_config + self.quant_config = quant_config + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + self.fc = torch.nn.Linear(self.config.hidden_size * 2, + self.config.hidden_size, + bias=False) + self.input_norm1 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.input_norm2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + ) + self.num_experts = getattr(self.config, "num_experts", 0) + self._init_layers(prefix, config, cache_config, quant_config, + start_layer) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], self.config.hidden_size)) + + def _init_layers( + self, + prefix: str, + config: PretrainedConfig, + cache_config: Optional[CacheConfig], + quant_config: Optional[QuantizationConfig], + start_layer: int, + ): + self.eagle_layers = nn.ModuleList([ + EagleMiniCPMDecoderLayer( + config, + cache_config, + quant_config, + f"{prefix}.eagle_layers.{i + start_layer}", + ) for i in range(self.config.num_hidden_layers) + ]) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + embedding = self.embed_tokens(input_ids) + return embedding * self.config.scale_emb + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> Union[torch.Tensor, IntermediateTensors]: + 
input_embeds = self.get_input_embeddings(input_ids) + input_embeds = self.input_norm1(input_embeds) + hidden_states = self.input_norm2(hidden_states) + + hidden_states = self.fc( + torch.cat((input_embeds, hidden_states), dim=-1)) + residual = None + for layer in self.eagle_layers: + hidden_states, residual = layer( + positions, + hidden_states, + residual, + ) + + return hidden_states, hidden_states + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + expert_params_mapping = [ + # (param_name, weight_name, expert_id) + ("ws" if weight_name in ["w1", "w3"] else "w2s", + f"experts.{expert_id}.{weight_name}.weight", expert_id) + for expert_id in range(self.num_experts) + for weight_name in ["w1", "w2", "w3"] + ] + params_dict = dict(self.named_parameters()) + + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for param_name, weight_name, expert_id in expert_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, + loaded_weight, + weight_name, + expert_id=expert_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + embedding_modules = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.speculative_config.draft_model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + + self.prefix = prefix + self.vllm_config = vllm_config + self.config = config + self.lora_config = lora_config + self.cache_config = cache_config + self.quant_config = quant_config + + target_layer_num = vllm_config.model_config.get_num_layers( + vllm_config.parallel_config) + + self.model = self._init_model(vllm_config=vllm_config, + 
prefix=maybe_prefix(prefix, "model"), + start_layer=target_layer_num) + + unpadded_vocab_size = config.vocab_size + if lora_config: + unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( + unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else lora_config.lora_vocab_padding_size, + quant_config=quant_config, + ) + if config.tie_word_embeddings: + self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens) + self.scale_width = self.config.hidden_size / self.config.dim_model_base + + self.logits_processor = LogitsProcessor(unpadded_vocab_size, + config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + def _init_model(self, + *, + vllm_config: VllmConfig, + prefix: str = "", + start_layer: int = 0): + return EagleMiniCPMModel(vllm_config=vllm_config, + prefix=prefix, + start_layer=start_layer) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + hidden_states, hidden_states2 = self.model(input_ids, positions, + hidden_states) + hidden_states = hidden_states / self.scale_width + hidden_states2 = hidden_states2 / self.scale_width + return hidden_states, hidden_states2 + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] + if self.config.tie_word_embeddings 
else None), + ) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 0285402dadf7f..ac0fe7b10c836 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -2,10 +2,10 @@ """Inference-only MiniMaxText01 model.""" import copy import math -import re from collections.abc import Iterable from typing import Optional, Union +import regex as re import torch import torch.distributed import torch.nn.functional as F @@ -141,7 +141,7 @@ class MiniMaxText01RotaryEmbedding(CustomOp): head_size: int, rotary_dim: int, max_position: int, - base: int, + base: float, is_neox_style: bool, cache_dtype: torch.dtype, ) -> None: @@ -155,10 +155,7 @@ class MiniMaxText01RotaryEmbedding(CustomOp): cache = self._compute_cos_sin_cache().to(cache_dtype) self.register_buffer("cos_sin_cache", cache, persistent=False) - def _compute_inv_freq( - self, - base: Union[int, float], - ) -> torch.Tensor: + def _compute_inv_freq(self, base: float) -> torch.Tensor: """Compute the inverse frequency.""" inv_freq = 1.0 / (base**(torch.arange( 0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim)) @@ -604,8 +601,9 @@ class MiniMaxText01DecoderLayer(nn.Module): rope_theta = getattr(config, "rope_theta", 10000) - head_dim = getattr(config, "head_dim", - config.hidden_size // config.num_attention_heads) + head_dim = getattr(config, "head_dim", None) + if head_dim is None: + head_dim = config.hidden_size // config.num_attention_heads if hasattr(config, "max_model_len") and isinstance( config.max_model_len, int): max_position_embeddings = min(config.max_position_embeddings, @@ -861,8 +859,9 @@ class MiniMaxText01Model(nn.Module): cache_shape=self.cache_shape) rope_theta = getattr(config, "rope_theta", 10000) - head_dim = getattr(config, "head_dim", - config.hidden_size // config.num_attention_heads) + head_dim = getattr(config, "head_dim", None) + if 
head_dim is None: + head_dim = config.hidden_size // config.num_attention_heads if hasattr(config, "max_model_len") and isinstance( config.max_model_len, int): max_position_embeddings = min(config.max_position_embeddings, diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 2b9cbf10440ab..051a73120838e 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -559,9 +559,8 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA, batch. pixel_values: The pixels in each input image. - :::{seealso} - {class}`Mistral3ImagePixelInputs` - ::: + Info: + [Mistral3ImagePixelInputs][] """ if intermediate_tensors is not None: inputs_embeds = None diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 4823808e89067..9bc7a16153e1f 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -138,8 +138,9 @@ class MixtralAttention(nn.Module): assert tp_size % self.total_num_kv_heads == 0 self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) # MixtralConfig has an optional head_dim argument - self.head_dim = getattr(config, "head_dim", - self.hidden_size // self.total_num_heads) + self.head_dim = getattr(config, "head_dim", None) + if self.head_dim is None: + self.head_dim = self.hidden_size // self.total_num_heads self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index f096f6a7996dc..8220200d270c2 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -193,8 +193,9 @@ class MixtralAttention(nn.Module): assert tp_size % self.total_num_kv_heads == 0 self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) # MixtralConfig has an optional head_dim argument - 
self.head_dim = getattr(config, "head_dim", - self.hidden_size // self.total_num_heads) + self.head_dim = getattr(config, "head_dim", None) + if self.head_dim is None: + self.head_dim = self.hidden_size // self.total_num_heads self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 8c98492c0bedd..58549b10e9666 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -34,6 +34,7 @@ from vllm.distributed import get_tensor_model_parallel_world_size from vllm.inputs import InputProcessingContext from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, + ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope @@ -49,6 +50,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.multimodal.utils import run_dp_sharded_vision_model from vllm.sequence import IntermediateTensors from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP @@ -84,23 +86,29 @@ class Llama4ImagePatchInputs(TypedDict): class Llama4VisionMLP(nn.Module): - def __init__(self, - input_size: int, - intermediate_size: int, - output_size: int, - bias: bool, - output_activation: bool, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + def __init__( + self, + input_size: int, + intermediate_size: int, + output_size: int, + bias: bool, + output_activation: bool, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + use_data_parallel: bool = False, + ): super().__init__() - self.fc1 = ColumnParallelLinear( + cls_fc1 = (ReplicatedLinear + 
if use_data_parallel else ColumnParallelLinear) + self.fc1 = cls_fc1( input_size=input_size, output_size=intermediate_size, bias=bias, quant_config=quant_config, prefix=f"{prefix}.fc1", ) - self.fc2 = RowParallelLinear( + cls_fc2 = ReplicatedLinear if use_data_parallel else RowParallelLinear + self.fc2 = cls_fc2( input_size=intermediate_size, output_size=output_size, bias=bias, @@ -155,10 +163,12 @@ def pixel_shuffle(input_tensor, shuffle_ratio): int(channels / shuffle_ratio)) reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous() - reshaped_tensor = reshaped_tensor.view(batch_size, - int(height * shuffle_ratio), - int(width * shuffle_ratio), - int(channels / (shuffle_ratio**2))) + reshaped_tensor = reshaped_tensor.view( + batch_size, + int(height * shuffle_ratio), + int(width * shuffle_ratio), + int(channels / (shuffle_ratio**2)), + ) reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous() output_tensor = reshaped_tensor.view(batch_size, -1, @@ -173,6 +183,7 @@ class Llama4VisionPixelShuffleMLP(nn.Module): config, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ): super().__init__() self.pixel_shuffle_ratio = config.pixel_shuffle_ratio @@ -186,7 +197,9 @@ class Llama4VisionPixelShuffleMLP(nn.Module): bias=config.multi_modal_projector_bias, output_activation=True, quant_config=quant_config, - prefix=f"{prefix}.mlp") + prefix=f"{prefix}.mlp", + use_data_parallel=use_data_parallel, + ) def forward(self, encoded_patches: torch.Tensor) -> torch.Tensor: encoded_patches = pixel_shuffle(encoded_patches, @@ -201,10 +214,12 @@ class Llama4VisionAttention(nn.Module): config: Llama4VisionConfig, quant_config: Optional[QuantizationConfig], prefix: str = "", + use_data_parallel: bool = False, ): super().__init__() self.config = config - self.tp_size = get_tensor_model_parallel_world_size() + self.tp_size = (1 if use_data_parallel else + get_tensor_model_parallel_world_size()) self.embed_dim = 
config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = config.hidden_size // self.num_heads @@ -217,22 +232,39 @@ class Llama4VisionAttention(nn.Module): self.attn = MultiHeadAttention(self.num_local_heads, self.head_dim, self.scaling) - self.qkv_proj = QKVParallelLinear( - self.embed_dim, - self.head_dim, - self.num_heads, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.qkv_proj", - ) - self.o_proj = RowParallelLinear( - self.num_heads * self.head_dim, - self.embed_dim, - bias=True, - input_is_parallel=True, - quant_config=quant_config, - prefix=f"{prefix}.o_proj", - ) + + if use_data_parallel: + self.qkv_proj = ReplicatedLinear( + self.embed_dim, + self.q_size + 2 * self.kv_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.o_proj = ReplicatedLinear( + self.num_heads * self.head_dim, + self.embed_dim, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + else: + self.qkv_proj = QKVParallelLinear( + self.embed_dim, + self.head_dim, + self.num_heads, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.o_proj = RowParallelLinear( + self.num_heads * self.head_dim, + self.embed_dim, + bias=True, + input_is_parallel=True, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) self.rotary_emb = get_rope( head_size=self.head_dim, @@ -275,22 +307,29 @@ class Llama4VisionEncoderLayer(nn.Module): config: Llama4VisionConfig, quant_config: Optional[QuantizationConfig], prefix: str = "", + use_data_parallel: bool = False, ): super().__init__() self.hidden_size = config.hidden_size self.num_attention_heads = config.num_attention_heads self.intermediate_size = config.intermediate_size - self.self_attn = Llama4VisionAttention(config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn") - self.mlp = Llama4VisionMLP(input_size=config.hidden_size, - intermediate_size=config.intermediate_size, - output_size=config.hidden_size, - bias=True, 
- output_activation=False, - quant_config=quant_config, - prefix=f"{prefix}.mlp") + self.self_attn = Llama4VisionAttention( + config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + use_data_parallel=use_data_parallel, + ) + self.mlp = Llama4VisionMLP( + input_size=config.hidden_size, + intermediate_size=config.intermediate_size, + output_size=config.hidden_size, + bias=True, + output_activation=False, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + use_data_parallel=use_data_parallel, + ) self.input_layernorm = nn.LayerNorm(config.hidden_size) self.post_attention_layernorm = nn.LayerNorm(config.hidden_size) @@ -322,6 +361,7 @@ class Llama4VisionEncoder(nn.Module): config: Llama4VisionConfig, quant_config: Optional[QuantizationConfig], prefix: str = "", + use_data_parallel: bool = False, ): super().__init__() self.config = config @@ -330,6 +370,7 @@ class Llama4VisionEncoder(nn.Module): config, quant_config=quant_config, prefix=f"{prefix}.layers.{layer_idx}", + use_data_parallel=use_data_parallel, ) for layer_idx in range(config.num_hidden_layers) ]) @@ -357,23 +398,33 @@ class Llama4VisionEncoder(nn.Module): class Llama4UnfoldConvolution(nn.Module): - def __init__(self, - config: Llama4VisionConfig, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + def __init__( + self, + config: Llama4VisionConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + use_data_parallel: bool = False, + ): super().__init__() kernel_size = config.patch_size if isinstance(kernel_size, int): kernel_size = (kernel_size, kernel_size) self.unfold = torch.nn.Unfold(kernel_size=kernel_size, stride=config.patch_size) - self.linear = ColumnParallelLinear(config.num_channels * - kernel_size[0] * kernel_size[1], - config.hidden_size, - bias=False, - quant_config=quant_config, - gather_output=True, - prefix=f"{prefix}.linear") + params = { + "input_size": + config.num_channels * kernel_size[0] * kernel_size[1], + 
"output_size": config.hidden_size, + "bias": False, + "quant_config": quant_config, + "prefix": f"{prefix}.linear", + } + if use_data_parallel: + cls = ReplicatedLinear + else: + cls = ColumnParallelLinear + params["gather_output"] = True + self.linear = cls(**params) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.unfold(hidden_states) @@ -389,6 +440,7 @@ class Llama4VisionModel(nn.Module): config: Llama4VisionConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ): super().__init__() self.config = config @@ -403,7 +455,9 @@ class Llama4VisionModel(nn.Module): self.patch_embedding = Llama4UnfoldConvolution( config, quant_config=quant_config, - prefix=f"{prefix}.patch_embedding") + prefix=f"{prefix}.patch_embedding", + use_data_parallel=use_data_parallel, + ) self.class_embedding = nn.Parameter(self.scale * torch.randn(self.hidden_size)) @@ -415,11 +469,18 @@ class Llama4VisionModel(nn.Module): self.layernorm_post = nn.LayerNorm(self.hidden_size, eps=1e-5) # encoders - self.model = Llama4VisionEncoder(config, - quant_config=quant_config, - prefix=f"{prefix}.model") + self.model = Llama4VisionEncoder( + config, + quant_config=quant_config, + prefix=f"{prefix}.model", + use_data_parallel=use_data_parallel, + ) self.vision_adapter = Llama4VisionPixelShuffleMLP( - config, quant_config, prefix=f"{prefix}.vision_adapter") + config, + quant_config, + prefix=f"{prefix}.vision_adapter", + use_data_parallel=use_data_parallel, + ) def forward( self, @@ -528,8 +589,9 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo] vision_config = self.info.get_hf_config().vision_config if processed_outputs.get("pixel_values") is not None: - assert "images" in mm_data, \ - "images expected to be in mm_data when pixel_values is present" + assert ( + "images" in mm_data + ), "images expected to be in mm_data when pixel_values is present" images = mm_data["images"] 
parsed_images = (self._get_data_parser().parse_mm_data({ @@ -546,8 +608,8 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo] get_best_fit( (image.size[1], image.size[0]), torch.tensor(possible_resolutions), - resize_to_max_canvas=image_processor.resize_to_max_canvas) - for image in parsed_images + resize_to_max_canvas=image_processor.resize_to_max_canvas, + ) for image in parsed_images ] # TODO tile height/width do not necessarily need to match aspect_ratios = [(image_size[0] // tile_size, @@ -659,13 +721,17 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config + self.use_data_parallel = (vllm_config.parallel_config. + enable_multimodal_encoder_data_parallel) self.config = config self.quant_config = quant_config self.multimodal_config = multimodal_config - self.vision_model = Llama4VisionModel(config.vision_config, - None, - prefix=maybe_prefix( - prefix, "vision_model")) + self.vision_model = Llama4VisionModel( + config.vision_config, + None, + prefix=maybe_prefix(prefix, "vision_model"), + use_data_parallel=self.use_data_parallel, + ) self.multi_modal_projector = Llama4MultiModalProjector( self.config, None, @@ -709,7 +775,13 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, flat_data = image_input["flat_data"] patches_per_image = image_input["patches_per_image"].tolist() - vision_embeddings_flat = self.vision_model(flat_data) + # shard image input + if self.use_data_parallel: + vision_embeddings_flat = run_dp_sharded_vision_model( + flat_data, self.vision_model) + else: + vision_embeddings_flat = self.vision_model(flat_data) + vision_embeddings_flat = self.multi_modal_projector( vision_embeddings_flat) @@ -796,6 +868,30 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, return get_prefix_weights(), get_other_weights() + def 
_consolidate_qkv_weights( + self, weights: Iterable[tuple[str, torch.Tensor]] + ) -> Iterable[tuple[str, torch.Tensor]]: + qkv_idx_mappings = { + ".self_attn.q_proj": 0, + ".self_attn.k_proj": 1, + ".self_attn.v_proj": 2, + } + qkv_weights = {} + for name, loaded_weight in weights: + for weight_name, idx in qkv_idx_mappings.items(): + if weight_name not in name: + continue + new_name = name.replace(weight_name, ".self_attn.qkv_proj") + if new_name not in qkv_weights: + qkv_weights[new_name] = [None] * 3 + qkv_weights[new_name][idx] = loaded_weight + break + else: + yield name, loaded_weight + for key, weight in qkv_weights.items(): + qkv_weight = torch.cat(weight, dim=0) + yield key, qkv_weight + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: @@ -818,9 +914,12 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, assert loaded_language_model_params is not None updated_params.update(loaded_language_model_params) + if self.use_data_parallel: + other_weights = self._consolidate_qkv_weights(other_weights) + for name, loaded_weight in other_weights: for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: + if weight_name not in name or self.use_data_parallel: continue name = name.replace(weight_name, param_name) param = params_dict[name] diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 86552aa05bf95..18eab6051736f 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -12,7 +12,7 @@ from vllm.config import VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.linear import (QKVParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import CrossEncodingPooler +from vllm.model_executor.layers.pooler import ClassifierPooler from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding from 
vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) @@ -278,8 +278,9 @@ class ModernBertForSequenceClassification(nn.Module, SupportsCrossEncoding): self.model = ModernBertModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "modernbert")) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self._pooler = CrossEncodingPooler(config, self.classifier, - ModernBertPooler(config)) + self._pooler = ClassifierPooler(vllm_config.model_config, + self.classifier, + ModernBertPooler(config)) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index e215582a37ac8..640a2049a6293 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -965,7 +965,7 @@ def select_tiling( class MolmoProcessorWrapper: """ - Wraps {class}`MolmoProcessor` so that it can be called directly. + Wraps `MolmoProcessor` so that it can be called directly. 
The original definition can be found here: https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/preprocessing_molmo.py diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index c5c5155a2df56..d0999e30e1ba4 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -158,8 +158,9 @@ class NemotronAttention(nn.Module): assert tp_size % self.total_num_kv_heads == 0 self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) # MistralConfig has an optional head_dim introduced by Mistral-Nemo - self.head_dim = getattr(config, "head_dim", - self.hidden_size // self.total_num_heads) + self.head_dim = getattr(config, "head_dim", None) + if self.head_dim is None: + self.head_dim = self.hidden_size // self.total_num_heads self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index f4d5a77f2086d..9808fe05558e2 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -23,18 +23,20 @@ # limitations under the License. 
"""Inference-only deci model compatible with HuggingFace weights.""" from collections.abc import Iterable -from typing import Optional, Union +from typing import Any, Optional, Union import torch from torch import nn from transformers import LlamaConfig +from vllm.attention import AttentionType from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -62,6 +64,48 @@ def _find_multiple(n: int, k: int) -> int: return n + k - (n % k) +class DeciLMAttention(LlamaAttention): + + def __init__( + self, + config: LlamaConfig, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[dict[str, Any]] = None, + max_position_embeddings: int = 8192, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + bias_o_proj: bool = False, + cache_config: Optional[CacheConfig] = None, + prefix: str = "", + attn_type: str = AttentionType.DECODER, + ) -> None: + super().__init__(config, hidden_size, num_heads, num_kv_heads, + rope_theta, rope_scaling, max_position_embeddings, + quant_config, bias, bias_o_proj, cache_config, prefix, + attn_type) + + def _init_rotary_emb(self, config, rope_scaling: Optional[dict[str, Any]], + quant_config: Optional[QuantizationConfig]) -> None: + # Enables YARN for Mistral and LLaMA4 derivatives. 
+ is_neox_style = True + if hasattr(config, "position_embedding_type"): + is_neox_style = config.position_embedding_type not in [ + "mistral_yarn", "rope_llama4" + ] + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=self.max_position_embeddings, + base=self.rope_theta, + rope_scaling=rope_scaling, + is_neox_style=is_neox_style, + partial_rotary_factor=self.partial_rotary_factor) + + class DeciLMDecoderLayer(nn.Module): def __init__( @@ -98,7 +142,7 @@ class DeciLMDecoderLayer(nn.Module): if not self._is_no_op_attention: num_kv_heads = (config.num_attention_heads // block_config.attention.n_heads_in_group) - self.self_attn = LlamaAttention( + self.self_attn = DeciLMAttention( config=config, hidden_size=self.hidden_size, num_heads=config.num_attention_heads, diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index 62a7deab6a10c..172434e66ae2c 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -22,9 +22,10 @@ from vllm.multimodal.processing import (PromptReplacement, PromptUpdate, PromptUpdateDetails) from .intern_vit import InternVisionModel -from .internvl import (BaseInternVLProcessingInfo, BaseInternVLProcessor, - InternVLChatModel, InternVLDummyInputsBuilder, - InternVLMultiModalProcessor) +from .internvl import (BaseInternVLDummyInputsBuilder, + BaseInternVLMultiModalProcessor, + BaseInternVLProcessingInfo, BaseInternVLProcessor, + InternVLChatModel) IMG_PAD = "<|vision_pad|>" @@ -84,7 +85,8 @@ class NVLMProcessingInfo(BaseInternVLProcessingInfo): ) -class NVLMDummyInputsBuilder(InternVLDummyInputsBuilder[NVLMProcessingInfo]): +class NVLMDummyInputsBuilder(BaseInternVLDummyInputsBuilder[NVLMProcessingInfo] + ): def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: num_images = mm_counts.get("image", 0) @@ -110,7 +112,8 @@ class NVLMDummyInputsBuilder(InternVLDummyInputsBuilder[NVLMProcessingInfo]): } -class 
NVLMMultiModalProcessor(InternVLMultiModalProcessor[NVLMProcessingInfo]): +class NVLMMultiModalProcessor( + BaseInternVLMultiModalProcessor[NVLMProcessingInfo]): def _get_prompt_updates( self, diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 0a1fb10c186e5..33adacdae5f5b 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -314,7 +314,8 @@ class Olmo2Model(nn.Module): hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -325,6 +326,7 @@ class Olmo2Model(nn.Module): ] params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() for name, loaded_weight in weights: if is_pp_missing_parameter(name, self): continue @@ -347,6 +349,8 @@ class Olmo2Model(nn.Module): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params class Olmo2ForCausalLM(nn.Module, SupportsPP): diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 6364b89fb8379..af289455527ce 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -13,6 +13,7 @@ # limitations under the License. 
"""Inference-only OLMoE model compatible with HuggingFace weights.""" from collections.abc import Iterable +from functools import partial from typing import Any, Optional, Union import torch @@ -22,7 +23,10 @@ from transformers import PretrainedConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_gather) +from vllm.distributed.utils import split_tensor_along_last_dim from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -140,8 +144,11 @@ class OlmoeAttention(nn.Module): bias=False, quant_config=quant_config, ) - self.q_norm = RMSNorm(hidden_size, eps=1e-5) - self.k_norm = RMSNorm(hidden_size, eps=1e-5) + self.tp_size = tp_size + self.tp_rank = get_tensor_model_parallel_rank() + self.q_norm = RMSNorm(self.total_num_heads * self.head_dim, eps=1e-5) + self.k_norm = RMSNorm(self.total_num_kv_heads * self.head_dim, + eps=1e-5) self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, hidden_size, @@ -165,6 +172,20 @@ class OlmoeAttention(nn.Module): quant_config=quant_config, prefix=f"{prefix}.attn") + def _apply_qk_norm(self, q: torch.Tensor, + k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + if self.tp_size > 1: + q = tensor_model_parallel_all_gather(q.contiguous()) + k = tensor_model_parallel_all_gather(k.contiguous()) + q = self.q_norm(q) + k = self.k_norm(k) + if self.tp_size > 1: + splitter = partial(split_tensor_along_last_dim, + num_partitions=self.tp_size) + q = splitter(q)[self.tp_rank] + k = splitter(k)[self.tp_rank] + return q, k + def forward( self, positions: torch.Tensor, @@ -172,7 +193,7 @@ class 
OlmoeAttention(nn.Module): ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.q_norm(q.contiguous()), self.k_norm(k.contiguous()) + q, k = self._apply_qk_norm(q, k) q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v) output, _ = self.o_proj(attn_output) diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index e03705d48f3e8..232a63c506890 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -30,6 +30,9 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.quantization.gptq import GPTQConfig +from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQMarlinConfig) from vllm.model_executor.models.aimv2 import AIMv2Model from vllm.model_executor.models.siglip import SiglipVisionModel from vllm.model_executor.models.utils import (AutoWeightsLoader, flatten_bn, @@ -48,7 +51,7 @@ from vllm.transformers_utils.configs.ovis import (BaseVisualTokenizerConfig, OvisConfig) from vllm.transformers_utils.processors.ovis import OvisProcessor -from .interfaces import MultiModalEmbeddings, SupportsMultiModal +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .utils import merge_multimodal_embeddings # Cannot find the following number from hf config. @@ -106,12 +109,14 @@ class VisualTokenizer(torch.nn.Module): config: BaseVisualTokenizerConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", - ): + ) -> nn.Module: model_type = config.backbone_config.model_type if model_type == "aimv2": + # No post rms_norm in Ovis2's AIMv2 ViT. 
return AIMv2Model( config=config.backbone_config, quant_config=quant_config, + require_post_norm=False, prefix=prefix, ) elif model_type == "siglip_vision_model": @@ -124,14 +129,14 @@ class VisualTokenizer(torch.nn.Module): f"Unsupported visual tokenizer model_type: {model_type}") @property - def dtype(self): + def dtype(self) -> torch.dtype: return next(self.head.parameters()).dtype @property - def device(self): + def device(self) -> torch.device: return next(self.head.parameters()).device - def tokenize(self, logits): + def tokenize(self, logits: torch.Tensor) -> torch.Tensor: if self.config.tokenize_function == 'softmax': tokens = softmax(logits, dim=-1) elif self.config.tokenize_function == 'gumbel_argmax': @@ -144,7 +149,7 @@ class VisualTokenizer(torch.nn.Module): f'or st_argmax, but got {self.config.tokenize_function}') return tokens - def encode(self, pixel_values): + def encode(self, pixel_values: torch.Tensor) -> torch.Tensor: features = self.backbone(pixel_values) if self.config.drop_cls_token: features = features[:, 1:, :] @@ -395,7 +400,7 @@ class OvisMultiModalProcessor(BaseMultiModalProcessor[OvisProcessingInfo]): @MULTIMODAL_REGISTRY.register_processor(OvisMultiModalProcessor, info=OvisProcessingInfo, dummy_inputs=OvisDummyInputsBuilder) -class Ovis(nn.Module, SupportsMultiModal): +class Ovis(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -410,7 +415,7 @@ class Ovis(nn.Module, SupportsMultiModal): self.visual_tokenizer = VisualTokenizer( config=config.visual_tokenizer_config, - quant_config=quant_config, + quant_config=self._maybe_ignore_quant_config(quant_config), prefix=f"{prefix}.visual_tokenizer", ) @@ -421,9 +426,16 @@ class Ovis(nn.Module, SupportsMultiModal): text_model_type = self.config.get_text_config().model_type self.image_pad_token_id = IMAGE_PAD_TOKEN_ID_MAP[text_model_type] - # TODO(Isotr0py): PP support - # self.make_empty_intermediate_tensors = ( - 
# self.language_model.make_empty_intermediate_tensors) + self.make_empty_intermediate_tensors = ( + self.get_language_model().make_empty_intermediate_tensors) + + def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): + # GPTQ configs do not have a list of ignored modules, however AutoGPTQ + # seems to avoid vision encoder sections for some models. + # See: https://huggingface.co/AIDC-AI/Ovis2-2B-GPTQ-Int4 + if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)): + return None + return quant_config def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[OvisImagePatchInputs]: diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index bb4d46be3f997..b757e661d7712 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -14,10 +14,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import re from collections.abc import Iterable, Mapping, Sequence from typing import Any, Literal, Optional, TypedDict, Union +import regex as re import torch import torch.nn as nn from transformers import (BatchFeature, CLIPVisionConfig, PretrainedConfig, diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index c664d2371e27c..9f28d4cef4251 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -9,7 +9,9 @@ from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn import torch.nn.functional as F -from mistral_common.protocol.instruct.messages import ImageChunk +from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk, + UserMessage) +from mistral_common.protocol.instruct.request import ChatCompletionRequest from mistral_common.tokens.tokenizers.multimodal import ImageEncoder from PIL import Image from transformers import PixtralVisionConfig, TensorType @@ -39,7 +41,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, MultiModalHashes, PromptReplacement, PromptUpdate, PromptUpdateDetails) -from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.tokenizer import (MistralTokenizer, cached_tokenizer_from_config) @@ -65,14 +67,14 @@ class PixtralImagePixelInputs(TypedDict): """ Shape: `(batch_size * num_images, num_channels, image_width, image_height)` - The result of stacking {attr}`ImageEncoding.tokens` from each prompt. + The result of stacking `ImageEncoding.tokens` from each prompt. """ class PixtralProcessorAdapter: """ Provide a HF-compatible interface for - {class}`mistral_common.tokens.tokenizers.multimodal.ImageEncoder`. + `mistral_common.tokens.tokenizers.multimodal.ImageEncoder`. 
""" def __init__(self, tokenizer: MistralTokenizer) -> None: @@ -224,6 +226,28 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]): num_images=num_images) } + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + tokenizer = self.info.get_tokenizer() + + dummy_text = self.get_dummy_text(mm_counts) + dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts) + dummy_images = dummy_mm_data.get("image", []) + + request = ChatCompletionRequest(messages=[ + UserMessage(content=[ + TextChunk(text=dummy_text), + *(ImageChunk(image=image) for image in dummy_images), + ]), + ]) + res = tokenizer.mistral.encode_chat_completion(request) + dummy_tokens = res.tokens + + return ProcessorInputs(prompt=dummy_tokens, mm_data=dummy_mm_data) + class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo] ): @@ -275,8 +299,12 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo] *, return_mm_hashes: bool, ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]: - prompt_ids, mm_kwargs, mm_hashes, _ = super( - )._cached_apply_hf_processor( + ( + prompt_ids, + mm_kwargs, + mm_hashes, + _, + ) = super()._cached_apply_hf_processor( prompt=prompt, mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 0d0d98c59dbc7..a664864ff898f 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -34,32 +34,27 @@ from vllm.attention import Attention, AttentionType from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import 
RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, PoolerOutput +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP -from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, - extract_layer_index, is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index, + is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -logger = init_logger(__name__) - class Qwen2MLP(nn.Module): @@ -499,69 +494,3 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): if self.config.tie_word_embeddings else None), ) return loader.load_weights(weights) - - -class Qwen2EmbeddingModel(nn.Module, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - - hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config - pooler_config = vllm_config.model_config.pooler_config - - self.config = 
config - self.lora_config = lora_config - - self.quant_config = quant_config - self.model = Qwen2Model(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - - # TODO: Replace this model class with as_embedding_model( - # Qwen2ForCausalLM) after changing the default pooling method - if pooler_config.pooling_type is None: - logger.warning( - "This embedding model will default to last-token pooling in " - "an upcoming version. To avoid breaking changes, you should " - "pass `--override-pooler-config '{\"pooling_type\": \"MEAN\"}'`" - " explicitly.") - - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.MEAN, - normalize=True, - softmax=False) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - ) -> torch.Tensor: - return self.model(input_ids, positions, intermediate_tensors) - - def pooler( - self, - hidden_states: torch.Tensor, - pooling_metadata: PoolingMetadata, - ) -> Optional[PoolerOutput]: - return self._pooler(hidden_states, pooling_metadata) - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): - weights = self.hf_to_vllm_mapper.apply(weights) - weights = ((name, data) for name, data in weights - if not name.startswith("lm_head.")) - self.model.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 68dd07820189e..e3fa9f67ca078 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -821,17 +821,6 @@ class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor): dummy_inputs=Qwen2_5_VLDummyInputsBuilder) class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } # To ensure correct weight loading and 
mapping. hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={ diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 0ff0836b08975..873baa56faf37 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1069,17 +1069,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo] dummy_inputs=Qwen2VLDummyInputsBuilder) class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } # To ensure correct weight loading and mapping. hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={ diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index 3701153bace53..f5d242fdf1c26 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -7,12 +7,12 @@ import copy import math -import re import unicodedata from collections.abc import Collection, Mapping, Sequence, Set from functools import lru_cache, partial from typing import Callable, Literal, Optional, TypedDict, Union +import regex as re import torch from torch import nn from torchvision import transforms @@ -382,7 +382,8 @@ def _get_tokenizer_without_image_pad( tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer: """ The logic of adding image pad tokens should only be applied in - {class}`QwenVLProcessor`, so they are patched out here. + [`QwenVLProcessor`][vllm.model_executor.models.qwen_vl.QwenVLProcessor], + so they are patched out here. 
The definition of the wrapped tokenizer can be found here: https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index c55f7ccd344ff..8efd4825beea9 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -79,6 +79,7 @@ _TEXT_GENERATION_MODELS = { "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), "MambaForCausalLM": ("mamba", "MambaForCausalLM"), "FalconMambaForCausalLM": ("mamba", "MambaForCausalLM"), + "FalconH1ForCausalLM":("falcon_h1", "FalconH1ForCausalLM"), "Mamba2ForCausalLM": ("mamba2", "Mamba2ForCausalLM"), "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"), "MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"), @@ -141,7 +142,7 @@ _EMBEDDING_MODELS = { "ModernBertModel": ("modernbert", "ModernBertModel"), "NomicBertModel": ("bert_with_rope", "NomicBertModel"), "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), - "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"), + "Qwen2Model": ("qwen2", "Qwen2ForCausalLM"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), "Qwen2ForProcessRewardModel": ("qwen2_rm", "Qwen2ForProcessRewardModel"), @@ -207,6 +208,7 @@ _MULTIMODAL_MODELS = { "Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), # noqa: E501 "Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"), # noqa: E501 "Qwen2_5OmniModel": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501 + "Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501 "UltravoxModel": ("ultravox", "UltravoxModel"), "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"), # [Encoder-decoder] @@ -221,6 +223,7 @@ _SPECULATIVE_DECODING_MODELS = { "MiMoMTPModel": ("mimo_mtp", "MiMoMTP"), "EAGLEModel": ("eagle", 
"EAGLE"), "EagleLlamaForCausalLM": ("llama_eagle", "EagleLlamaForCausalLM"), + "EagleMiniCPMForCausalLM": ("minicpm_eagle", "EagleMiniCPMForCausalLM"), "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), "MedusaModel": ("medusa", "Medusa"), @@ -381,7 +384,7 @@ class _ModelRegistry: `model_cls` can be either: - - A {class}`torch.nn.Module` class directly referencing the model. + - A [`torch.nn.Module`][] class directly referencing the model. - A string in the format `<module>:<class>` which can be used to lazily import the model. This is useful to avoid initializing CUDA when importing the model and thus the related error diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 9a4d0ab2dd4d7..76008b72941da 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -9,7 +9,7 @@ from torch import nn from transformers import RobertaConfig from vllm.config import VllmConfig -from vllm.model_executor.layers.pooler import CrossEncodingPooler +from vllm.model_executor.layers.pooler import ClassifierPooler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -186,7 +186,9 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding, embedding_class=RobertaEmbedding, add_pooling_layer=False) self.classifier = RobertaClassificationHead(config) - self._pooler = CrossEncodingPooler(config, self.classifier) + + self._pooler = ClassifierPooler(vllm_config.model_config, + self.classifier) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): bert_weights, task_weights = roberta_task_weights_filter(weights) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 3b5334afa7af8..4803da2956ef1 100644 --- a/vllm/model_executor/models/siglip.py +++ 
b/vllm/model_executor/models/siglip.py @@ -130,11 +130,10 @@ class SiglipVisionEmbeddings(nn.Module): embeddings = patch_embeds.flatten(2).transpose(1, 2) if interpolate_pos_encoding: - embeddings = embeddings + self.interpolate_pos_encoding( + embeddings += self.interpolate_pos_encoding( embeddings, height, width) else: - embeddings = embeddings + self.position_embedding( - self.position_ids) + embeddings += self.position_embedding(self.position_ids) return embeddings @@ -271,12 +270,12 @@ class SiglipEncoderLayer(nn.Module): hidden_states = self.layer_norm1(hidden_states) hidden_states, _ = self.self_attn(hidden_states=hidden_states) - hidden_states = residual + hidden_states + hidden_states += residual residual = hidden_states hidden_states = self.layer_norm2(hidden_states) hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states + hidden_states += residual return hidden_states, None @@ -354,7 +353,8 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module): residual = hidden_state hidden_state = self.layernorm(hidden_state) - hidden_state = residual + self.mlp(hidden_state) + hidden_state = self.mlp(hidden_state) + hidden_state += residual return hidden_state[:, 0] diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index 91f6c7753c68b..eefadda918f62 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -24,6 +24,7 @@ from vllm.model_executor.models.intern_vit import (InternVisionModel, InternVisionPatchModel) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargs, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, @@ -78,7 +79,7 @@ SkyworkR1VImageInputs = Union[SkyworkR1VImagePixelInputs, def 
build_transform(input_size: int): MEAN, STD = IMAGENET_MEAN, IMAGENET_STD return T.Compose([ - T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), + T.Lambda(lambda img: convert_image_mode(img, 'RGB')), T.Resize((input_size, input_size), interpolation=T.InterpolationMode.BICUBIC), T.ToTensor(), diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 53e5274aa5740..fcd17cc1c2ba4 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -126,8 +126,9 @@ class SolarAttention(nn.Module): assert tp_size % self.total_num_kv_heads == 0 self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) # MistralConfig has an optional head_dim introduced by Mistral-Nemo - self.head_dim = getattr(config, "head_dim", - self.hidden_size // self.total_num_heads) + self.head_dim = getattr(config, "head_dim", None) + if self.head_dim is None: + self.head_dim = self.hidden_size // self.total_num_heads self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index a8f30b2f27bfe..b87a2ebf211ac 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -14,10 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Wrapper around `transformers` models""" -import re from collections.abc import Iterable +from contextlib import nullcontext from typing import Literal, Optional, Union +import regex as re import torch from torch import nn from transformers import AutoModel, PretrainedConfig, PreTrainedModel @@ -110,6 +111,33 @@ def replace_linear_class( ) +class ConfigOverride: + """Context manager to temporarily override config attributes.""" + + def __init__(self, config: PretrainedConfig, **kwargs): + self.config = config + self.kwargs = kwargs + self.kwargs_original = {} + self.kwargs_delete = set() + + def __enter__(self): + """Override config attributes.""" + for key, value in self.kwargs.items(): + if not hasattr(self.config, key): + self.kwargs_delete.add(key) + self.kwargs_original[key] = getattr(self.config, key, None) + setattr(self.config, key, value) + return self.config + + def __exit__(self, exc_type, exc_value, traceback): + """Restore original config attributes.""" + for key, value in self.kwargs_original.items(): + if key in self.kwargs_delete: + delattr(self.config, key) + else: + setattr(self.config, key, value) + + class TransformersModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -135,8 +163,17 @@ class TransformersModel(nn.Module): self.pp_rank = self.pp_group.rank_in_group self.tp_size = get_tensor_model_parallel_world_size() + # vLLM handles interleaved sliding window attention by creating a new + # interleaved_sliding_window attribute and deleting the sliding_window + # attribute. This breaks the constructors in Transformers so we + # temporarily add the attribute back to construct the model. 
+ config_override = nullcontext() + if hasattr(config, "interleaved_sliding_window"): + config_override = ConfigOverride( + config, sliding_window=config.interleaved_sliding_window) + # Use meta device to delay allocating GPU tensors - with torch.device("meta"): + with torch.device("meta"), config_override: # FIXME(Isotr0py): We need to refactor this part in the future to # avoid registering an extra model layer, otherwise we will need a # weights mapper to rename weights. @@ -262,9 +299,17 @@ class TransformersModel(nn.Module): num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) start, end = get_pp_indices(self.config.num_hidden_layers, self.pp_rank, self.pp_size) - return { - i: - Attention( + + attention_instances = {} + for i in range(start, end): + # Handle interleaved sliding window attention + sliding_window = None + if (hasattr(self.config, "interleaved_sliding_window") + and hasattr(self.config, "sliding_window_pattern") + and ((i + 1) % self.config.sliding_window_pattern > 0)): + sliding_window = self.config.interleaved_sliding_window + + attention_instances[i] = Attention( num_heads=num_heads, head_size=head_size, # NOTE: We use Llama scale as default, if it's set by @@ -273,9 +318,9 @@ class TransformersModel(nn.Module): num_kv_heads=num_kv_heads, cache_config=self.cache_config, quant_config=self.quant_config, + per_layer_sliding_window=sliding_window, prefix=f"{i}.attn") - for i in range(start, end) - } + return attention_instances def init_buffers(self, module: nn.Module): """ diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 027cd748e9def..3d821d3dc6b58 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -66,7 +66,7 @@ class WeightsMapper: class AutoWeightsLoader: """ - Helper class to load weights into a {class}`torch.nn.Module`. It is able + Helper class to load weights into a [`torch.nn.Module`][]. 
It is able to automatically detect child modules and parameters while iterating over the weights only once. diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index f9d89e64bd9db..1b120c3545a56 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """Utils for model executor.""" +import copy from typing import Any, Optional import torch @@ -51,3 +52,23 @@ def _make_synced_weight_loader(original_weight_loader): torch._sync(param) return _synced_weight_loader + + +def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]: + parent_map = copy.deepcopy(getattr(model, "packed_modules_mapping", {})) + + # don't infer mapping if the model has defined it explicitly. + if parent_map: + return parent_map + + # We only check main components instead of whole model submodules + for child in model.children(): + child_map = getattr(child, "packed_modules_mapping", {}) + if any((k in parent_map and parent_map[k] != v) + for k, v in child_map.items()): + raise ValueError( + f"Can't update {type(model).__name__}'s packed_modules_mapping " + f"safely because of conflicts from {type(child).__name__}.") + else: + parent_map.update(child_map) + return parent_map \ No newline at end of file diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 756ea11311daf..815e34d5ac5db 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -8,12 +8,12 @@ from .registry import MultiModalRegistry MULTIMODAL_REGISTRY = MultiModalRegistry() """ -The global {class}`~MultiModalRegistry` is used by model runners to -dispatch data processing according to the target model. +The global [`MultiModalRegistry`][vllm.multimodal.registry.MultiModalRegistry] +is used by model runners to dispatch data processing according to the target +model. 
-:::{seealso} -{ref}`mm-processing` -::: +Info: + [mm_processing](../../../design/mm_processing.html) """ __all__ = [ diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index 53e289370a9f4..b4cd6a90834c0 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -10,6 +10,7 @@ from blake3 import blake3 from PIL import Image from vllm.logger import init_logger +from vllm.multimodal.image import convert_image_mode if TYPE_CHECKING: from vllm.inputs import TokensPrompt @@ -35,7 +36,8 @@ class MultiModalHasher: return np.array(obj).tobytes() if isinstance(obj, Image.Image): - return cls.item_to_bytes("image", np.array(obj.convert("RGBA"))) + return cls.item_to_bytes( + "image", np.asarray(convert_image_mode(obj, "RGBA"))) if isinstance(obj, torch.Tensor): return cls.item_to_bytes("tensor", obj.numpy()) if isinstance(obj, np.ndarray): @@ -43,7 +45,7 @@ class MultiModalHasher: "ndarray", { "dtype": obj.dtype.str, "shape": obj.shape, - "data": obj.data.tobytes(), + "data": obj.tobytes(), }) logger.warning( diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 939928bbf108b..a63ec0bd8ada4 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -22,6 +22,25 @@ def rescale_image_size(image: Image.Image, return image +# TODO: Support customizable background color to fill in. 
+def rgba_to_rgb( + image: Image.Image, background_color=(255, 255, 255)) -> Image.Image: + """Convert an RGBA image to RGB with filled background color.""" + assert image.mode == "RGBA" + converted = Image.new("RGB", image.size, background_color) + converted.paste(image, mask=image.split()[3]) # 3 is the alpha channel + return converted + + +def convert_image_mode(image: Image.Image, to_mode: str): + if image.mode == to_mode: + return image + elif image.mode == "RGBA" and to_mode == "RGB": + return rgba_to_rgb(image) + else: + return image.convert(to_mode) + + class ImageMediaIO(MediaIO[Image.Image]): def __init__(self, *, image_mode: str = "RGB") -> None: @@ -32,7 +51,7 @@ class ImageMediaIO(MediaIO[Image.Image]): def load_bytes(self, data: bytes) -> Image.Image: image = Image.open(BytesIO(data)) image.load() - return image.convert(self.image_mode) + return convert_image_mode(image, self.image_mode) def load_base64(self, media_type: str, data: str) -> Image.Image: return self.load_bytes(base64.b64decode(data)) @@ -40,7 +59,7 @@ class ImageMediaIO(MediaIO[Image.Image]): def load_file(self, filepath: Path) -> Image.Image: image = Image.open(filepath) image.load() - return image.convert(self.image_mode) + return convert_image_mode(image, self.image_mode) def encode_base64( self, @@ -51,7 +70,7 @@ class ImageMediaIO(MediaIO[Image.Image]): image = media with BytesIO() as buffer: - image = image.convert(self.image_mode) + image = convert_image_mode(image, self.image_mode) image.save(buffer, image_format) data = buffer.getvalue() diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 2335af843ed5e..600a34d39ef68 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -29,14 +29,14 @@ _T = TypeVar("_T") HfImageItem: TypeAlias = Union["Image", np.ndarray, "torch.Tensor"] """ -A {class}`transformers.image_utils.ImageInput` representing a single image +A `transformers.image_utils.ImageInput` representing a single image item, which can be 
passed to a HuggingFace `ImageProcessor`. """ HfVideoItem: TypeAlias = Union[list["Image"], np.ndarray, "torch.Tensor", list[np.ndarray], list["torch.Tensor"]] """ -A {class}`transformers.image_utils.VideoInput` representing a single video +A `transformers.image_utils.VideoInput` representing a single video item, which can be passed to a HuggingFace `VideoProcessor`. """ @@ -48,7 +48,7 @@ item, which can be passed to a HuggingFace `AudioProcessor`. ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor"] """ -A {class}`transformers.image_utils.ImageInput` representing a single image +A `transformers.image_utils.ImageInput` representing a single image item, which can be passed to a HuggingFace `ImageProcessor`. Alternatively, a 3-D tensor or batch of 2-D tensors, @@ -58,7 +58,7 @@ these are directly passed to the model without HF processing. VideoItem: TypeAlias = Union[HfVideoItem, "torch.Tensor"] """ -A {class}`transformers.image_utils.VideoInput` representing a single video +A `transformers.image_utils.VideoInput` representing a single video item, which can be passed to a HuggingFace `VideoProcessor`. Alternatively, a 3-D tensor or batch of 2-D tensors, @@ -108,7 +108,8 @@ MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]] """ A dictionary containing an entry for each modality type to input. -The built-in modalities are defined by {class}`MultiModalDataBuiltins`. +The built-in modalities are defined by +[`MultiModalDataBuiltins`][vllm.multimodal.inputs.MultiModalDataBuiltins]. """ @@ -169,7 +170,8 @@ Uses a list instead of a tensor if the dimensions of each element do not match. 
def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: - """Equality check between {data}`NestedTensors` objects.""" + """Equality check between + [`NestedTensors`][vllm.multimodal.inputs.NestedTensors] objects.""" if isinstance(a, torch.Tensor): return isinstance(b, torch.Tensor) and torch.equal(a, b) elif isinstance(b, torch.Tensor): @@ -189,7 +191,7 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: BatchedTensorInputs: TypeAlias = Mapping[str, NestedTensors] """ A dictionary containing nested tensors which have been batched via -{meth}`MultiModalKwargs.batch`. +[`MultiModalKwargs.batch`][vllm.multimodal.inputs.MultiModalKwargs.batch]. """ @@ -197,7 +199,7 @@ A dictionary containing nested tensors which have been batched via class MultiModalFieldElem: """ Represents a keyword argument corresponding to a multi-modal item - in {class}`MultiModalKwargs`. + in [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs]. """ modality: str @@ -208,13 +210,15 @@ class MultiModalFieldElem: key: str """ - The key of this field in {class}`MultiModalKwargs`, + The key of this field in + [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs], i.e. the name of the keyword argument to be passed to the model. """ data: NestedTensors """ - The tensor data of this field in {class}`MultiModalKwargs`, + The tensor data of this field in + [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs], i.e. the value of the keyword argument to be passed to the model. """ @@ -237,7 +241,8 @@ class MultiModalFieldElem: class BaseMultiModalField(ABC): """ Defines how to interpret tensor data belonging to a keyword argument in - {class}`MultiModalKwargs` for multiple multi-modal items, and vice versa. + [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs] for multiple + multi-modal items, and vice versa. 
""" def _field_factory(self, *, modality: str, key: str): @@ -262,10 +267,12 @@ class BaseMultiModalField(ABC): data: NestedTensors, ) -> Sequence[MultiModalFieldElem]: """ - Construct {class}`MultiModalFieldElem` instances to represent - the provided data. + Construct + [`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem] + instances to represent the provided data. - This is the inverse of {meth}`reduce_data`. + This is the inverse of + [`reduce_data`][vllm.multimodal.inputs.BaseMultiModalField.reduce_data]. """ raise NotImplementedError @@ -275,9 +282,11 @@ class BaseMultiModalField(ABC): def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors: """ - Merge the data from multiple instances of {class}`MultiModalFieldElem`. + Merge the data from multiple instances of + [`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem]. - This is the inverse of {meth}`build_elems`. + This is the inverse of + [`build_elems`][vllm.multimodal.inputs.BaseMultiModalField.build_elems]. 
""" field_types = [type(item.field) for item in elems] if len(set(field_types)) > 1: @@ -289,9 +298,8 @@ class BaseMultiModalField(ABC): @dataclass(frozen=True) class MultiModalBatchedField(BaseMultiModalField): """ - :::{seealso} - {func}`MultiModalFieldConfig.batched` - ::: + Info: + [`MultiModalFieldConfig.batched`][vllm.multimodal.inputs.MultiModalFieldConfig.batched] """ def build_elems( @@ -320,10 +328,9 @@ class MultiModalBatchedField(BaseMultiModalField): @dataclass(frozen=True) class MultiModalFlatField(BaseMultiModalField): """ - :::{seealso} - {func}`MultiModalFieldConfig.flat` - {func}`MultiModalFieldConfig.flat_from_sizes` - ::: + Info: + [`MultiModalFieldConfig.flat`][vllm.multimodal.inputs.MultiModalFieldConfig.flat] + [`MultiModalFieldConfig.flat_from_sizes`][vllm.multimodal.inputs.MultiModalFieldConfig.flat_from_sizes] """ slices: Union[Sequence[slice], Sequence[Sequence[slice]]] dim: int = 0 @@ -363,9 +370,8 @@ class MultiModalFlatField(BaseMultiModalField): @dataclass(frozen=True) class MultiModalSharedField(BaseMultiModalField): """ - :::{seealso} - {func}`MultiModalFieldConfig.shared` - ::: + Info: + [`MultiModalFieldConfig.shared`][vllm.multimodal.inputs.MultiModalFieldConfig.shared] """ batch_size: int @@ -510,9 +516,8 @@ class MultiModalFieldConfig: Element 3: [[C],[C]] ``` - :::{seealso} - {func}`MultiModalFieldConfig.flat` - ::: + Info: + [`MultiModalFieldConfig.flat`][vllm.multimodal.inputs.MultiModalFieldConfig.flat] """ if size_per_item.ndim != 1: @@ -576,8 +581,10 @@ class MultiModalFieldConfig: class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): """ - A collection of {class}`MultiModalFieldElem` - corresponding to a data item in {class}`MultiModalDataItems`. + A collection of + [`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem] + corresponding to a data item in + [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems]. 
""" @staticmethod @@ -596,11 +603,13 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): class MultiModalKwargs(UserDict[str, NestedTensors]): """ A dictionary that represents the keyword arguments to - {meth}`~torch.nn.Module.forward`. + [`torch.nn.Module.forward`][]. The metadata `items` enables us to obtain the keyword arguments - corresponding to each data item in {class}`MultiModalDataItems`, via - {meth}`get_item` and {meth}`get_items`. + corresponding to each data item in + [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems], via + [`get_item`][vllm.multimodal.inputs.MultiModalKwargs.get_item] and + [`get_items`][vllm.multimodal.inputs.MultiModalKwargs.get_items]. """ @staticmethod @@ -639,7 +648,9 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): @staticmethod def from_items(items: Sequence[MultiModalKwargsItem]): - """Construct a new {class}`MultiModalKwargs` from multiple items.""" + """Construct a new + [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs] + from multiple items.""" elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list) for item in items: for key, elem in item.items(): @@ -735,11 +746,17 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): batched_inputs: BatchedTensorInputs, *, device: torch.types.Device, + dtype: Optional[torch.dtype] = None, ) -> BatchedTensorInputs: json_inputs = cast(JSONTree[torch.Tensor], batched_inputs) + def maybe_cast_dtype(x: torch.Tensor): + # This mimics the behavior of transformers.BatchFeature + return x.to(dtype=dtype) if x.is_floating_point() else x + json_mapped = json_map_leaves( - lambda x: x.to(device, non_blocking=True), + # NOTE: Cast the dtype before sending it to device + lambda x: maybe_cast_dtype(x).to(device=device, non_blocking=True), json_inputs, ) @@ -804,7 +821,7 @@ A dictionary containing placeholder ranges for each modality. 
class MultiModalInputs(TypedDict): """ Represents the outputs of - {class}`vllm.multimodal.processing.BaseMultiModalProcessor`, + [`BaseMultiModalProcessor`][vllm.multimodal.processing.BaseMultiModalProcessor], ready to be passed to vLLM internals. """ @@ -840,7 +857,8 @@ class MultiModalInputs(TypedDict): class MultiModalEncDecInputs(MultiModalInputs): """ - Represents the outputs of {class}`vllm.multimodal.EncDecMultiModalProcessor` + Represents the outputs of + [`EncDecMultiModalProcessor`][vllm.multimodal.processing.EncDecMultiModalProcessor] ready to be passed to vLLM internals. """ diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 6e9ec95558020..63af842747a54 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -28,7 +28,8 @@ else: class ModalityDataItems(ABC, Generic[_T, _I]): """ - Represents data items for a modality in {class}`MultiModalDataItems`. + Represents data items for a modality in + [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems]. """ def __init__(self, data: _T, modality: str) -> None: @@ -251,15 +252,15 @@ _D = TypeVar("_D", bound=ModalityDataItems[Any, Any]) class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]): """ - As {data}`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized - such that each entry corresponds to a list. + As [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict], but + normalized such that each entry corresponds to a list. """ def get_count(self, modality: str, *, strict: bool = True) -> int: """ Get the number of data items belonging to a modality. - If `strict=False`, return `0` instead of raising {exc}`KeyError` + If `strict=False`, return `0` instead of raising [`KeyError`][] even if the modality is not found. 
""" if modality not in self: @@ -305,8 +306,8 @@ ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]], class MultiModalDataParser: """ - Parses {data}`~vllm.multimodal.inputs.MultiModalDataDict` into - {class}`MultiModalDataItems`. + Parses [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict] + into [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems]. Args: target_sr (float, optional): Enables automatic resampling of audio diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 320a26f375557..aa7914e40cbff 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 import json -import re import sys from abc import ABC, abstractmethod from collections import defaultdict @@ -12,6 +11,7 @@ from functools import lru_cache from typing import (TYPE_CHECKING, Generic, NamedTuple, Optional, Protocol, TypeVar, Union, cast) +import regex as re import torch from typing_extensions import assert_never @@ -114,13 +114,14 @@ class PromptUpdateDetails(Generic[_S]): is_embed: Optional[Callable[["_BoundPromptSequence"], torch.Tensor]] = None """ - Given {attr}`full`, return a boolean mask of shape `(len(full),)` - indicating which positions of `full` to assign embeddings to. + Given [`full`][vllm.multimodal.processing.PromptUpdateDetails.full], + return a boolean mask of shape `(len(full),)` indicating which positions + of `full` to assign embeddings to. `None` (default) means to assign embeddings to all positions of `full`. The embeddings are obtained by calling - {class}`SupportsMultiModal.get_multimodal_embeddings`. + [`SupportsMultiModal.get_multimodal_embeddings`][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings]. """ @staticmethod @@ -159,13 +160,15 @@ PromptUpdateInfo = Union[PromptSeq, PromptUpdateDetails] The token sequence or text that are part of the update. 
If only part of the content corresponds to feature placeholders, you can -use {class}`PromptUpdateDetails` to specify which part. +use [`PromptUpdateDetails`][vllm.multimodal.processing.PromptUpdateDetails] to +specify which part. """ PromptUpdateContent = Union[Callable[[int], PromptUpdateInfo], PromptUpdateInfo] """ -Given the index of the processed item within {attr}`modality`, +Given the index of the processed item within +[`modality`][vllm.multimodal.processing.PromptUpdate.modality], output the corresponding token sequence (or text). For convenience, you can directly pass in the token sequence (or text) @@ -260,8 +263,10 @@ class PromptInsertion(PromptUpdate): insertion: PromptUpdateContent = field(repr=False) """ - Given the index of the processed item within {attr}`modality`, - output the token sequence (or text) to insert right after {attr}`target`. + Given the index of the processed item within + [`modality`][vllm.multimodal.processing.PromptUpdate.modality], + output the token sequence (or text) to insert right after + [`target`][vllm.multimodal.processing.PromptUpdate.target]. For convenience, you can directly pass in the token sequence (or text) instead of a function if it does not depend on the input. @@ -332,8 +337,10 @@ class PromptReplacement(PromptUpdate): replacement: PromptUpdateContent = field(repr=False) """ - Given the index of the processed item within {attr}`modality`, - output the token sequence (or text) to replace {attr}`target`. + Given the index of the processed item within + [`modality`][vllm.multimodal.processing.PromptUpdate.modality], + output the token sequence (or text) to replace + [`target`][vllm.multimodal.processing.PromptUpdate.target]. For convenience, you can directly pass in the token sequence (or text) instead of a function if it does not depend on the input. 
@@ -387,14 +394,16 @@ _M = TypeVar("_M", bound=Union[_HasModalityAttr, _HasModalityProp]) def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]: - """Convenience function to apply {func}`full_groupby` based on modality.""" + """Convenience function to apply [`full_groupby`][vllm.utils.full_groupby] + based on modality.""" return full_groupby(values, key=lambda x: x.modality) @dataclass class _BoundPromptSequence: """ - A {data}`_PromptSeq` bound to a tokenizer to automatically + A [`_PromptSeq`][vllm.multimodal.processing.PromptSeq] bound + to a tokenizer to automatically convert between token sequence and text representations. """ tokenizer: AnyTokenizer = field(repr=False) @@ -446,9 +455,11 @@ class _BoundPromptContent: @dataclass class BoundPromptUpdate: """ - A {class}`PromptUpdate` bound to a tokenizer to automatically convert - {attr}`target` and the result of {meth}`get_content` between - token sequence and text representations. + A [`PromptUpdate`][vllm.multimodal.processing.PromptUpdate] bound + to a tokenizer to automatically convert + [`target`][vllm.multimodal.processing.PromptUpdate.target] and the result of + [`get_content`][vllm.multimodal.processing.BoundPromptUpdate.get_content] + between token sequence and text representations. """ _origin: PromptUpdate tokenizer: AnyTokenizer = field(repr=False) @@ -482,7 +493,8 @@ class BoundPromptUpdate: def get_content(self, item_idx: int) -> _BoundPromptContent: """ - Given the index of the processed item within {attr}`modality`, + Given the index of the processed item within + [`modality`][vllm.multimodal.processing.PromptUpdate.modality], output the token sequence (or text) to update. """ content = self.content @@ -1019,7 +1031,8 @@ class ProcessingCache: ) -> None: """ Put a processed multi-modal item into the cache - according to its dependencies (see {meth}`get`). + according to its dependencies + (see [`get`][vllm.multimodal.processing.ProcessingCache.get]). 
""" cache_key = MultiModalHasher.hash_kwargs(model_id=model_id, **{modality: input_item}, @@ -1091,7 +1104,8 @@ _I = TypeVar("_I", bound=BaseProcessingInfo) MultiModalHashes = dict[str, list[str]] """ -A collection of hashes with a similar structure as {class}`MultiModalKwargs`. +A collection of hashes with a similar structure as +[`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs]. """ @@ -1099,7 +1113,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): """ Abstract base class to process multi-modal inputs to be used in vLLM. - Not to be confused with {class}`transformers.ProcessorMixin`. + Not to be confused with `transformers.ProcessorMixin`. """ def __init__(self, @@ -1126,10 +1140,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): def _get_data_parser(self) -> MultiModalDataParser: """ Construct a parser to preprocess multi-modal data items - before passing them to {meth}`_get_hf_mm_data`. + before passing them to + [`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data]. You can support additional modalities by creating a subclass - of {class}`MultiModalDataParser` that has additional subparsers. + of [`MultiModalDataParser`][vllm.multimodal.parse.MultiModalDataParser] + that has additional subparsers. """ return MultiModalDataParser() @@ -1138,8 +1154,11 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_data: MultiModalDataDict, ) -> MultiModalDataItems: """ - Normalize {class}`MultiModalDataDict` to {class}`MultiModalDataItems` - before passing them to {meth}`_get_hf_mm_data`. + Normalize + [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict] + to [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems] + before passing them to + [`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data]. 
""" mm_items = self.data_parser.parse_mm_data(mm_data) supported_mm_limits = self.info.get_supported_mm_limits() @@ -1191,7 +1210,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): inputs. Moreover, this information is critical to determine the token positions - in order to construct {class}`~vllm-multimodal.input.PlaceholderRange` + in order to construct + [`PlaceholderRange`][vllm.multimodal.inputs.PlaceholderRange] for each multi-modal item. """ raise NotImplementedError @@ -1315,7 +1335,9 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): Most HF processors accept prompt text but not prompt tokens. If the HF processor adds or removes tokens that are not related to multi-modal data, you should override this method so it is consistent - with the output of {meth}`_apply_hf_processor_text_only` on the + with the output of + [`_apply_hf_processor_text_only`][vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_text_only] + on the corresponding text. """ return prompt_tokens @@ -1330,7 +1352,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): Since HF processor requires that text and multi-modal items correspond to each other, we generate dummy text using - {class}`DummyInputsBuilder` to go along with the multi-modal data. + [`DummyInputsBuilder`][vllm.multimodal.profiling.BaseDummyInputsBuilder] + to go along with the multi-modal data. 
""" mm_counts = mm_items.get_all_counts() diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index b5875124c1266..53f5b243d4967 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -1,9 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 - -from abc import ABC +from abc import ABC, abstractmethod from collections.abc import Mapping from dataclasses import dataclass, field -from typing import Generic, NamedTuple, Optional, TypeVar, cast +from typing import Generic, NamedTuple, Optional, TypeVar, Union, cast import numpy as np import numpy.typing as npt @@ -25,9 +24,9 @@ logger = init_logger(__name__) class ProcessorInputs: """ Represents the keyword arguments to - {meth}`vllm.multimodal.processing.BaseMultiModalProcessor.apply`. + [`vllm.multimodal.processing.BaseMultiModalProcessor.apply`][]. """ - prompt_text: str + prompt: Union[str, list[int]] mm_data: MultiModalDataDict hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict) @@ -60,24 +59,14 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]): self.info = info - # TODO: @abstractmethod after transition + @abstractmethod def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: """ Build the text input corresponding to `mm_counts`. """ - if (type(self).get_dummy_processor_inputs == - BaseDummyInputsBuilder.get_dummy_processor_inputs): - raise NotImplementedError + raise NotImplementedError - logger.warning_once("`get_dummy_processor_inputs` has been split up " - "into `get_dummy_text` and `get_dummy_mm_data`. 
" - "These two methods will be marked as abstract " - "in an upcoming release.") - - seq_len = self.info.ctx.model_config.max_model_len - return self.get_dummy_processor_inputs(seq_len, mm_counts).prompt_text - - # TODO: @abstractmethod after transition + @abstractmethod def get_dummy_mm_data( self, seq_len: int, @@ -101,7 +90,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]): dummy_text = self.get_dummy_text(mm_counts) dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts) - return ProcessorInputs(prompt_text=dummy_text, mm_data=dummy_mm_data) + return ProcessorInputs(prompt=dummy_text, mm_data=dummy_mm_data) def _get_dummy_audios( self, @@ -177,7 +166,7 @@ class MultiModalProfiler(Generic[_I]): seq_len, mm_counts) return self.processor.apply( - prompt=processor_inputs.prompt_text, + prompt=processor_inputs.prompt, mm_data=processor_inputs.mm_data, hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, ) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 67d0d7fc11834..b9f5cee922a70 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -29,7 +29,11 @@ _I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True) class ProcessingInfoFactory(Protocol[_I_co]): - """Constructs a {class}`MultiModalProcessor` instance from the context.""" + """ + Constructs a + [`BaseMultiModalProcessor`][vllm.multimodal.processing.BaseMultiModalProcessor] + instance from the context. + """ def __call__( self, @@ -40,7 +44,9 @@ class ProcessingInfoFactory(Protocol[_I_co]): class DummyInputsBuilderFactory(Protocol[_I]): """ - Constructs a {class}`BaseDummyInputsBuilder` instance from the context. + Constructs a + [`BaseDummyInputsBuilder`][vllm.multimodal.profiling.BaseDummyInputsBuilder] + instance from the context. 
""" def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]: @@ -48,7 +54,11 @@ class DummyInputsBuilderFactory(Protocol[_I]): class MultiModalProcessorFactory(Protocol[_I]): - """Constructs a {class}`MultiModalProcessor` instance from the context.""" + """ + Constructs a + [`BaseMultiModalProcessor`][vllm.multimodal.processing.BaseMultiModalProcessor] + instance from the context. + """ def __call__( self, @@ -155,8 +165,6 @@ class MultiModalRegistry: """ Get the maximum number of tokens from each modality for profiling the memory usage of a model. - - See {meth}`MultiModalPlugin.get_max_multimodal_tokens` for more details. """ mm_limits = self.get_mm_limits_per_prompt(model_config) @@ -170,8 +178,6 @@ class MultiModalRegistry: """ Get the maximum number of multi-modal tokens for profiling the memory usage of a model. - - See {meth}`MultiModalPlugin.get_max_multimodal_tokens` for more details. """ return sum(self.get_max_tokens_by_modality(model_config).values()) @@ -213,10 +219,6 @@ class MultiModalRegistry: When the model receives multi-modal data, the provided function is invoked to transform the data into a dictionary of model inputs. - - :::{seealso} - {ref}`mm-processing` - ::: """ def wrapper(model_cls: N) -> N: @@ -259,10 +261,6 @@ class MultiModalRegistry: ) -> BaseMultiModalProcessor[BaseProcessingInfo]: """ Create a multi-modal processor for a specific model and tokenizer. 
- - :::{seealso} - {ref}`mm-processing` - ::: """ if not model_config.is_multimodal_model: raise ValueError(f"{model_config.model} is not a multimodal model") diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index aef5f669ac689..1d838f66f1dec 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -12,6 +12,9 @@ from PIL import Image import vllm.envs as envs from vllm.connections import HTTPConnection, global_http_connection +from vllm.distributed import (get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_gather) from .audio import AudioMediaIO from .base import MediaIO @@ -259,7 +262,8 @@ class MediaConnector: global_media_connector = MediaConnector() -"""The global {class}`MediaConnector` instance used by vLLM.""" +"""The global [`MediaConnector`][vllm.multimodal.utils.MediaConnector] +instance used by vLLM.""" fetch_audio = global_media_connector.fetch_audio fetch_image = global_media_connector.fetch_image @@ -389,3 +393,35 @@ def group_mm_inputs_by_modality( return [ list(group) for _, group in groupby(mm_inputs, key=modality_group_func) ] + + +def run_dp_sharded_vision_model(image_input: torch.Tensor, + vision_model: torch.nn.Module) -> torch.Tensor: + """Run a vision model with data parallelism (DP) sharding. The function + will shard the input image tensor on the first dimension and run the vision + model + + Args: + image_input (torch.Tensor): Image input tensor. + vision_model (torch.nn.Module): Vision model. 
+ + Returns: + torch.Tensor: Output image embeddings + """ + + num_chunks = image_input.shape[0] + mp_world_size = get_tensor_model_parallel_world_size() + num_chunks_per_rank = (num_chunks + mp_world_size - 1) // mp_world_size + num_padded_chunks = num_chunks_per_rank * mp_world_size - num_chunks + pad = (0, ) * (2 * (image_input.dim() - 1)) + (0, num_padded_chunks) + image_input_padded = torch.nn.functional.pad(image_input, pad) + rank = get_tensor_model_parallel_rank() + image_input_per_rank = image_input_padded[rank * + num_chunks_per_rank:(rank + 1) * + num_chunks_per_rank, ...] + + vision_embeddings = vision_model(image_input_per_rank) + vision_embeddings = tensor_model_parallel_all_gather(vision_embeddings, + dim=0) + vision_embeddings = vision_embeddings[:num_chunks, ...] + return vision_embeddings diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 3685fd4c34580..261d56abad9c6 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -164,7 +164,7 @@ class VideoMediaIO(MediaIO[npt.NDArray]): ) return np.stack([ - np.array(load_frame(frame_data)) + np.asarray(load_frame(frame_data)) for frame_data in data.split(",") ]) diff --git a/vllm/outputs.py b/vllm/outputs.py index 6cd60575b00df..3960388bf73c6 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -7,14 +7,17 @@ from dataclasses import dataclass from typing import Any, Generic, Optional, Union import torch -from typing_extensions import TypeVar, deprecated +from typing_extensions import TypeVar +from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import MultiModalPlaceholderDict from vllm.sampling_params import RequestOutputKind from vllm.sequence import (PromptLogprobs, RequestMetrics, SampleLogprobs, SequenceGroup, SequenceGroupBase, SequenceStatus) +logger = init_logger(__name__) + @dataclass class CompletionOutput: @@ -73,14 +76,6 @@ class PoolingOutput: return (isinstance(other, self.__class__) and bool( 
(self.data == other.data).all())) - @property - @deprecated("`LLM.encode()` now stores raw outputs in the `data` " - "attribute. To return embeddings, use `LLM.embed()`. " - "To return class probabilities, use `LLM.classify()` " - "and access the `probs` attribute. ") - def embedding(self) -> list[float]: - return self.data.tolist() - class RequestOutput: """The output data of a completion request to the LLM. @@ -122,7 +117,13 @@ class RequestOutput: *, multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None, kv_transfer_params: Optional[dict[str, Any]] = None, + # Forward compatibility, code that uses args added in new release can + # still run with older versions of vLLM without breaking. + **kwargs: Any, ) -> None: + if kwargs: + logger.warning_once("RequestOutput: Ignoring extra arguments: %s", + str(kwargs)) self.request_id = request_id self.prompt = prompt self.prompt_token_ids = prompt_token_ids @@ -382,15 +383,6 @@ class PoolingRequestOutput(Generic[_O]): prompt_token_ids, finished) def __repr__(self): - """ - Returns a string representation of an PoolingRequestOutput instance. - - The representation includes the request_id and the number of outputs, - providing a quick overview of the pooling request's results. - - Returns: - str: A string representation of the PoolingRequestOutput instance. - """ return (f"{type(self).__name__}(request_id={self.request_id!r}, " f"outputs={self.outputs!r}, " f"prompt_token_ids={self.prompt_token_ids}, " @@ -506,12 +498,6 @@ class ScoringOutput: def __repr__(self) -> str: return f"ScoringOutput(score={self.score})" - @property - @deprecated("`LLM.score()` now returns scalar scores. " - "Please access it via the `score` attribute. 
") - def embedding(self) -> list[float]: - return [self.score] - class ScoringRequestOutput(PoolingRequestOutput[ScoringOutput]): diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index b1df4fd1339b1..00d00d05f47ae 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -42,7 +42,6 @@ def tpu_platform_plugin() -> Optional[str]: logger.debug("Confirmed TPU platform is available.") except Exception as e: logger.debug("TPU platform is not available because: %s", str(e)) - pass return "vllm.platforms.tpu.TpuPlatform" if is_tpu else None @@ -112,7 +111,6 @@ def rocm_platform_plugin() -> Optional[str]: amdsmi.amdsmi_shut_down() except Exception as e: logger.debug("ROCm platform is not available because: %s", str(e)) - pass return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None @@ -130,7 +128,6 @@ def hpu_platform_plugin() -> Optional[str]: "habana_frameworks is not found.") except Exception as e: logger.debug("HPU platform is not available because: %s", str(e)) - pass return "vllm.platforms.hpu.HpuPlatform" if is_hpu else None @@ -148,7 +145,6 @@ def xpu_platform_plugin() -> Optional[str]: logger.debug("Confirmed XPU platform is available.") except Exception as e: logger.debug("XPU platform is not available because: %s", str(e)) - pass return "vllm.platforms.xpu.XPUPlatform" if is_xpu else None @@ -170,7 +166,6 @@ def cpu_platform_plugin() -> Optional[str]: except Exception as e: logger.debug("CPU platform is not available because: %s", str(e)) - pass return "vllm.platforms.cpu.CpuPlatform" if is_cpu else None diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 2d48af397636c..eaffaac78cce9 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -9,6 +9,7 @@ import psutil import torch from vllm.logger import init_logger +from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS from .interface import CpuArchEnum, Platform, PlatformEnum, _Backend @@ -27,7 +28,7 @@ class CpuPlatform(Platform): dispatch_key: 
str = "CPU" @property - def supported_dtypes(self) -> list: + def supported_dtypes(self) -> list[torch.dtype]: if self.get_cpu_architecture() == CpuArchEnum.POWERPC: return [torch.bfloat16, torch.float32] elif sys.platform.startswith( @@ -74,7 +75,7 @@ class CpuPlatform(Platform): import vllm.envs as envs from vllm.utils import GiB_bytes model_config = vllm_config.model_config - # Reminder: Please update docs/source/features/compatibility_matrix.md + # Reminder: Please update docs/features/compatibility_matrix.md # If the feature combo become valid if not model_config.enforce_eager: model_config.enforce_eager = True @@ -177,6 +178,16 @@ class CpuPlatform(Platform): " set VLLM_WORKER_MULTIPROC_METHOD to fork explicitly.") os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' + if vllm_config.model_config and vllm_config.model_config.use_mla: + logger.info( + "MLA is enabled on a non-GPU platform; forcing chunked " + "prefill and prefix caching to be disabled.") + vllm_config.scheduler_config.enable_chunked_prefill = False + vllm_config.scheduler_config.chunked_prefill_enabled = False + vllm_config.scheduler_config.max_num_batched_tokens = max( + vllm_config.scheduler_config.max_model_len, + DEFAULT_MAX_NUM_BATCHED_TOKENS) + @classmethod def is_pin_memory_available(cls) -> bool: logger.warning("Pin memory is not supported on CPU.") diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index bdee8b2f821d4..9f833cbb587d8 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -4,10 +4,13 @@ pynvml. However, it should not initialize cuda context. 
""" import os +from datetime import timedelta from functools import wraps from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union import torch +from torch.distributed import PrefixStore, ProcessGroup +from torch.distributed.distributed_c10d import is_nccl_available from typing_extensions import ParamSpec # import custom ops, trigger op registration @@ -103,7 +106,6 @@ class CudaPlatformBase(Platform): def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: parallel_config = vllm_config.parallel_config scheduler_config = vllm_config.scheduler_config - compilation_config = vllm_config.compilation_config model_config = vllm_config.model_config if parallel_config.worker_cls == "auto": @@ -151,15 +153,6 @@ class CudaPlatformBase(Platform): logger.info( "Forcing kv cache block size to 64 for FlashMLA backend.") - if (parallel_config.data_parallel_size > 1 - and compilation_config.use_cudagraph): - logger.info( - "Data Parallel: Forcing enforce eager to be True since DP is " - "currently not supported with CUDA Graphs.") - vllm_config.model_config.enforce_eager = True - compilation_config.use_cudagraph = False - compilation_config.use_inductor = False - @classmethod def get_current_memory_usage(cls, device: Optional[torch.types.Device] = None @@ -311,6 +304,40 @@ class CudaPlatformBase(Platform): def use_custom_allreduce(cls) -> bool: return True + @classmethod + def get_piecewise_backend_cls(cls) -> str: + return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend" # noqa + + @classmethod + def stateless_init_device_torch_dist_pg( + cls, + backend: str, + prefix_store: PrefixStore, + group_rank: int, + group_size: int, + timeout: timedelta, + ) -> ProcessGroup: + assert is_nccl_available() + pg: ProcessGroup = ProcessGroup( + prefix_store, + group_rank, + group_size, + ) + from torch.distributed.distributed_c10d import ProcessGroupNCCL + + backend_options = ProcessGroupNCCL.Options() + backend_options._timeout = timeout + + 
backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size, + backend_options) + backend_type = ProcessGroup.BackendType.NCCL + device = torch.device("cuda") + pg._set_default_backend(backend_type) + backend_class._set_sequence_number_for_group() + + pg._register_backend(device, backend_type, backend_class) + return pg + # NVML utils # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 456b054b2b43a..a8dd7df9f2e3e 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -7,6 +7,7 @@ import torch from vllm import envs from vllm.logger import init_logger +from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS from .interface import Platform, PlatformEnum, _Backend @@ -38,8 +39,8 @@ class HpuPlatform(Platform): def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: return True - @staticmethod - def inference_mode(): + @classmethod + def inference_mode(cls): return torch.no_grad() @classmethod @@ -80,6 +81,16 @@ class HpuPlatform(Platform): "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.") os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + if vllm_config.model_config and vllm_config.model_config.use_mla: + logger.info( + "MLA is enabled on a non-GPU platform; forcing chunked " + "prefill and prefix caching to be disabled.") + vllm_config.scheduler_config.enable_chunked_prefill = False + vllm_config.scheduler_config.chunked_prefill_enabled = False + vllm_config.scheduler_config.max_num_batched_tokens = max( + vllm_config.scheduler_config.max_model_len, + DEFAULT_MAX_NUM_BATCHED_TOKENS) + @classmethod def is_pin_memory_available(cls): logger.warning("Pin memory is not supported on HPU.") diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index b09e31e9ed46c..5c4f7a2f7dc76 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -3,11 +3,13 @@ import enum import os import platform import random +from datetime import 
timedelta from platform import uname from typing import TYPE_CHECKING, NamedTuple, Optional, Union import numpy as np import torch +from torch.distributed import PrefixStore, ProcessGroup from vllm.inputs import ProcessorInputs, PromptType from vllm.logger import init_logger @@ -84,7 +86,7 @@ class DeviceCapability(NamedTuple): def to_int(self) -> int: """ - Express device capability as an integer ``<major><minor>``. + Express device capability as an integer `<major><minor>`. It is assumed that the minor version is always a single digit. """ @@ -157,7 +159,7 @@ class Platform: return self._enum == PlatformEnum.OOT def is_cuda_alike(self) -> bool: - """Stateless version of {func}`torch.cuda.is_available`.""" + """Stateless version of [torch.cuda.is_available][].""" return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM) def is_sleep_mode_available(self) -> bool: @@ -194,7 +196,7 @@ class Platform: cls, device_id: int = 0, ) -> Optional[DeviceCapability]: - """Stateless version of {func}`torch.cuda.get_device_capability`.""" + """Stateless version of [torch.cuda.get_device_capability][].""" return None @classmethod @@ -206,10 +208,11 @@ class Platform: """ Test whether this platform is compatible with a device capability. - The ``capability`` argument can either be: + The `capability` argument can either be: - - A tuple ``(major, minor)``. - - An integer ``<major><minor>``. (See {meth}`DeviceCapability.to_int`) + - A tuple `(major, minor)`. + - An integer `<major><minor>`. (See + [`DeviceCapability.to_int`][vllm.platforms.interface.DeviceCapability.to_int]) """ current_capability = cls.get_device_capability(device_id=device_id) if current_capability is None: @@ -478,6 +481,27 @@ class Platform: """ raise NotImplementedError + @classmethod + def get_piecewise_backend_cls(cls) -> str: + """ + Get piecewise backend class for piecewise graph. 
+ """ + return "vllm.compilation.base_piecewise_backend.AbstractPiecewiseBackend" # noqa + + @classmethod + def stateless_init_device_torch_dist_pg( + cls, + backend: str, + prefix_store: PrefixStore, + group_rank: int, + group_size: int, + timeout: timedelta, + ) -> ProcessGroup: + """ + Init platform-specific torch distributed process group. + """ + raise RuntimeError(f"Unsupported torch distributed backend: {backend}") + class UnspecifiedPlatform(Platform): _enum = PlatformEnum.UNSPECIFIED diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index e08337b8391d3..56f204e71da17 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Optional from vllm import envs from vllm.logger import init_logger +from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS from .interface import Platform, PlatformEnum @@ -27,7 +28,7 @@ class NeuronPlatform(Platform): device_name: str = "neuron" device_type: str = "neuron" ray_device_key: str = "neuron_cores" - supported_quantization: list[str] = ["neuron_quant"] + supported_quantization: list[str] = ["neuron_quant", "fbgemm_fp8"] device_control_env_var: str = "NEURON_RT_VISIBLE_CORES" @classmethod @@ -48,14 +49,21 @@ class NeuronPlatform(Platform): if parallel_config.world_size > 1: parallel_config.distributed_executor_backend = "uni" - assert (vllm_config.lora_config - is None), "LoRA is not supported for Neuron backend." 
- if vllm_config.cache_config and vllm_config.model_config: # neuron needs block_size = max_model_len vllm_config.cache_config.block_size = \ vllm_config.model_config.max_model_len # type: ignore + if vllm_config.model_config and vllm_config.model_config.use_mla: + logger.info( + "MLA is enabled on a non-GPU platform; forcing chunked " + "prefill and prefix caching to be disabled.") + vllm_config.scheduler_config.enable_chunked_prefill = False + vllm_config.scheduler_config.chunked_prefill_enabled = False + vllm_config.scheduler_config.max_num_batched_tokens = max( + vllm_config.scheduler_config.max_model_len, + DEFAULT_MAX_NUM_BATCHED_TOKENS) + @classmethod def is_pin_memory_available(cls) -> bool: logger.warning("Pin memory is not supported on Neuron.") diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index c8b86087578db..ef1c632a53989 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -1,10 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 import os +from datetime import timedelta from functools import cache, lru_cache, wraps from typing import TYPE_CHECKING, Optional import torch +from torch.distributed import PrefixStore, ProcessGroup +from torch.distributed.distributed_c10d import is_nccl_available import vllm.envs as envs from vllm.logger import init_logger @@ -96,32 +99,60 @@ def with_amdsmi_context(fn): @cache -def on_mi250_mi300() -> bool: +def on_gfx1x() -> bool: GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName - return any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942"]) + return any(arch in GPU_ARCH for arch in ["gfx11", "gfx12"]) @cache -def use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int, - block_size: int, gqa_ratio: int, - max_seq_len: int, - sliding_window: int) -> bool: +def on_mi3xx() -> bool: + GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName + return any(arch in GPU_ARCH for arch in ["gfx942", "gfx950"]) + + +@cache +def on_gfx9() -> bool: + GPU_ARCH = 
torch.cuda.get_device_properties("cuda").gcnArchName + return any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942", "gfx950"]) + + +@cache +def use_rocm_custom_paged_attention( + qtype: torch.dtype, + head_size: int, + block_size: int, + gqa_ratio: int, + max_seq_len: int, + sliding_window: int, + kv_cache_dtype: str, + alibi_slopes: Optional[torch.Tensor] = None) -> bool: GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName ON_GFX9 = any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942", "gfx950"]) + ON_GFX11_GFX12 = any(arch in GPU_ARCH for arch in ["gfx11", "gfx12"]) - # rocm custom page attention not support on gfx1* # custom paged attn always supported on V0. On V1, requires sliding window # disabled due to observed numerical discrepancy. - return (ON_GFX9 and (not envs.VLLM_USE_V1 or sliding_window == 0 - or sliding_window == (-1, -1)) - and (qtype == torch.half or qtype == torch.bfloat16) - and (head_size == 64 or head_size == 128) - and (block_size == 16 or block_size == 32) - and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768 - and (envs.VLLM_ROCM_CUSTOM_PAGED_ATTN) - and not (envs.VLLM_ROCM_USE_AITER_PAGED_ATTN - and envs.VLLM_ROCM_USE_AITER)) + if ON_GFX9: + return ((not envs.VLLM_USE_V1 or sliding_window == 0 + or sliding_window == (-1, -1)) + and (qtype == torch.half or qtype == torch.bfloat16) + and (head_size == 64 or head_size == 128) + and (block_size == 16 or block_size == 32) + and (gqa_ratio >= 1 and gqa_ratio <= 16) + and max_seq_len <= 32768 and (envs.VLLM_ROCM_CUSTOM_PAGED_ATTN) + and not (envs.VLLM_ROCM_USE_AITER_PAGED_ATTN + and envs.VLLM_ROCM_USE_AITER)) + + else: + return (ON_GFX11_GFX12 and (not envs.VLLM_USE_V1 or sliding_window == 0 + or sliding_window == (-1, -1)) + and (qtype == torch.half or qtype == torch.bfloat16) + and head_size == 128 and block_size == 16 + and (gqa_ratio >= 3 and gqa_ratio <= 16) + and max_seq_len <= 32768 and alibi_slopes is None + and kv_cache_dtype == "auto" + and 
envs.VLLM_ROCM_CUSTOM_PAGED_ATTN) class RocmPlatform(Platform): @@ -178,8 +209,9 @@ class RocmPlatform(Platform): f" The selected backend, {selected_backend.name}," f"is not MLA type while requested for MLA backend.") - selected_backend = (_Backend.ROCM_FLASH if selected_backend - == _Backend.FLASH_ATTN else selected_backend) + if selected_backend is None or selected_backend == _Backend.FLASH_ATTN: + selected_backend = _Backend.ROCM_FLASH + if envs.VLLM_USE_V1: logger.info("Using Triton Attention backend on V1 engine.") return ("vllm.v1.attention.backends." @@ -201,9 +233,9 @@ class RocmPlatform(Platform): major, minor = torch.cuda.get_device_capability(device_id) return DeviceCapability(major=major, minor=minor) - @staticmethod + @classmethod @with_amdsmi_context - def is_fully_connected(physical_device_ids: list[int]) -> bool: + def is_fully_connected(cls, physical_device_ids: list[int]) -> bool: """ Query if the set of gpus are fully connected by xgmi (1 hop) """ @@ -362,3 +394,41 @@ class RocmPlatform(Platform): def get_cu_count(cls, device_id: int = 0) -> int: return torch.cuda.get_device_properties( device_id).multi_processor_count + + @classmethod + def is_navi(cls) -> bool: + return 'gfx1' in torch.cuda.get_device_properties(0).gcnArchName + + @classmethod + def get_piecewise_backend_cls(cls) -> str: + return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend" # noqa + + @classmethod + def stateless_init_device_torch_dist_pg( + cls, + backend: str, + prefix_store: PrefixStore, + group_rank: int, + group_size: int, + timeout: timedelta, + ) -> ProcessGroup: + assert is_nccl_available() + pg: ProcessGroup = ProcessGroup( + prefix_store, + group_rank, + group_size, + ) + from torch.distributed.distributed_c10d import ProcessGroupNCCL + + backend_options = ProcessGroupNCCL.Options() + backend_options._timeout = timeout + + backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size, + backend_options) + backend_type = 
ProcessGroup.BackendType.NCCL + device = torch.device("cuda") + pg._set_default_backend(backend_type) + backend_class._set_sequence_number_for_group() + + pg._register_backend(device, backend_type, backend_class) + return pg diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 6c573c1b3635e..0173b15697cfe 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -9,6 +9,7 @@ import vllm.envs as envs from vllm.inputs import ProcessorInputs, PromptType from vllm.logger import init_logger from vllm.sampling_params import SamplingParams, SamplingType +from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS from .interface import Platform, PlatformEnum, _Backend @@ -161,6 +162,16 @@ class TpuPlatform(Platform): "Forcing --disable_chunked_mm_input.") scheduler_config.disable_chunked_mm_input = True + if vllm_config.model_config and vllm_config.model_config.use_mla: + logger.info( + "MLA is enabled on a non-GPU platform; forcing chunked " + "prefill and prefix caching to be disabled.") + vllm_config.scheduler_config.enable_chunked_prefill = False + vllm_config.scheduler_config.chunked_prefill_enabled = False + vllm_config.scheduler_config.max_num_batched_tokens = max( + vllm_config.scheduler_config.max_model_len, + DEFAULT_MAX_NUM_BATCHED_TOKENS) + @classmethod def is_pin_memory_available(cls): logger.warning("Pin memory is not supported on TPU.") diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 225e756cd7ce8..b2a6ad5d77db6 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Optional import torch from vllm.logger import init_logger +from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS from .interface import DeviceCapability, Platform, PlatformEnum, _Backend @@ -36,15 +37,17 @@ class XPUPlatform(Platform): logger.info("Using IPEX attention backend.") return "vllm.attention.backends.ipex_attn.IpexAttnBackend" - @staticmethod + @classmethod def get_device_capability( - device_id: 
int = 0) -> Optional[DeviceCapability]: + cls, + device_id: int = 0, + ) -> Optional[DeviceCapability]: # capacity format differs from cuda's and will cause unexpected # failure, so use None directly return None - @staticmethod - def get_device_name(device_id: int = 0) -> str: + @classmethod + def get_device_name(cls, device_id: int = 0) -> str: return torch.xpu.get_device_name(device_id) @classmethod @@ -56,8 +59,8 @@ class XPUPlatform(Platform): def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: return True - @staticmethod - def inference_mode(): + @classmethod + def inference_mode(cls): return torch.no_grad() @classmethod @@ -113,6 +116,16 @@ class XPUPlatform(Platform): parallel_config.distributed_executor_backend) parallel_config.distributed_executor_backend = "ray" + if vllm_config.model_config and vllm_config.model_config.use_mla: + logger.info( + "MLA is enabled on a non-GPU platform; forcing chunked " + "prefill and prefix caching to be disabled.") + vllm_config.scheduler_config.enable_chunked_prefill = False + vllm_config.scheduler_config.chunked_prefill_enabled = False + vllm_config.scheduler_config.max_num_batched_tokens = max( + vllm_config.scheduler_config.max_model_len, + DEFAULT_MAX_NUM_BATCHED_TOKENS) + @classmethod def is_pin_memory_available(cls): logger.warning("Pin memory is not supported on XPU.") diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index d72ab2bd088c7..2884cb46fecd7 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -2,7 +2,7 @@ import logging import os -from typing import Callable +from typing import Any, Callable import torch @@ -14,7 +14,7 @@ logger = logging.getLogger(__name__) plugins_loaded = False -def load_plugins_by_group(group: str) -> dict[str, Callable]: +def load_plugins_by_group(group: str) -> dict[str, Callable[[], Any]]: import sys if sys.version_info < (3, 10): from importlib_metadata import entry_points @@ -27,23 +27,27 @@ def 
load_plugins_by_group(group: str) -> dict[str, Callable]: if len(discovered_plugins) == 0: logger.debug("No plugins for group %s found.", group) return {} + logger.info("Available plugins for group %s:", group) for plugin in discovered_plugins: - logger.info("name=%s, value=%s", plugin.name, plugin.value) + logger.info("- %s -> %s", plugin.name, plugin.value) + if allowed_plugins is None: - logger.info("all available plugins for group %s will be loaded.", - group) - logger.info("set environment variable VLLM_PLUGINS to control" - " which plugins to load.") - plugins = {} + logger.info("All plugins in this group will be loaded. " + "Set `VLLM_PLUGINS` to control which plugins to load.") + + plugins = dict[str, Callable[[], Any]]() for plugin in discovered_plugins: if allowed_plugins is None or plugin.name in allowed_plugins: + if allowed_plugins is not None: + logger.info("Loading plugin %s", plugin.name) + try: func = plugin.load() plugins[plugin.name] = func - logger.info("plugin %s loaded.", plugin.name) except Exception: logger.exception("Failed to load plugin %s", plugin.name) + return plugins diff --git a/vllm/reasoning/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py index 0dae02d33fec7..07a63e294df49 100644 --- a/vllm/reasoning/granite_reasoning_parser.py +++ b/vllm/reasoning/granite_reasoning_parser.py @@ -1,9 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 -import re from collections.abc import Sequence from typing import Optional, Union +import regex as re from transformers import PreTrainedTokenizerBase from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, diff --git a/vllm/sequence.py b/vllm/sequence.py index f5f9c56a7db23..d359f897da25e 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -27,7 +27,7 @@ VLLM_INVALID_TOKEN_ID = -1 def array_full(token_id: int, count: int): - """{class}`array` equivalent of {func}`numpy.full`.""" + """[`array`][] equivalent of [numpy.full][].""" return 
array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count @@ -192,8 +192,8 @@ class SequenceData(msgspec.Struct, def from_prompt_token_counts( *token_counts: tuple[int, int]) -> "SequenceData": """ - Construct a {class}`SequenceData` instance by concatenating - prompt token sequences. + Construct a [`SequenceData`][vllm.sequence.SequenceData] instance + by concatenating prompt token sequences. Each tuple represents one token sequence, expressed in the form `(token_id, count)`. @@ -216,8 +216,8 @@ class SequenceData(msgspec.Struct, prompt_embeds: Optional[torch.Tensor] = None, ) -> "SequenceData": """ - Construct a {class}`SequenceData` instance from prompt and output - token sequences. + Construct a [`SequenceData`][vllm.sequence.SequenceData] instance + from prompt and output token sequences. """ prompt_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE, prompt_token_ids) @@ -452,9 +452,11 @@ class SequenceData(msgspec.Struct, class Sequence: """Stores the data, status, and block information of a sequence. - The sequence is constructed from the {data}`DecoderOnlyInputs` - (for decoder-only) or {data}`EncoderDecoderInputs` (for encoder-decoder) - instance passed in through the `inputs` constructor argument. + The sequence is constructed from the + [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] (for decoder-only) + or [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs] + (for encoder-decoder) instance passed in through the `inputs` + constructor argument. Args: seq_id: The ID of the sequence. 
@@ -1494,7 +1496,7 @@ class ParallelSampleSequenceGroup(SequenceGroupBase): for i in range(original_params.n): request_id_i = f"{request_id}_parallel_sample_{i}" group.seq_id_to_index[request_id_i] = i - params = copy.deepcopy(original_params) + params = original_params.clone() params.n = 1 if params.seed is not None: params.seed += i diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index a6276c5633945..991d2040a878a 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -294,8 +294,11 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase): inputs_embeds=None, positions=model_input.input_positions, intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs(multi_modal_kwargs, - device=self.device), + **MultiModalKwargs.as_kwargs( + multi_modal_kwargs, + dtype=self.model_runner.model_config.dtype, + device=self.device, + ), **model_execute_kwargs, ) diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py index 0bb8d602ec8f1..4430da26c0493 100644 --- a/vllm/spec_decode/metrics.py +++ b/vllm/spec_decode/metrics.py @@ -126,12 +126,12 @@ class AsyncMetricsCollector: """Copy rejection/typical-acceptance sampling metrics (number of accepted tokens, etc) to CPU asynchronously. - Returns a CUDA event recording when the copy is complete. + Returns a device event recording when the copy is complete. 
""" assert self._copy_stream is not None - self._copy_stream.wait_stream(torch.cuda.current_stream()) + self._copy_stream.wait_stream(current_platform.current_stream()) - with torch.cuda.stream(self._copy_stream): + with current_platform.stream(self._copy_stream): self._aggregate_num_accepted_tokens.copy_( self.spec_decode_sampler.num_accepted_tokens, non_blocking=True) @@ -142,7 +142,7 @@ class AsyncMetricsCollector: self._aggregate_num_draft_tokens = ( self.spec_decode_sampler.num_draft_tokens) - aggregate_metrics_ready = torch.cuda.Event() + aggregate_metrics_ready = current_platform.Event() aggregate_metrics_ready.record(self._copy_stream) return aggregate_metrics_ready diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 6ba5a51007b4d..252c80957305b 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -114,7 +114,7 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": return spec_decode_worker -# Reminder: Please update docs/source/features/compatibility_matrix.md +# Reminder: Please update docs/features/compatibility_matrix.md # If the feature combo become valid class SpecDecodeWorker(LoRANotSupportedWorkerBase): """Worker which implements speculative decoding. 
diff --git a/vllm/transformers_utils/__init__.py b/vllm/transformers_utils/__init__.py index b556976a51ba7..84bd7a7476564 100644 --- a/vllm/transformers_utils/__init__.py +++ b/vllm/transformers_utils/__init__.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 -from vllm.envs import VLLM_USE_MODELSCOPE +from vllm import envs -if VLLM_USE_MODELSCOPE: +if envs.VLLM_USE_MODELSCOPE: try: # Patch here, before each import happens import modelscope diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 5f45ff133855c..8774f95a2f60b 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -4,12 +4,12 @@ import enum import json import os import time -from functools import cache +from functools import cache, partial from pathlib import Path -from typing import Any, Callable, Literal, Optional, Union +from typing import Any, Callable, Literal, Optional, TypeVar, Union import huggingface_hub -from huggingface_hub import hf_hub_download +from huggingface_hub import get_safetensors_metadata, hf_hub_download from huggingface_hub import list_repo_files as hf_list_repo_files from huggingface_hub import try_to_load_from_cache from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError, @@ -24,7 +24,7 @@ from transformers.models.auto.modeling_auto import ( MODEL_FOR_CAUSAL_LM_MAPPING_NAMES) from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME -from vllm.envs import VLLM_USE_MODELSCOPE +from vllm import envs from vllm.logger import init_logger # yapf conflicts with isort for this block # yapf: disable @@ -45,13 +45,12 @@ from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config, from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import resolve_obj_by_qualname -if VLLM_USE_MODELSCOPE: +if envs.VLLM_USE_MODELSCOPE: from modelscope import AutoConfig else: from transformers import AutoConfig MISTRAL_CONFIG_NAME = "params.json" -HF_TOKEN = os.getenv('HF_TOKEN', 
None) logger = init_logger(__name__) @@ -94,10 +93,15 @@ class ConfigFormat(str, enum.Enum): MISTRAL = "mistral" -def with_retry(func: Callable[[], Any], - log_msg: str, - max_retries: int = 2, - retry_delay: int = 2): +_R = TypeVar("_R") + + +def with_retry( + func: Callable[[], _R], + log_msg: str, + max_retries: int = 2, + retry_delay: int = 2, +) -> _R: for attempt in range(max_retries): try: return func() @@ -110,6 +114,8 @@ def with_retry(func: Callable[[], Any], time.sleep(retry_delay) retry_delay *= 2 + raise AssertionError("Should not be reached") + # @cache doesn't cache exceptions @cache @@ -130,7 +136,7 @@ def list_repo_files( ] # if model is remote, use hf_hub api to list files try: - if VLLM_USE_MODELSCOPE: + if envs.VLLM_USE_MODELSCOPE: from vllm.transformers_utils.utils import ( modelscope_list_repo_files) return modelscope_list_repo_files(repo_id, @@ -185,7 +191,7 @@ def file_or_path_exists(model: Union[str, Path], config_name: str, return file_exists(str(model), config_name, revision=revision, - token=HF_TOKEN) + token=os.getenv('HF_TOKEN', None)) def patch_rope_scaling(config: PretrainedConfig) -> None: @@ -300,7 +306,10 @@ def get_config( " - For Hugging Face models: ensure the presence of a " "'config.json'.\n" " - For Mistral models: ensure the presence of a " - "'params.json'.\n").format(model=model) + "'params.json'.\n" + "3. 
For GGUF: pass the local path of the GGUF checkpoint.\n" + " Loading GGUF from a remote repo directly is not yet " + "supported.\n").format(model=model) raise ValueError(error_message) from e @@ -309,7 +318,7 @@ def get_config( model, revision=revision, code_revision=code_revision, - token=HF_TOKEN, + token=os.getenv('HF_TOKEN', None), **kwargs, ) @@ -321,7 +330,7 @@ def get_config( model, revision=revision, code_revision=code_revision, - token=HF_TOKEN, + token=os.getenv('HF_TOKEN', None), **kwargs, ) else: @@ -331,7 +340,7 @@ def get_config( trust_remote_code=trust_remote_code, revision=revision, code_revision=code_revision, - token=HF_TOKEN, + token=os.getenv('HF_TOKEN', None), **kwargs, ) except ValueError as e: @@ -349,7 +358,7 @@ def get_config( raise e elif config_format == ConfigFormat.MISTRAL: - config = load_params_config(model, revision, token=HF_TOKEN, **kwargs) + config = load_params_config(model, revision, **kwargs) else: supported_formats = [ fmt.value for fmt in ConfigFormat if fmt != ConfigFormat.AUTO @@ -558,7 +567,7 @@ def get_sentence_transformer_tokenizer_config(model: str, # If model is on HuggingfaceHub, get the repo files repo_files = list_repo_files(model, revision=revision, - token=HF_TOKEN) + token=os.getenv('HF_TOKEN', None)) except Exception: repo_files = [] @@ -765,7 +774,7 @@ def get_hf_image_processor_config( **kwargs, ) -> dict[str, Any]: # ModelScope does not provide an interface for image_processor - if VLLM_USE_MODELSCOPE: + if envs.VLLM_USE_MODELSCOPE: return dict() # Separate model folder from file path for GGUF models if check_gguf_file(model): @@ -821,13 +830,39 @@ def try_get_generation_config( def get_cross_encoder_activation_function(config: PretrainedConfig): - if (hasattr(config, "sbert_ce_default_activation_function") - and config.sbert_ce_default_activation_function is not None): + function_name: Optional[str] = None + if hasattr(config, "sentence_transformers") and "activation_fn" in \ + config.sentence_transformers: 
+ function_name = config.sentence_transformers["activation_fn"] + + elif (hasattr(config, "sbert_ce_default_activation_function") + and config.sbert_ce_default_activation_function is not None): function_name = config.sbert_ce_default_activation_function + + if function_name is not None: assert function_name.startswith("torch.nn.modules."), \ "Loading of activation functions is restricted to " \ "torch.nn.modules for security reasons" return resolve_obj_by_qualname(function_name)() else: return nn.Sigmoid() if config.num_labels == 1 else nn.Identity() + + +def try_get_safetensors_metadata( + model: str, + *, + revision: Optional[str] = None, +): + get_safetensors_metadata_partial = partial( + get_safetensors_metadata, + model, + revision=revision, + token=os.getenv('HF_TOKEN', None), + ) + + try: + return with_retry(get_safetensors_metadata_partial, + "Error retrieving safetensors") + except Exception: + return None diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py index 586d5c7f5e54b..a43e4746cb6c6 100644 --- a/vllm/transformers_utils/configs/eagle.py +++ b/vllm/transformers_utils/configs/eagle.py @@ -52,13 +52,15 @@ class EAGLEConfig(PretrainedConfig): assert self.model is not None, \ "model should not be None when method is eagle" kwargs["architectures"] = [ - f"Eagle{arch}" for arch in self.model.architectures + f"Eagle{arch}" if not arch.startswith("Eagle") \ + else arch for arch in self.model.architectures ] elif method == "eagle3": assert self.model is not None, \ "model should not be None when method is eagle3" kwargs["architectures"] = [ - f"Eagle3{arch}" for arch in self.model.architectures + f"Eagle3{arch}" if not arch.startswith("Eagle3") \ + else arch for arch in self.model.architectures ] else: raise ValueError(f"Invalid method {method}. 
\ @@ -68,7 +70,7 @@ class EAGLEConfig(PretrainedConfig): if self.model is not None: for k, v in self.model.to_dict().items(): - if not hasattr(self, k): + if k not in kwargs: setattr(self, k, v) @classmethod diff --git a/vllm/transformers_utils/processors/ovis.py b/vllm/transformers_utils/processors/ovis.py index a35d32999991d..f1c6407e1f3a3 100644 --- a/vllm/transformers_utils/processors/ovis.py +++ b/vllm/transformers_utils/processors/ovis.py @@ -33,6 +33,8 @@ from transformers.processing_utils import (ProcessingKwargs, ProcessorMixin, Unpack) from transformers.tokenization_utils_base import PreTokenizedInput, TextInput +from vllm.multimodal.image import convert_image_mode + __all__ = ['OvisProcessor'] IGNORE_ID = -100 @@ -361,8 +363,8 @@ class OvisProcessor(ProcessorMixin): # pick the partition with maximum covering_ratio and break the tie using #sub_images return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0] - if convert_to_rgb and image.mode != 'RGB': - image = image.convert('RGB') + if convert_to_rgb: + image = convert_image_mode(image, 'RGB') sides = self.get_image_size() diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index e31580ede57ba..fa7a208c48ed7 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -13,7 +13,7 @@ import huggingface_hub from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast) -from vllm.envs import VLLM_USE_MODELSCOPE +from vllm import envs from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.transformers_utils.tokenizer_base import (TokenizerBase, @@ -168,7 +168,7 @@ def get_tokenizer( ) -> AnyTokenizer: """Gets a tokenizer for the given model name via HuggingFace or ModelScope. """ - if VLLM_USE_MODELSCOPE: + if envs.VLLM_USE_MODELSCOPE: # download model from ModelScope hub, # lazy import so that modelscope is not required for normal use. # pylint: disable=C. 
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 551c2d55b4fc6..23b6f67f09df7 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -1,12 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 import os -import re from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Any, Optional, Union, cast import huggingface_hub +import regex as re from huggingface_hub import HfApi, hf_hub_download from vllm.logger import init_logger @@ -156,7 +156,11 @@ def make_mistral_chat_completion_request( # # [1]: https://github.com/mistralai/mistral-common/blob/f4a06998b75ed78bbf5aaf569590b772ea26c9f6/src/mistral_common/protocol/instruct/messages.py#L80 for message in messages: - if message.get("role") == "assistant": + # Remove reasoning_content as unsupported by Mistral + _ = message.pop("reasoning_content", None) # type: ignore + + # Convert list text content to string + if message.get("role") in ("assistant", "tool"): content = message.get("content") if isinstance(content, list): content = "\n".join(chunk.get("text") for chunk in content) diff --git a/vllm/utils.py b/vllm/utils.py index 0cd90c130d3ef..c879b38d065aa 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -19,7 +19,6 @@ import json import multiprocessing import os import pickle -import re import signal import socket import subprocess @@ -34,11 +33,12 @@ import uuid import warnings import weakref from argparse import (Action, ArgumentDefaultsHelpFormatter, ArgumentParser, - ArgumentTypeError, _ArgumentGroup) + ArgumentTypeError, RawDescriptionHelpFormatter, + _ArgumentGroup) from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task from collections import UserDict, defaultdict -from collections.abc import (AsyncGenerator, Awaitable, Generator, Hashable, - Iterable, Iterator, KeysView, Mapping) +from collections.abc import (AsyncGenerator, Awaitable, Collection, 
Generator, + Hashable, Iterable, Iterator, KeysView, Mapping) from concurrent.futures.process import ProcessPoolExecutor from dataclasses import dataclass, field from functools import cache, lru_cache, partial, wraps @@ -54,6 +54,7 @@ import cloudpickle import numpy as np import numpy.typing as npt import psutil +import regex as re import torch import torch.types import yaml @@ -77,9 +78,15 @@ if TYPE_CHECKING: logger = init_logger(__name__) +# This value is chosen to have a balance between ITL and TTFT. Note it is +# not optimized for throughput. +DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048 +POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768 +MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120 + # Exception strings for non-implemented encoder/decoder scenarios -# Reminder: Please update docs/source/features/compatibility_matrix.md +# Reminder: Please update docs/features/compatibility_matrix.md # If the feature combo become valid STR_NOT_IMPL_ENC_DEC_SWA = \ @@ -100,7 +107,7 @@ STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP = ( "currently not supported for encoder/decoder " "models.") -STR_NOT_IMPL_ENC_DEC_LORA = ("LoRA is currently not currently " +STR_NOT_IMPL_ENC_DEC_LORA = ("LoRA is not currently " "supported with encoder/decoder " "models.") @@ -752,16 +759,15 @@ def get_kv_cache_torch_dtype( model_dtype: Optional[Union[str, torch.dtype]] = None) -> torch.dtype: if isinstance(cache_dtype, str): if cache_dtype == "auto": - if isinstance(model_dtype, str): + if isinstance(model_dtype, + str) and model_dtype in STR_DTYPE_TO_TORCH_DTYPE: torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[model_dtype] elif isinstance(model_dtype, torch.dtype): torch_dtype = model_dtype else: raise ValueError(f"Invalid model dtype: {model_dtype}") - elif cache_dtype in ["half", "bfloat16", "float"]: + elif cache_dtype in STR_DTYPE_TO_TORCH_DTYPE: torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] - elif cache_dtype == "fp8": - torch_dtype = torch.uint8 else: raise ValueError(f"Invalid kv cache dtype: {cache_dtype}") elif 
isinstance(cache_dtype, torch.dtype): @@ -973,6 +979,53 @@ def get_dtype_size(dtype: torch.dtype) -> int: return torch.tensor([], dtype=dtype).element_size() +# bool = 0, int = 1, float = 2, complex = 3 +def _get_precision_level(dtype: torch.dtype) -> int: + # NOTE: Complex dtypes return `is_floating_point=False` + return ((dtype != torch.bool) + dtype.is_floating_point + + dtype.is_complex * 2) + + +def is_lossless_cast(src_dtype: torch.dtype, tgt_dtype: torch.dtype): + """ + Test whether it is lossless to cast a tensor from + `src_dtype` to `tgt_dtype`. + """ + if src_dtype == tgt_dtype: + return True + + src_level = _get_precision_level(src_dtype) + tgt_level = _get_precision_level(tgt_dtype) + + if src_level < tgt_level: + return True + if src_level > tgt_level: + return False + + # Compare integral types + if not src_dtype.is_floating_point and not src_dtype.is_complex: + src_info = torch.iinfo(src_dtype) + tgt_info = torch.iinfo(tgt_dtype) + return src_info.min >= tgt_info.min and src_info.max <= tgt_info.max + + # Compare floating-point types + src_info = torch.finfo(src_dtype) + tgt_info = torch.finfo(tgt_dtype) + return (src_info.min >= tgt_info.min and src_info.max <= tgt_info.max + and src_info.resolution >= tgt_info.resolution) + + +def common_broadcastable_dtype(dtypes: Collection[torch.dtype]): + """ + Get the common `dtype` where all of the other `dtypes` can be + cast to it without losing any information. + """ + return max( + dtypes, + key=lambda dtype: sum(is_lossless_cast(dt, dtype) for dt in dtypes), + ) + + # `collections` helpers def is_list_of( value: object, @@ -998,7 +1051,7 @@ def flatten_2d_lists(lists: Iterable[Iterable[T]]) -> list[T]: def full_groupby(values: Iterable[_V], *, key: Callable[[_V], _K]): """ - Unlike {class}`itertools.groupby`, groups are not broken by + Unlike [`itertools.groupby`][], groups are not broken by non-contiguous data. 
""" groups = defaultdict[_K, list[_V]](list) @@ -1317,7 +1370,8 @@ class StoreBoolean(Action): "Expected 'true' or 'false'.") -class SortedHelpFormatter(ArgumentDefaultsHelpFormatter): +class SortedHelpFormatter(ArgumentDefaultsHelpFormatter, + RawDescriptionHelpFormatter): """SortedHelpFormatter that sorts arguments by their option strings.""" def _split_lines(self, text, width): @@ -1871,14 +1925,6 @@ def get_cuda_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tensor: return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor) -def is_in_doc_build() -> bool: - try: - from sphinx.ext.autodoc.mock import _MockModule - return isinstance(zmq, _MockModule) - except ModuleNotFoundError: - return False - - def import_from_path(module_name: str, file_path: Union[str, os.PathLike]): """ Import a Python file according to its file path. @@ -1918,11 +1964,11 @@ class _PlaceholderBase: Disallows downstream usage of placeholder modules. We need to explicitly override each dunder method because - {meth}`__getattr__` is not called when they are accessed. + [`__getattr__`][vllm.utils._PlaceholderBase.__getattr__] + is not called when they are accessed. 
- :::{seealso} - [Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup) - ::: + Info: + [Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup) """ def __getattr__(self, key: str) -> Never: @@ -2421,6 +2467,7 @@ def make_zmq_socket( socket_type: Any, bind: Optional[bool] = None, identity: Optional[bytes] = None, + linger: Optional[int] = None, ) -> Union[zmq.Socket, zmq.asyncio.Socket]: # type: ignore[name-defined] """Make a ZMQ socket with the proper bind/connect semantics.""" @@ -2440,7 +2487,7 @@ def make_zmq_socket( buf_size = -1 # Use system default buffer size if bind is None: - bind = socket_type != zmq.PUSH + bind = socket_type not in (zmq.PUSH, zmq.SUB, zmq.XSUB) if socket_type in (zmq.PULL, zmq.DEALER, zmq.ROUTER): socket.setsockopt(zmq.RCVHWM, 0) @@ -2453,6 +2500,9 @@ def make_zmq_socket( if identity is not None: socket.setsockopt(zmq.IDENTITY, identity) + if linger is not None: + socket.setsockopt(zmq.LINGER, linger) + # Determine if the path is a TCP socket with an IPv6 address. # Enable IPv6 on the zmq socket if so. scheme, host, _ = split_zmq_path(path) @@ -2524,7 +2574,7 @@ def _maybe_force_spawn(): logger.warning( "We must use the `spawn` multiprocessing start method. " "Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. " - "See https://docs.vllm.ai/en/latest/getting_started/" + "See https://docs.vllm.ai/en/latest/usage/" "troubleshooting.html#python-multiprocessing " "for more information. 
Reason: %s", reason) os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" @@ -2789,14 +2839,17 @@ def cprofile(save_file: Optional[str] = None, enabled: bool = True): # Only relevant for models using ALiBi (e.g, MPT) def check_use_alibi(model_config: ModelConfig) -> bool: - return (getattr(model_config.hf_text_config, "alibi", False) # Falcon + cfg = model_config.hf_text_config + return (getattr(cfg, "alibi", False) # Falcon or ("BloomForCausalLM" in getattr(model_config.hf_config, "architectures", [])) # Bloom - or getattr(model_config.hf_text_config, "position_encoding_type", - "") == "alibi" # codellm_1b_alibi - or - (hasattr(model_config.hf_text_config, "attn_config") # MPT - and model_config.hf_text_config.attn_config.get("alibi", False))) + or getattr(cfg, "position_encoding_type", "") == + "alibi" # codellm_1b_alibi + or (hasattr(cfg, "attn_config") # MPT + and ((isinstance(cfg.attn_config, dict) + and cfg.attn_config.get("alibi", False)) or + (not isinstance(cfg.attn_config, dict) + and getattr(cfg.attn_config, "alibi", False))))) def sha256(input) -> int: diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index ac231db7d8b1c..9b88400a473e7 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -697,10 +697,6 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): if isinstance(attn_out, tuple): attn_out, lse = attn_out[0], attn_out[1] - # unpad if necessary - if self._pad_v: - attn_out = attn_out[..., :v.shape[-1]] - # Remain consistent with old `flash_attn_varlen_func` where there # is only one output tensor if `return_softmax_lse` is False. 
if return_softmax_lse: @@ -883,6 +879,10 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): suffix_lse=suffix_lse, ) + # unpad if necessary + if self._pad_v: + output = output[..., :v.shape[-1]] + return output.flatten(start_dim=-2) @abstractmethod diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 7ce39110ac01d..d1e823bbe3965 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -53,6 +53,8 @@ class AiterMLADecodeMetadata(MLACommonDecodeMetadata): # The number of entries in the last page of each request in # the paged kv cache, shape: [batch_size] paged_kv_last_page_len: Optional[torch.Tensor] = None + # The query indptr, shape : [num_decode + 1] + qo_indptr: Optional[torch.Tensor] = None class AiterMLAMetadata(MLACommonMetadata[AiterMLADecodeMetadata]): @@ -64,9 +66,6 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): def __init__(self, runner, kv_cache_spec: AttentionSpec, block_table: BlockTable): super().__init__(runner, kv_cache_spec, block_table) - max_model_len = self.runner.model_config.max_model_len - assert max_model_len == 32768,\ - "AITER MLA requires max_model_len=32768" assert self.kv_cache_spec.block_size == 1, "AITER MLA" \ "only supports block size 1." 
@@ -75,27 +74,33 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): seq_lens: torch.Tensor) -> tuple[torch.Tensor, ...]: page_size = self.kv_cache_spec.block_size block_table_bounds = (seq_lens + page_size - 1) // page_size + device = self.runner.device mask = (torch.arange(block_table.size(1), dtype=block_table.dtype, - device=block_table.device).unsqueeze(0) + device=device).unsqueeze(0) < block_table_bounds.unsqueeze(1)) paged_kv_indices = block_table[mask] paged_kv_indptr = torch.cat([ - torch.zeros(1, - dtype=block_table_bounds.dtype, - device=block_table_bounds.device), + torch.zeros(1, dtype=block_table_bounds.dtype, device=device), block_table_bounds.cumsum(dim=0, dtype=torch.int32) ]) paged_kv_last_page_len = seq_lens % page_size paged_kv_last_page_len = torch.where(paged_kv_last_page_len == 0, page_size, paged_kv_last_page_len) + qo_indptr = torch.arange(0, + self._num_decodes + 1, + step=1, + dtype=torch.int32, + device=device) + return ( paged_kv_indices, paged_kv_indptr, paged_kv_last_page_len, + qo_indptr, ) def _build_decode(self, block_table_tensor: torch.Tensor, @@ -105,6 +110,7 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): paged_kv_indices, paged_kv_indptr, paged_last_page_len, + qo_indptr, ) = self._get_paged_kv_tensors(block_table_tensor, seq_lens) attn_metadata = AiterMLADecodeMetadata( @@ -112,7 +118,8 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): seq_lens=seq_lens, paged_kv_indptr=paged_kv_indptr, paged_kv_indices=paged_kv_indices, - paged_kv_last_page_len=paged_last_page_len) + paged_kv_last_page_len=paged_last_page_len, + qo_indptr=qo_indptr) return attn_metadata @@ -137,7 +144,10 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]): alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap, attn_type, **mla_args) - + assert (num_heads == 16 or num_heads == 128), ( + f"Aiter MLA only supports 16 or 128 number of heads.\n" + 
f"Provided {num_heads} number of heads.\n" + "Try adjusting tensor_parallel_size value.") unsupported_features = [ alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap ] @@ -189,7 +199,18 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]): kv_buffer = kv_c_and_k_pe_cache.unsqueeze(2) + if self.num_heads == 16: + # AITER MLA decode kernel only supports + # max_seqlen_q=1 when using 16 heads. + max_seqlen_qo = 1 + else: + # AITER MLA decode Kernel handles arbitrary + # max_seqlen_q values when using 128 heads. + assert attn_metadata.prefill is not None + max_seqlen_qo = attn_metadata.prefill.max_query_len + aiter_mla_decode_fwd(q, kv_buffer, o, self.scale, + attn_metadata.decode.qo_indptr, max_seqlen_qo, attn_metadata.decode.paged_kv_indptr, attn_metadata.decode.paged_kv_indices, attn_metadata.decode.paged_kv_last_page_len) diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 4000f93984d39..a97bb85004f6f 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Any, Optional import torch from vllm import _custom_ops as ops +from vllm import envs from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) from vllm.attention.ops.chunked_prefill_paged_decode import ( @@ -126,6 +127,8 @@ class TritonAttentionImpl(AttentionImpl): "TritonAttentionImpl") self.fp8_dtype = current_platform.fp8_dtype() + self.force_prefill_decode_attn = \ + envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION def forward( self, @@ -166,9 +169,9 @@ class TritonAttentionImpl(AttentionImpl): # performance to make sure it does not introduce any overhead. 
num_queries_per_kv = query.shape[1] // key.shape[1] - use_prefill_decode_attn = (num_queries_per_kv & - (num_queries_per_kv - 1)) != 0 - + num_q_is_pow2 = (num_queries_per_kv & (num_queries_per_kv - 1)) == 0 + use_prefill_decode_attn = (self.force_prefill_decode_attn + or not num_q_is_pow2) num_actual_tokens = attn_metadata.num_actual_tokens if use_prefill_decode_attn: diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index da18ece7555a2..0f6098d2b4005 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -174,6 +174,7 @@ class KVCacheManager: num_new_tokens: int, num_new_computed_tokens: int = 0, new_computed_blocks: Optional[KVCacheBlocks] = None, + num_draft_tokens: int = 0, num_lookahead_tokens: int = 0, delay_cache_blocks: bool = False, ) -> Optional[KVCacheBlocks]: @@ -273,7 +274,7 @@ class KVCacheManager: # generated (accepted) tokens. self.single_type_manager.cache_blocks( request, self.req_to_block_hashes[request.request_id], - num_computed_tokens + num_new_tokens - len(request.spec_token_ids)) + num_computed_tokens + num_new_tokens - num_draft_tokens) return KVCacheBlocks(new_blocks) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 403b5401be75a..a41fe48818702 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -544,16 +544,17 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig, available_memory) estimated_msg = "" if estimated_max_len > 0: - estimated_msg = " Based on the available memory," - f" the estimated maximum model length is {estimated_max_len}." 
+ estimated_msg = ( + "Based on the available memory, " + f"the estimated maximum model length is {estimated_max_len}.") raise ValueError( f"To serve at least one request with the models's max seq len " f"({max_model_len}), ({needed_memory/GiB_bytes:.2f} GiB KV " f"cache is needed, which is larger than the available KV cache " - f"memory ({available_memory/GiB_bytes:.2f} GiB)." + f"memory ({available_memory/GiB_bytes:.2f} GiB). " f"{estimated_msg} " - f" Try increasing `gpu_memory_utilization` or decreasing " + f"Try increasing `gpu_memory_utilization` or decreasing " f"`max_model_len` when initializing the engine.") diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py index c17f80b6ae78a..055ce446051ef 100644 --- a/vllm/v1/core/sched/interface.py +++ b/vllm/v1/core/sched/interface.py @@ -45,7 +45,7 @@ class SchedulerInterface(ABC): self, scheduler_output: "SchedulerOutput", model_runner_output: "ModelRunnerOutput", - ) -> "EngineCoreOutputs": + ) -> dict[int, "EngineCoreOutputs"]: """Update the scheduler state based on the model runner output. This method is called after the model runner has processed the scheduled @@ -55,7 +55,8 @@ class SchedulerInterface(ABC): for each request. Returns: - A EngineCoreOutputs object containing the outputs for each request. + A dict of client index to EngineCoreOutputs object containing the + outputs for each request originating from that client. """ raise NotImplementedError @@ -126,6 +127,11 @@ class SchedulerInterface(ABC): """ raise NotImplementedError + @abstractmethod + def get_request_counts(self) -> tuple[int, int]: + """Returns (num_running_reqs, num_waiting_reqs).""" + raise NotImplementedError + @abstractmethod def make_stats(self) -> Optional["SchedulerStats"]: """Make a SchedulerStats object for logging. 
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index d8fd67e232cb5..ce16a1ed5a096 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -58,7 +58,8 @@ class Scheduler(SchedulerInterface): # request ids should be included in the EngineCoreOutputs returned # by update_from_outputs(). This is currently used in the multi-engine # case to track request lifetimes efficiently. - self.include_finished_set = include_finished_set + self.finished_req_ids_dict: Optional[dict[int, set[str]]] = ( + defaultdict(set) if include_finished_set else None) # Scheduling constraints. self.max_num_running_reqs = self.scheduler_config.max_num_seqs @@ -227,10 +228,15 @@ class Scheduler(SchedulerInterface): req_index += 1 continue + num_draft_tokens = max( + num_new_tokens + request.num_computed_tokens - + request.num_tokens, 0) + while True: new_blocks = self.kv_cache_manager.allocate_slots( request, num_new_tokens, + num_draft_tokens=num_draft_tokens, num_lookahead_tokens=self.num_lookahead_tokens) if new_blocks is None: # The request cannot be scheduled. @@ -310,15 +316,16 @@ class Scheduler(SchedulerInterface): break request = self.waiting[0] - num_prealloc_computed_tokens = 0 - # P/D: skip request if still waiting for remote kvs. + + # KVTransfer: skip request if still waiting for remote kvs. if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS: is_ready = self._update_waiting_for_remote_kv(request) if is_ready: request.status = RequestStatus.WAITING - num_prealloc_computed_tokens = ( - request.num_computed_tokens) else: + logger.debug( + "%s is still in WAITING_FOR_REMOTE_KVS state.", + request.request_id) self.waiting.popleft() skipped_waiting_requests.appendleft(request) continue @@ -349,8 +356,9 @@ class Scheduler(SchedulerInterface): load_kv_async = False # Get already-cached tokens. 
- if num_prealloc_computed_tokens == 0: - new_computed_blocks, num_native_computed_tokens = \ + if request.num_computed_tokens == 0: + # Get locally-cached tokens. + new_computed_blocks, num_new_local_computed_tokens = \ self.kv_cache_manager.get_computed_blocks( request) @@ -358,23 +366,22 @@ class Scheduler(SchedulerInterface): if self.connector is not None: num_external_computed_tokens, load_kv_async = ( self.connector.get_num_new_matched_tokens( - request, num_native_computed_tokens)) + request, num_new_local_computed_tokens)) # Total computed tokens (local + external). - num_computed_tokens = (num_native_computed_tokens + + num_computed_tokens = (num_new_local_computed_tokens + num_external_computed_tokens) + # KVTransfer: WAITING reqs have num_computed_tokens > 0 + # after async KV recvs are completed. else: - # P/D: skip checking prefix cache if loaded from remote kvs. new_computed_blocks = KVCacheBlocks.create_empty() - num_native_computed_tokens = 0 - - # Total computed tokens (allocated in prior step). - num_computed_tokens = num_prealloc_computed_tokens + num_new_local_computed_tokens = 0 + num_computed_tokens = request.num_computed_tokens encoder_inputs_to_schedule = None new_encoder_budget = encoder_budget - # P/D: loading remote KV, do not allocate for new work. + # KVTransfer: loading remote KV, do not allocate for new work. if load_kv_async: assert num_external_computed_tokens > 0 num_new_tokens = 0 @@ -405,7 +412,7 @@ class Scheduler(SchedulerInterface): new_blocks = self.kv_cache_manager.allocate_slots( request, num_new_tokens + num_external_computed_tokens, - num_native_computed_tokens, + num_new_local_computed_tokens, new_computed_blocks, num_lookahead_tokens=self.num_lookahead_tokens, delay_cache_blocks=load_kv_async, @@ -457,7 +464,9 @@ class Scheduler(SchedulerInterface): token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens - + # Count the number of prefix cached tokens. 
+ if request.num_cached_tokens < 0: + request.num_cached_tokens = num_computed_tokens # Encoder-related. if encoder_inputs_to_schedule: scheduled_encoder_inputs[request.request_id] = ( @@ -685,7 +694,7 @@ class Scheduler(SchedulerInterface): self, scheduler_output: SchedulerOutput, model_runner_output: ModelRunnerOutput, - ) -> EngineCoreOutputs: + ) -> dict[int, EngineCoreOutputs]: sampled_token_ids = model_runner_output.sampled_token_ids spec_token_ids = model_runner_output.spec_token_ids logprobs = model_runner_output.logprobs @@ -693,7 +702,7 @@ class Scheduler(SchedulerInterface): num_scheduled_tokens = scheduler_output.num_scheduled_tokens new_running: list[Request] = [] - outputs: list[EngineCoreOutput] = [] + outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list) spec_decoding_stats: Optional[SpecDecodingStats] = None # NOTE(woosuk): As len(self.running) can be up to 1K or more, the below @@ -789,7 +798,7 @@ class Scheduler(SchedulerInterface): if new_token_ids or kv_transfer_params: # Add EngineCoreOutput for this Request. - outputs.append( + outputs[request.client_index].append( EngineCoreOutput( request_id=req_id, new_token_ids=new_token_ids, @@ -799,6 +808,7 @@ class Scheduler(SchedulerInterface): stop_reason=request.stop_reason, events=request.take_events(), kv_transfer_params=kv_transfer_params, + num_cached_tokens=request.num_cached_tokens, )) else: @@ -819,17 +829,38 @@ class Scheduler(SchedulerInterface): self._cached_reqs_data[req_data.req_id].append(req_data) self.running = new_running - engine_core_outputs = EngineCoreOutputs( - outputs=outputs, - scheduler_stats=self.make_stats(spec_decoding_stats), - ) - if self.include_finished_set: - #TODO currently sending duplicates here, improve this - engine_core_outputs.finished_requests = ( - scheduler_output.finished_req_ids | self.finished_req_ids) + + # Create EngineCoreOutputs for all clients that have requests with + # outputs in this step. 
+ engine_core_outputs = { + client_index: EngineCoreOutputs(outputs=outs) + for client_index, outs in outputs.items() + } + + finished_req_ids = self.finished_req_ids_dict + if finished_req_ids is not None: + # Include ids of requests that finished since last outputs + # were sent. + for client_index, finished_set in finished_req_ids.items(): + # Set finished request set in EngineCoreOutputs for this client. + if (eco := engine_core_outputs.get(client_index)) is not None: + eco.finished_requests = finished_set + else: + engine_core_outputs[client_index] = EngineCoreOutputs( + finished_requests=finished_set) + finished_req_ids.clear() + + if engine_core_outputs: + # Return stats to only one of the front-ends. + next(iter(engine_core_outputs.values())).scheduler_stats = ( + self.make_stats(spec_decoding_stats)) return engine_core_outputs + def get_request_counts(self) -> tuple[int, int]: + """Returns (num_running_reqs, num_waiting_reqs).""" + return len(self.running), len(self.waiting) + def add_request(self, request: Request) -> None: self.waiting.append(request) self.requests[request.request_id] = request @@ -871,8 +902,11 @@ class Scheduler(SchedulerInterface): delay_free_blocks, kv_xfer_params = self._connector_finished(request) self.encoder_cache_manager.free(request) - self._cached_reqs_data.pop(request.request_id, None) - self.finished_req_ids.add(request.request_id) + request_id = request.request_id + self._cached_reqs_data.pop(request_id, None) + self.finished_req_ids.add(request_id) + if self.finished_req_ids_dict is not None: + self.finished_req_ids_dict[request.client_index].add(request_id) if not delay_free_blocks: self._free_blocks(request) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 122a5a72cc36a..0c9f61a764279 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -44,10 +44,6 @@ class EngineCoreRequest( omit_defaults=True, # type: ignore[call-arg] gc=False): # type: ignore[call-arg] - # NOTE: prompt 
and prompt_token_ids should be DecoderOnlyInput, - # but this object is currently not playing well with msgspec - # due to circular imports and typing we have in data.py - request_id: str prompt_token_ids: list[int] mm_inputs: Optional[Sequence[Optional[MultiModalKwargs]]] @@ -59,6 +55,10 @@ class EngineCoreRequest( lora_request: Optional[LoRARequest] cache_salt: Optional[str] + # Index of the client, used to ensure outputs are sent back to the same + # client for this request when scaling out the front-end. + client_index: int = 0 + # Used in DP case to indicate which wave of requests this is expected to # belong to, to cover a race condition where the request is sent before # a wave finished notification is received. @@ -107,6 +107,9 @@ class EngineCoreOutput( events: Optional[list[EngineCoreEvent]] = None kv_transfer_params: Optional[dict[str, Any]] = None + # The number of tokens with prefix cache hits. + num_cached_tokens: int = 0 + @property def finished(self) -> bool: return self.finish_reason is not None diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 0d646d8dd575f..86781e7528fa3 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -20,6 +20,8 @@ from vllm.outputs import RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams +from vllm.transformers_utils.config import ( + maybe_register_config_serialize_by_value) from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext @@ -34,6 +36,7 @@ from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor from vllm.v1.metrics.loggers import (StatLoggerBase, StatLoggerFactory, setup_default_loggers) +from vllm.v1.metrics.prometheus import shutdown_prometheus from vllm.v1.metrics.stats import 
IterationStats, SchedulerStats logger = init_logger(__name__) @@ -52,6 +55,8 @@ class AsyncLLM(EngineClient): log_requests: bool = True, start_engine_loop: bool = True, stat_loggers: Optional[list[StatLoggerFactory]] = None, + client_addresses: Optional[dict[str, str]] = None, + client_index: int = 0, ) -> None: """ Create an AsyncLLM. @@ -80,6 +85,9 @@ class AsyncLLM(EngineClient): "AsyncLLMEngine.from_vllm_config(...) or explicitly set " "VLLM_USE_V1=0 or 1 and report this issue on Github.") + # Ensure we can serialize custom transformer configs + maybe_register_config_serialize_by_value() + self.model_config = vllm_config.model_config self.vllm_config = vllm_config self.log_requests = log_requests @@ -119,6 +127,8 @@ class AsyncLLM(EngineClient): vllm_config=vllm_config, executor_class=executor_class, log_stats=self.log_stats, + client_addresses=client_addresses, + client_index=client_index, ) if self.stat_loggers: for stat_logger in self.stat_loggers[0]: @@ -140,6 +150,8 @@ class AsyncLLM(EngineClient): stat_loggers: Optional[list[StatLoggerFactory]] = None, disable_log_requests: bool = False, disable_log_stats: bool = False, + client_addresses: Optional[dict[str, str]] = None, + client_index: int = 0, ) -> "AsyncLLM": if not envs.VLLM_USE_V1: raise ValueError( @@ -157,6 +169,8 @@ class AsyncLLM(EngineClient): log_requests=not disable_log_requests, log_stats=not disable_log_stats, usage_context=usage_context, + client_addresses=client_addresses, + client_index=client_index, ) @classmethod @@ -190,6 +204,8 @@ class AsyncLLM(EngineClient): def shutdown(self): """Shutdown, cleaning up the background proc and IPC.""" + shutdown_prometheus() + if engine_core := getattr(self, "engine_core", None): engine_core.shutdown() @@ -393,7 +409,6 @@ class AsyncLLM(EngineClient): # TODO(rob): make into a coroutine and launch it in # background thread once Prometheus overhead is non-trivial. 
if stat_loggers: - assert outputs.scheduler_stats is not None AsyncLLM._record_stats( stat_loggers[outputs.engine_index], scheduler_stats=outputs.scheduler_stats, @@ -417,7 +432,7 @@ class AsyncLLM(EngineClient): @staticmethod def _record_stats( stat_loggers: list[StatLoggerBase], - scheduler_stats: SchedulerStats, + scheduler_stats: Optional[SchedulerStats], iteration_stats: Optional[IterationStats], ): """static so that it can be used from the output_handler task diff --git a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py new file mode 100644 index 0000000000000..b84d4b144b5f2 --- /dev/null +++ b/vllm/v1/engine/coordinator.py @@ -0,0 +1,252 @@ +# SPDX-License-Identifier: Apache-2.0 +import multiprocessing +import time +import weakref +from typing import Optional + +import msgspec.msgpack +import zmq + +from vllm.config import ParallelConfig +from vllm.logger import init_logger +from vllm.utils import get_mp_context, get_open_zmq_ipc_path, make_zmq_socket +from vllm.v1.engine import EngineCoreOutputs, EngineCoreRequestType +from vllm.v1.serial_utils import MsgpackDecoder +from vllm.v1.utils import get_engine_client_zmq_addr, shutdown + +logger = init_logger(__name__) + + +class DPCoordinator: + """Coordinator process used for data-parallel deployments (DP>1). + + Intermediates between multiple DP engine rank processes and one or more + front-end API server processes. + + * Collects stats from each DP engine (currently just waiting and running + queue lengths), and publishes these to all front-ends for use in + load-balancing decisions. + + * Keeps track of the current DP "request wave" number and running state + of the engines. This is received from the DP rank 0 engine and published + to the front-end processes along with the current load stats. + + The engines alternate between a global running/paused state. 
The global + "request wave" number is a count of the number of times that the workers + collectively move from a running state to a paused state. This transition + is synchronized via the all-reduce operation performed in the + DPEngineCoreProc._has_global_unfinished_reqs method. + + * Broadcasts the START_DP_WAVE message to engines to move them from paused + to running state when one engine receives a new request. This can happen + in two cases: + 1) A front-end sending a new request while the engines are paused will + concurrently notify the coordinator. + 2) An engine receiving a request for a stale request wave while in paused + state will notify the coordinator. + + Engines will move into running state when receiving a new request or + START_DP_WAVE message. + """ + + def __init__(self, parallel_config: ParallelConfig): + + # Assume coordinator is colocated with front-end procs. + front_publish_address = get_open_zmq_ipc_path() + + dp_size = parallel_config.data_parallel_size + assert dp_size > 1, "Coordinator only used for data parallel" + + local_only = dp_size == parallel_config.data_parallel_size_local + host = parallel_config.data_parallel_master_ip + back_publish_address = get_engine_client_zmq_addr(local_only, host) + back_output_address = get_engine_client_zmq_addr(local_only, host) + + context = get_mp_context() + self.proc: multiprocessing.Process = context.Process( + target=CoordinatorProc.run_coordinator, + name="VLLM_DP_Coordinator", + kwargs={ + "engine_count": parallel_config.data_parallel_size, + "front_publish_address": front_publish_address, + "back_output_address": back_output_address, + "back_publish_address": back_publish_address, + }, + daemon=True) + self.proc.start() + + self.stats_publish_address = front_publish_address + self.coord_in_address = back_publish_address + self.coord_out_address = back_output_address + self._finalizer = weakref.finalize(self, shutdown, [self.proc]) + + def get_stats_publish_address(self) -> str: + return 
self.stats_publish_address + + def get_engine_socket_addresses(self) -> tuple[str, str]: + """Returns tuple of ZMQ input address, output address.""" + return self.coord_in_address, self.coord_out_address + + def close(self): + self._finalizer() + + +class EngineState: + + def __init__(self): + self.request_counts = [0, 0] # [waiting, running] + + +class CoordinatorProc: + + def __init__(self, engine_count: int): + + self.ctx = zmq.Context() + + self.engines = [EngineState() for _ in range(engine_count)] + + self.current_wave = 0 + self.engines_running = False + self.stats_changed = False + + @staticmethod + def run_coordinator( + engine_count: int, + front_publish_address: str, + back_output_address: str, + back_publish_address: str, + ): + coordinator = CoordinatorProc(engine_count=engine_count) + try: + coordinator.process_input_socket( + front_publish_address, + back_output_address, + back_publish_address, + ) + except KeyboardInterrupt: + logger.info("DP Coordinator process exiting") + + def process_input_socket(self, front_publish_address: str, + back_output_address: str, + back_publish_address: str): + + decoder = MsgpackDecoder(EngineCoreOutputs) + + with make_zmq_socket( + path=front_publish_address, # IPC + ctx=self.ctx, + socket_type=zmq.XPUB, + bind=True, + ) as publish_front, make_zmq_socket( + path=back_output_address, # IPC or TCP + ctx=self.ctx, + socket_type=zmq.PULL, + bind=True, + ) as output_back, make_zmq_socket( + path=back_publish_address, # IPC or TCP + ctx=self.ctx, + socket_type=zmq.XPUB, + bind=True, + ) as publish_back: + + poller = zmq.Poller() + poller.register(publish_front, zmq.POLLIN) + poller.register(output_back, zmq.POLLIN) + last_publish_time = 0 + while True: + elapsed = int(time.time() * 1000) - last_publish_time + # Send at 100 ms interval if the stats have changed, + # or otherwise every 3 seconds. 
+ wait_for = 100 if self.stats_changed else 3000 + events = poller.poll(timeout=max(0, wait_for - elapsed)) + if not events: + # Poller timeout - publish current stats to front-ends. + engine_req_counts_list = self._get_engine_counts() + to_publish = (engine_req_counts_list, self.current_wave, + self.engines_running) + publish_front.send(msgspec.msgpack.encode(to_publish)) + last_publish_time = int(time.time() * 1000) + self.stats_changed = False + continue + + events = dict(events) + + if publish_front in events: + buffer = publish_front.recv() + if buffer == b'\x01': + # Ignore subscription messages. + continue + + # We received a message on the front-end XPUB socket, + # from an API server sending a new request while the + # engines are paused, so that we can wake the other + # engines. + engine_to_exclude, wave = msgspec.msgpack.decode(buffer) + if wave < self.current_wave: + # If the wave number is stale, ensure the message is + # handled by all the engines. + engine_to_exclude = None + if not self.engines_running: + self.engines_running = True + self.stats_changed = True + self._send_start_wave(publish_back, self.current_wave, + engine_to_exclude) + + if output_back in events: + # We received a message from one of the engines. + + buffer = output_back.recv() + outputs: EngineCoreOutputs = decoder.decode(buffer) + + assert not outputs.outputs + assert outputs.utility_output is None + + eng_index = outputs.engine_index + if outputs.scheduler_stats: + # 1. Updated request load stats - update our local + # state with these. + stats = self.engines[eng_index].request_counts + stats[0] = outputs.scheduler_stats.num_waiting_reqs + stats[1] = outputs.scheduler_stats.num_running_reqs + self.stats_changed = True + + if (wave := outputs.wave_complete) is not None: + # 2. 
Notification from rank 0 engine that we've + # moved into the global paused state + # (engines_running==False) + if self.current_wave <= wave: + logger.debug("Moving DP wave from %d to %d.", + self.current_wave, wave) + self.current_wave = wave + 1 + self.engines_running = False + self.stats_changed = True + elif (wave := outputs.start_wave) is not None and ( + wave > self.current_wave or + (wave == self.current_wave + and not self.engines_running)): + # 3. The engine received request for a non-current wave + # so we must ensure that other engines progress to the + # next wave (race condition handling). + logger.debug( + "Starting wave %d after notification of " + "stale wave request from engine.", wave) + self.current_wave = wave + self.engines_running = True + self.stats_changed = True + self._send_start_wave(publish_back, wave, eng_index) + + @staticmethod + def _send_start_wave(socket: zmq.Socket, wave: int, + exclude_engine_index: Optional[int]): + """Broadcast the START_DP_WAVE message to all the engines. + It includes the current wave number and index of engine which + has already received a request with this wave number and so doesn't + require additional notification. 
+ """ + wave_encoded = msgspec.msgpack.encode((wave, exclude_engine_index)) + socket.send_multipart( + (EngineCoreRequestType.START_DP_WAVE.value, wave_encoded)) + + def _get_engine_counts(self) -> list[list[int]]: + """Return list of [waiting, running] count lists for each engine.""" + return [e.request_counts for e in self.engines] diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 0cf2383af1c9b..a02abb62b1f36 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -7,6 +7,7 @@ import threading import time from collections import deque from concurrent.futures import Future +from contextlib import ExitStack from inspect import isclass, signature from logging import DEBUG from typing import Any, Callable, Optional, TypeVar, Union @@ -22,7 +23,7 @@ from vllm.logging_utils.dump_input import dump_engine_exception from vllm.lora.request import LoRARequest from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) -from vllm.utils import make_zmq_socket, resolve_obj_by_qualname, zmq_socket_ctx +from vllm.utils import make_zmq_socket, resolve_obj_by_qualname from vllm.v1.core.kv_cache_utils import (get_kv_cache_config, unify_kv_cache_configs) from vllm.v1.core.sched.interface import SchedulerInterface @@ -33,10 +34,12 @@ from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, from vllm.v1.engine.mm_input_cache import MirroredProcessingCache from vllm.v1.executor.abstract import Executor from vllm.v1.kv_cache_interface import KVCacheConfig +from vllm.v1.metrics.stats import SchedulerStats from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder from vllm.v1.structured_output import StructuredOutputManager +from vllm.v1.utils import EngineHandshakeMetadata, EngineZmqAddresses from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -57,6 +60,10 @@ class EngineCore: 
executor_fail_callback: Optional[Callable] = None): assert vllm_config.model_config.runner_type != "pooling" + # plugins need to be loaded at the engine/scheduler level too + from vllm.plugins import load_general_plugins + load_general_plugins() + self.vllm_config = vllm_config logger.info("Initializing a V1 LLM engine (v%s) with config: %s", VLLM_VERSION, vllm_config) @@ -119,7 +126,6 @@ class EngineCore: logger.info("Batch queue is enabled with size %d", self.batch_queue_size) self.batch_queue = queue.Queue(self.batch_queue_size) - self.vllm_config = vllm_config def _initialize_kv_caches( self, vllm_config: VllmConfig) -> tuple[int, int, KVCacheConfig]: @@ -208,24 +214,27 @@ class EngineCore: # Re-raise exception raise err - def step(self) -> EngineCoreOutputs: - """Schedule, execute, and make output.""" + def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]: + """Schedule, execute, and make output. + + Returns tuple of outputs and a flag indicating whether the model + was executed. + """ # Check for any requests remaining in the scheduler - unfinished, # or finished and not yet removed from the batch. if not self.scheduler.has_requests(): - return EngineCoreOutputs( - outputs=[], - scheduler_stats=self.scheduler.make_stats(), - ) + return {}, False scheduler_output = self.scheduler.schedule() model_output = self.execute_model(scheduler_output) engine_core_outputs = self.scheduler.update_from_output( scheduler_output, model_output) # type: ignore - return engine_core_outputs + return (engine_core_outputs, + scheduler_output.total_num_scheduled_tokens > 0) - def step_with_batch_queue(self) -> Optional[EngineCoreOutputs]: + def step_with_batch_queue( + self) -> tuple[Optional[dict[int, EngineCoreOutputs]], bool]: """Schedule and execute batches with the batch queue. Note that if nothing to output in this step, None is returned. @@ -267,10 +276,10 @@ class EngineCore: # Blocking until the first result is available. 
model_output = future.result() self.batch_queue.task_done() - engine_core_outputs = self.scheduler.update_from_output( - scheduler_output, model_output) + engine_core_outputs = (self.scheduler.update_from_output( + scheduler_output, model_output)) - return engine_core_outputs + return engine_core_outputs, scheduled_batch def shutdown(self): self.structured_output_manager.clear_backend() @@ -336,6 +345,13 @@ class EngineCore: return self.model_executor.collective_rpc(method, timeout, args, kwargs) + def save_tensorized_model( + self, + tensorizer_config, + ) -> None: + self.model_executor.save_tensorized_model( + tensorizer_config=tensorizer_config, ) + class EngineCoreProc(EngineCore): """ZMQ-wrapper for running EngineCore in background process.""" @@ -346,7 +362,7 @@ class EngineCoreProc(EngineCore): self, vllm_config: VllmConfig, on_head_node: bool, - input_address: str, + handshake_address: str, executor_class: type[Executor], log_stats: bool, engine_index: int = 0, @@ -359,65 +375,70 @@ class EngineCoreProc(EngineCore): # Create input socket. input_ctx = zmq.Context() identity = engine_index.to_bytes(length=2, byteorder="little") - input_socket = make_zmq_socket(input_ctx, - input_address, - zmq.DEALER, - identity=identity, - bind=False) - try: + with make_zmq_socket(input_ctx, + handshake_address, + zmq.DEALER, + identity=identity, + linger=5000, + bind=False) as handshake_socket: + # Register engine with front-end. - output_address = self.startup_handshake( - input_socket, on_head_node, vllm_config.parallel_config) + addresses = self.startup_handshake(handshake_socket, on_head_node, + vllm_config.parallel_config) + self.client_count = len(addresses.outputs) # Update config which may have changed from the handshake. vllm_config.__post_init__() # Set up data parallel environment. + self.has_coordinator = addresses.coordinator_output is not None self._init_data_parallel(vllm_config) # Initialize engine core and model. 
super().__init__(vllm_config, executor_class, log_stats, executor_fail_callback) + self.engine_index = engine_index self.step_fn = (self.step if self.batch_queue is None else self.step_with_batch_queue) self.engines_running = False + self.last_counts = (0, 0) # Send ready message. num_gpu_blocks = vllm_config.cache_config.num_gpu_blocks - input_socket.send( + handshake_socket.send( msgspec.msgpack.encode({ "status": "READY", "local": on_head_node, "num_gpu_blocks": num_gpu_blocks, })) - # Background Threads and Queues for IO. These enable us to - # overlap ZMQ socket IO with GPU since they release the GIL, - # and to overlap some serialization/deserialization with the - # model forward pass. - # Threads handle Socket <-> Queues and core_busy_loop uses Queue. - self.input_queue = input_queue - self.output_queue = queue.Queue[Union[EngineCoreOutputs, bytes]]() - threading.Thread(target=self.process_input_socket, - args=(input_socket, ), - daemon=True).start() - input_socket = None - self.output_thread = threading.Thread( - target=self.process_output_socket, - args=(output_address, engine_index), - daemon=True) - self.output_thread.start() - finally: - if input_socket is not None: - input_socket.close(linger=0) + # Background Threads and Queues for IO. These enable us to + # overlap ZMQ socket IO with GPU since they release the GIL, + # and to overlap some serialization/deserialization with the + # model forward pass. + # Threads handle Socket <-> Queues and core_busy_loop uses Queue. 
+ self.input_queue = input_queue + self.output_queue = queue.Queue[Union[tuple[int, EngineCoreOutputs], + bytes]]() + threading.Thread(target=self.process_input_sockets, + args=(addresses.inputs, addresses.coordinator_input, + identity), + daemon=True).start() + self.output_thread = threading.Thread( + target=self.process_output_sockets, + args=(addresses.outputs, addresses.coordinator_output, + engine_index), + daemon=True) + self.output_thread.start() @staticmethod - def startup_handshake(input_socket: zmq.Socket, on_head_node: bool, - parallel_config: ParallelConfig) -> str: + def startup_handshake( + handshake_socket: zmq.Socket, on_head_node: bool, + parallel_config: ParallelConfig) -> EngineZmqAddresses: # Send registration message. - input_socket.send( + handshake_socket.send( msgspec.msgpack.encode({ "status": "HELLO", "local": on_head_node, @@ -425,22 +446,20 @@ class EngineCoreProc(EngineCore): # Receive initialization message. logger.info("Waiting for init message from front-end.") - if not input_socket.poll(timeout=HANDSHAKE_TIMEOUT_MINS * 60 * 1000): + if not handshake_socket.poll(timeout=HANDSHAKE_TIMEOUT_MINS * 60_000): raise RuntimeError("Did not receive response from front-end " f"process within {HANDSHAKE_TIMEOUT_MINS} " f"minutes") - init_bytes = input_socket.recv() - init_message = msgspec.msgpack.decode(init_bytes) + init_bytes = handshake_socket.recv() + init_message: EngineHandshakeMetadata = msgspec.msgpack.decode( + init_bytes, type=EngineHandshakeMetadata) logger.debug("Received init message: %s", init_message) - output_socket_address = init_message["output_socket_address"] - #TBD(nick) maybe replace IP with configured head node address - - received_parallel_config = init_message["parallel_config"] + received_parallel_config = init_message.parallel_config for key, value in received_parallel_config.items(): setattr(parallel_config, key, value) - return output_socket_address + return init_message.addresses @staticmethod def 
run_engine_core(*args, @@ -512,7 +531,7 @@ class EngineCoreProc(EngineCore): """Exits when an engine step needs to be performed.""" waited = False - while not self.engines_running and not (self.scheduler.has_requests()): + while not self.engines_running and not self.scheduler.has_requests(): if logger.isEnabledFor(DEBUG) and self.input_queue.empty(): logger.debug("EngineCore waiting for work.") waited = True @@ -527,14 +546,16 @@ class EngineCoreProc(EngineCore): req = self.input_queue.get_nowait() self._handle_client_request(*req) - def _process_engine_step(self): + def _process_engine_step(self) -> bool: """Called only when there are unfinished local requests.""" # Step the engine core. - outputs = self.step_fn() + outputs, model_executed = self.step_fn() # Put EngineCoreOutputs into the output queue. - if outputs is not None: - self.output_queue.put_nowait(outputs) + for output in (outputs.items() if outputs else ()): + self.output_queue.put_nowait(output) + + return model_executed def _handle_client_request(self, request_type: EngineCoreRequestType, request: Any) -> None: @@ -545,7 +566,7 @@ class EngineCoreProc(EngineCore): elif request_type == EngineCoreRequestType.ABORT: self.abort_requests(request) elif request_type == EngineCoreRequestType.UTILITY: - call_id, method_name, args = request + client_idx, call_id, method_name, args = request output = UtilityOutput(call_id) try: method = getattr(self, method_name) @@ -556,7 +577,7 @@ class EngineCoreProc(EngineCore): output.failure_message = (f"Call to {method_name} method" f" failed: {str(e)}") self.output_queue.put_nowait( - EngineCoreOutputs(utility_output=output)) + (client_idx, EngineCoreOutputs(utility_output=output))) elif request_type == EngineCoreRequestType.EXECUTOR_FAILED: raise RuntimeError("Executor failed.") else: @@ -589,27 +610,68 @@ class EngineCoreProc(EngineCore): logger.fatal("vLLM shutdown signal from EngineCore failed " "to send. 
Please report this issue.") - def process_input_socket(self, input_socket: zmq.Socket): + def process_input_sockets(self, input_addresses: list[str], + coord_input_address: Optional[str], + identity: bytes): """Input socket IO thread.""" # Msgpack serialization decoding. add_request_decoder = MsgpackDecoder(EngineCoreRequest) generic_decoder = MsgpackDecoder() - while True: - # (RequestType, RequestData) - type_frame, *data_frames = input_socket.recv_multipart(copy=False) - request_type = EngineCoreRequestType(bytes(type_frame.buffer)) + with ExitStack() as stack, zmq.Context() as ctx: + input_sockets = [ + stack.enter_context( + make_zmq_socket(ctx, + input_address, + zmq.DEALER, + identity=identity, + bind=False)) + for input_address in input_addresses + ] + if coord_input_address is None: + coord_socket = None + else: + coord_socket = stack.enter_context( + make_zmq_socket(ctx, + coord_input_address, + zmq.XSUB, + identity=identity, + bind=False)) + # Send subscription message to coordinator. + coord_socket.send(b'\x01') - # Deserialize the request data. - decoder = add_request_decoder if ( - request_type == EngineCoreRequestType.ADD) else generic_decoder - request = decoder.decode(data_frames) + # Register sockets with poller. + poller = zmq.Poller() + for input_socket in input_sockets: + # Send initial message to each input socket - this is required + # before the front-end ROUTER socket can send input messages + # back to us. + input_socket.send(b'') + poller.register(input_socket, zmq.POLLIN) + if coord_socket is not None: + poller.register(coord_socket, zmq.POLLIN) - # Push to input queue for core busy loop. 
- self.input_queue.put_nowait((request_type, request)) + while True: + for input_socket, _ in poller.poll(): + # (RequestType, RequestData) + type_frame, *data_frames = input_socket.recv_multipart( + copy=False) + request_type = EngineCoreRequestType( + bytes(type_frame.buffer)) - def process_output_socket(self, output_path: str, engine_index: int): + # Deserialize the request data. + decoder = add_request_decoder if ( + request_type + == EngineCoreRequestType.ADD) else generic_decoder + request = decoder.decode(data_frames) + + # Push to input queue for core busy loop. + self.input_queue.put_nowait((request_type, request)) + + def process_output_sockets(self, output_paths: list[str], + coord_output_path: Optional[str], + engine_index: int): """Output socket IO thread.""" # Msgpack serialization encoding. @@ -623,30 +685,49 @@ class EngineCoreProc(EngineCore): # We must set linger to ensure the ENGINE_CORE_DEAD # message is sent prior to closing the socket. - with zmq_socket_ctx(output_path, zmq.constants.PUSH, - linger=4000) as socket: + with ExitStack() as stack, zmq.Context() as ctx: + sockets = [ + stack.enter_context( + make_zmq_socket(ctx, output_path, zmq.PUSH, linger=4000)) + for output_path in output_paths + ] + coord_socket = stack.enter_context( + make_zmq_socket( + ctx, coord_output_path, zmq.PUSH, bind=False, + linger=4000)) if coord_output_path is not None else None + max_reuse_bufs = len(sockets) + 1 + while True: - outputs = self.output_queue.get() - if outputs == EngineCoreProc.ENGINE_CORE_DEAD: - socket.send(outputs, copy=False) + output = self.output_queue.get() + if output == EngineCoreProc.ENGINE_CORE_DEAD: + for socket in sockets: + socket.send(output) break - assert not isinstance(outputs, bytes) + assert not isinstance(output, bytes) + client_index, outputs = output outputs.engine_index = engine_index + if client_index == -1: + # Don't reuse buffer for coordinator message + # which will be very small. 
+ assert coord_socket is not None + coord_socket.send_multipart(encoder.encode(outputs)) + continue + # Reclaim buffers that zmq is finished with. while pending and pending[-1][0].done: reuse_buffers.append(pending.pop()[2]) buffer = reuse_buffers.pop() if reuse_buffers else bytearray() buffers = encoder.encode_into(outputs, buffer) - tracker = socket.send_multipart(buffers, - copy=False, - track=True) + tracker = sockets[client_index].send_multipart(buffers, + copy=False, + track=True) if not tracker.done: ref = outputs if len(buffers) > 1 else None pending.appendleft((tracker, ref, buffer)) - elif len(reuse_buffers) < 2: - # Keep at most 2 buffers to reuse. + elif len(reuse_buffers) < max_reuse_bufs: + # Limit the number of buffers to reuse. reuse_buffers.append(buffer) @@ -658,7 +739,7 @@ class DPEngineCoreProc(EngineCoreProc): self, vllm_config: VllmConfig, on_head_node: bool, - input_address: str, + handshake_address: str, executor_class: type[Executor], log_stats: bool, ): @@ -673,10 +754,11 @@ class DPEngineCoreProc(EngineCoreProc): # Counts forward-passes of the model so that we can synchronize # finished with DP peers every N steps. self.counter = 0 + self.current_wave = 0 # Initialize the engine. dp_rank = vllm_config.parallel_config.data_parallel_rank - super().__init__(vllm_config, on_head_node, input_address, + super().__init__(vllm_config, on_head_node, handshake_address, executor_class, log_stats, dp_rank) def _init_data_parallel(self, vllm_config: VllmConfig): @@ -689,6 +771,15 @@ class DPEngineCoreProc(EngineCoreProc): assert dp_size > 1 assert 0 <= local_dp_rank <= dp_rank < dp_size + if vllm_config.kv_transfer_config is not None: + # modify the engine_id and append the local_dp_rank to it to ensure + # that the kv_transfer_config is unique for each DP rank. 
+ vllm_config.kv_transfer_config.engine_id = ( + f"{vllm_config.kv_transfer_config.engine_id}_dp{local_dp_rank}" + ) + logger.debug("Setting kv_transfer_config.engine_id to %s", + vllm_config.kv_transfer_config.engine_id) + from vllm.platforms import current_platform device_control_env_var = current_platform.device_control_env_var world_size = vllm_config.parallel_config.world_size @@ -697,9 +788,8 @@ class DPEngineCoreProc(EngineCoreProc): for i in range(local_dp_rank * world_size, (local_dp_rank + 1) * world_size)) - self.local_dp_rank = local_dp_rank + self.dp_rank = dp_rank self.dp_group = vllm_config.parallel_config.stateless_init_dp_group() - self.current_wave = 0 def shutdown(self): super().shutdown() @@ -707,22 +797,23 @@ class DPEngineCoreProc(EngineCoreProc): stateless_destroy_torch_distributed_process_group(dp_group) def add_request(self, request: EngineCoreRequest): - if request.current_wave != self.current_wave: + if self.has_coordinator and request.current_wave != self.current_wave: if request.current_wave > self.current_wave: self.current_wave = request.current_wave elif not self.engines_running: # Request received for an already-completed wave, notify # front-end that we need to start the next one. 
self.output_queue.put_nowait( - EngineCoreOutputs(start_wave=self.current_wave)) + (-1, EngineCoreOutputs(start_wave=self.current_wave))) super().add_request(request) def _handle_client_request(self, request_type: EngineCoreRequestType, request: Any) -> None: if request_type == EngineCoreRequestType.START_DP_WAVE: - new_wave: int = request - if new_wave >= self.current_wave: + new_wave, exclude_eng_index = request + if exclude_eng_index != self.engine_index and ( + new_wave >= self.current_wave): self.current_wave = new_wave if not self.engines_running: logger.debug("EngineCore starting idle loop for wave %d.", @@ -731,6 +822,18 @@ class DPEngineCoreProc(EngineCoreProc): else: super()._handle_client_request(request_type, request) + def _maybe_publish_request_counts(self): + if not self.has_coordinator: + return + + # Publish our request counts (if they've changed). + counts = self.scheduler.get_request_counts() + if counts != self.last_counts: + self.last_counts = counts + stats = SchedulerStats(*counts) + self.output_queue.put_nowait( + (-1, EngineCoreOutputs(scheduler_stats=stats))) + def run_busy_loop(self): """Core busy loop of the EngineCore for data parallel case.""" @@ -739,30 +842,18 @@ class DPEngineCoreProc(EngineCoreProc): # 1) Poll the input queue until there is work to do. self._process_input_queue() + # 2) Step the engine core. + executed = self._process_engine_step() + self._maybe_publish_request_counts() + local_unfinished_reqs = self.scheduler.has_unfinished_requests() - - if local_unfinished_reqs: - # 2) Step the engine core. - self._process_engine_step() - - # Check if we have now finished all requests. - local_unfinished_reqs = ( - self.scheduler.has_unfinished_requests()) - else: - if self.scheduler.has_finished_requests(): - # There are no unfinished requests, but there are some - # finished requests remaining to be removed from the - # batch state. 
This engine step won't perform a forward - # pass but will flush the finished requests to ensure - # up-to-date state is returned in the engine outputs. - self._process_engine_step() - - if not self.engines_running: + if not executed: + if not local_unfinished_reqs and not self.engines_running: # All engines are idle. continue - # There must be unfinished requests in DP peers, run a - # dummy forward pass. + # We are in a running state and so must execute a dummy pass + # if the model didn't execute any ready requests. self.execute_dummy_batch() # 3) All-reduce operation to determine global unfinished reqs. @@ -770,12 +861,13 @@ class DPEngineCoreProc(EngineCoreProc): local_unfinished_reqs) if not self.engines_running: - if self.local_dp_rank == 0: + if self.dp_rank == 0: # Notify client that we are pausing the loop. logger.debug("Wave %d finished, pausing engine loop.", self.current_wave) self.output_queue.put_nowait( - EngineCoreOutputs(wave_complete=self.current_wave)) + (-1, + EngineCoreOutputs(wave_complete=self.current_wave))) self.current_wave += 1 def _has_global_unfinished_reqs(self, local_unfinished: bool) -> bool: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 0d52bc9a68148..232d6742b7718 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -2,6 +2,7 @@ import asyncio import contextlib import queue +import sys import uuid import weakref from abc import ABC, abstractmethod @@ -9,26 +10,28 @@ from collections import deque from collections.abc import Awaitable, Sequence from concurrent.futures import Future from dataclasses import dataclass -from enum import Enum, auto from threading import Thread from typing import Any, Callable, Optional, TypeVar, Union -import msgspec +import msgspec.msgpack import zmq import zmq.asyncio -from vllm.config import ParallelConfig, VllmConfig +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from 
vllm.utils import (get_open_port, get_open_zmq_inproc_path, - get_open_zmq_ipc_path, get_tcp_uri, make_zmq_socket) +from vllm.utils import (get_open_zmq_inproc_path, make_zmq_socket, + zmq_socket_ctx) from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, EngineCoreRequestType, UtilityOutput) +from vllm.v1.engine.coordinator import DPCoordinator from vllm.v1.engine.core import EngineCore, EngineCoreProc from vllm.v1.engine.exceptions import EngineDeadError from vllm.v1.executor.abstract import Executor from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder, bytestr -from vllm.v1.utils import CoreEngineProcManager +from vllm.v1.utils import (CoreEngine, CoreEngineProcManager, + EngineZmqAddresses, get_engine_client_zmq_addr, + wait_for_engine_startup) logger = init_logger(__name__) @@ -36,8 +39,6 @@ AnyFuture = Union[asyncio.Future[Any], Future[Any]] _R = TypeVar('_R') # Return type for collective_rpc -STARTUP_POLL_PERIOD_MS = 10000 - class EngineCoreClient(ABC): """ @@ -206,7 +207,8 @@ class InprocClient(EngineCoreClient): self.engine_core = EngineCore(*args, **kwargs) def get_output(self) -> EngineCoreOutputs: - return self.engine_core.step() + outputs, _ = self.engine_core.step() + return outputs.get(0) or EngineCoreOutputs() def add_request(self, request: EngineCoreRequest) -> None: self.engine_core.add_request(request) @@ -265,24 +267,6 @@ class InprocClient(EngineCoreClient): return self.engine_core.collective_rpc(method, timeout, args, kwargs) -class CoreEngineState(Enum): - NEW = auto() - CONNECTED = auto() - READY = auto() - - -class CoreEngine: - """One per data parallel rank.""" - - def __init__(self, index: int = 0, local: bool = True): - self.local = local - self.index = index - self.identity = index.to_bytes(length=2, byteorder="little") - - self.state = CoreEngineState.NEW - self.num_reqs_in_flight = 0 - - @dataclass class BackgroundResources: """Used as a finalizer for clean shutdown, avoiding @@ -290,9 +274,12 @@ class 
BackgroundResources: ctx: Union[zmq.Context] local_engine_manager: Optional[CoreEngineProcManager] = None + coordinator: Optional[DPCoordinator] = None output_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None input_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None + first_req_send_socket: Optional[zmq.asyncio.Socket] = None output_queue_task: Optional[asyncio.Task] = None + stats_update_task: Optional[asyncio.Task] = None shutdown_path: Optional[str] = None # Set if any of the engines are dead. Here so that the output @@ -305,16 +292,21 @@ class BackgroundResources: self.engine_dead = True if self.local_engine_manager is not None: self.local_engine_manager.close() + if self.coordinator is not None: + self.coordinator.close() if self.output_queue_task is not None: self.output_queue_task.cancel() + if self.stats_update_task is not None: + self.stats_update_task.cancel() # ZMQ context termination can hang if the sockets # aren't explicitly closed first. - if self.output_socket is not None: - self.output_socket.close(linger=0) - if self.input_socket is not None: - self.input_socket.close(linger=0) + for socket in (self.output_socket, self.input_socket, + self.first_req_send_socket): + if socket is not None: + socket.close(linger=0) + if self.shutdown_path is not None: # We must ensure that the sync output socket is # closed cleanly in its own thread. @@ -349,6 +341,7 @@ class MPClient(EngineCoreClient): vllm_config: VllmConfig, executor_class: type[Executor], log_stats: bool, + client_addresses: Optional[dict[str, str]] = None, ): self.vllm_config = vllm_config # Serialization setup. 
@@ -368,8 +361,9 @@ class MPClient(EngineCoreClient): try: parallel_config = vllm_config.parallel_config local_engine_count = parallel_config.data_parallel_size_local - start_index = parallel_config.data_parallel_rank local_start_index = parallel_config.data_parallel_rank_local + dp_size = parallel_config.data_parallel_size + dp_rank = parallel_config.data_parallel_rank # SPMD mode is where there is an LLM instance per DP rank and # one core engine per LLM, see @@ -377,46 +371,55 @@ class MPClient(EngineCoreClient): spmd_mode = local_start_index is not None if spmd_mode: assert local_engine_count == 1 - self.core_engines = [ - CoreEngine(index=local_start_index, local=True) - ] + self.core_engines = [CoreEngine(index=dp_rank, local=True)] else: - assert start_index == 0 + assert dp_rank == 0 local_start_index = 0 self.core_engines = [ CoreEngine(index=i, local=(i < local_engine_count)) - for i in range(parallel_config.data_parallel_size) + for i in range(dp_size) ] - input_address, output_address = self._get_zmq_addresses( - parallel_config, spmd_mode) + local_only = spmd_mode or local_engine_count == dp_size + + self.stats_update_address: Optional[str] = None + if client_addresses is not None: + input_address = client_addresses["input_address"] + output_address = client_addresses["output_address"] + self.stats_update_address = client_addresses.get( + "stats_update_address") + else: + host = parallel_config.data_parallel_master_ip + input_address = get_engine_client_zmq_addr(local_only, host) + output_address = get_engine_client_zmq_addr(local_only, host) # Create input and output sockets. self.input_socket = self.resources.input_socket = make_zmq_socket( self.ctx, input_address, zmq.ROUTER, bind=True) - self.resources.output_socket = make_zmq_socket( - self.ctx, output_address, zmq.constants.PULL) - # Start local engines. - if local_engine_count: - # In server mode, start_index and local_start_index will - # both be 0. 
- self.resources.local_engine_manager = CoreEngineProcManager( - EngineCoreProc.run_engine_core, - vllm_config=vllm_config, - executor_class=executor_class, - log_stats=log_stats, - input_address=input_address, - on_head_node=True, - local_engine_count=local_engine_count, - start_index=start_index, - local_start_index=local_start_index) + self.ctx, output_address, zmq.PULL) + + if client_addresses is None: + self._init_engines_direct(vllm_config, local_only, + local_start_index, input_address, + output_address, executor_class, + log_stats) + coordinator = self.resources.coordinator + if coordinator: + self.stats_update_address = ( + coordinator.get_stats_publish_address()) + + # Wait for ready messages from each engine on the input socket. + identities = set(e.identity for e in self.core_engines) + sync_input_socket = zmq.Socket.shadow(self.input_socket) + while identities: + if not sync_input_socket.poll(timeout=600_000): + raise TimeoutError("Timed out waiting for engines to send" + "initial message on input socket.") + identity, _ = sync_input_socket.recv_multipart() + identities.remove(identity) self.core_engine = self.core_engines[0] - - # Wait for engine core process(es) to start. 
- self._wait_for_engine_startup(output_address, parallel_config) - self.utility_results: dict[int, AnyFuture] = {} # Request objects which may contain pytorch-allocated tensors @@ -429,116 +432,67 @@ class MPClient(EngineCoreClient): if not success: self._finalizer() - @staticmethod - def _get_zmq_addresses(parallel_config: ParallelConfig, - spmd_mode: bool) -> tuple[str, str]: - """Returns (input_address, output_address).""" - dp_size = parallel_config.data_parallel_size + def _init_engines_direct(self, vllm_config: VllmConfig, local_only: bool, + local_start_index: int, input_address: str, + output_address: str, + executor_class: type[Executor], log_stats: bool): + """Self-contained client mode, launch engine and coordinator process + as needed.""" + + parallel_config = vllm_config.parallel_config local_engine_count = parallel_config.data_parallel_size_local + start_index = parallel_config.data_parallel_rank + host = parallel_config.data_parallel_master_ip - if local_engine_count == dp_size or spmd_mode: - input_address = get_open_zmq_ipc_path() - output_address = get_open_zmq_ipc_path() - else: - host = parallel_config.data_parallel_master_ip - input_port = parallel_config.data_parallel_rpc_port - output_port = get_open_port() - input_address = get_tcp_uri(host, input_port) - output_address = get_tcp_uri(host, output_port) + if len(self.core_engines) > 1: + self.resources.coordinator = DPCoordinator(parallel_config) - return input_address, output_address + handshake_address = get_engine_client_zmq_addr( + local_only, host, parallel_config.data_parallel_rpc_port) - def _wait_for_engine_startup(self, output_address: str, - parallel_config: ParallelConfig): - # Get a sync handle to the socket which can be sync or async. - sync_input_socket = zmq.Socket.shadow(self.input_socket) + with zmq_socket_ctx(handshake_address, zmq.ROUTER, + bind=True) as handshake_socket: - # Wait for engine core process(es) to send ready messages. 
- local_count = parallel_config.data_parallel_size_local - remote_count = len(self.core_engines) - local_count - # [local, remote] counts - conn_pending, start_pending = [local_count, remote_count], [0, 0] + # Start local engines. + if local_engine_count: + # In server mode, start_index and local_start_index will + # both be 0. + self.resources.local_engine_manager = CoreEngineProcManager( + EngineCoreProc.run_engine_core, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=log_stats, + handshake_address=handshake_address, + on_head_node=True, + local_engine_count=local_engine_count, + start_index=start_index, + local_start_index=local_start_index) - poller = zmq.Poller() - poller.register(sync_input_socket, zmq.POLLIN) - proc_manager = self.resources.local_engine_manager - if proc_manager is not None: - for sentinel in proc_manager.sentinels(): - poller.register(sentinel, zmq.POLLIN) - while any(conn_pending) or any(start_pending): - events = poller.poll(STARTUP_POLL_PERIOD_MS) - if not events: - if any(conn_pending): - logger.debug( - "Waiting for %d local, %d remote core engine proc(s) " - "to connect.", *conn_pending) - if any(start_pending): - logger.debug( - "Waiting for %d local, %d remote core engine proc(s) " - "to start.", *start_pending) - continue - if len(events) > 1 or events[0][0] != sync_input_socket: - # One of the local core processes exited. - finished = proc_manager.finished_procs( - ) if proc_manager else {} - raise RuntimeError("Engine core initialization failed. " - "See root cause above. " - f"Failed core proc(s): {finished}") + # Wait for engine core process(es) to start. + self._wait_for_engine_startup(handshake_socket, input_address, + output_address) - # Receive HELLO and READY messages from the input socket. 
- eng_identity, ready_msg_bytes = sync_input_socket.recv_multipart() - eng_index = int.from_bytes(eng_identity, byteorder="little") - engine = next( - (e for e in self.core_engines if e.identity == eng_identity), - None) - if engine is None: - raise RuntimeError(f"Message from engine with unexpected data " - f"parallel rank: {eng_index}") - msg = msgspec.msgpack.decode(ready_msg_bytes) - status, local = msg["status"], msg["local"] - if local != engine.local: - raise RuntimeError(f"{status} message from " - f"{'local' if local else 'remote'} " - f"engine {eng_index}, expected it to be " - f"{'local' if engine.local else 'remote'}") + def _wait_for_engine_startup(self, handshake_socket: zmq.Socket, + input_address: str, output_address: str): + addresses = EngineZmqAddresses( + inputs=[input_address], + outputs=[output_address], + ) - if status == "HELLO" and engine.state == CoreEngineState.NEW: + coordinator = self.resources.coordinator + if coordinator is not None: + addresses.coordinator_input, addresses.coordinator_output = ( + coordinator.get_engine_socket_addresses()) - # Send init message with DP config info. - init_message = self.encoder.encode({ - "output_socket_address": output_address, - "parallel_config": { - "data_parallel_master_ip": - parallel_config.data_parallel_master_ip, - "data_parallel_master_port": - parallel_config.data_parallel_master_port, - "data_parallel_size": - parallel_config.data_parallel_size, - }, - }) - sync_input_socket.send_multipart((eng_identity, *init_message), - copy=False) - conn_pending[0 if local else 1] -= 1 - start_pending[0 if local else 1] += 1 - engine.state = CoreEngineState.CONNECTED - elif status == "READY" and (engine.state - == CoreEngineState.CONNECTED): - # Setup KV cache config with initialization state from - # engine core process. Sum values from all engines in DP case. 
- cache_config = self.vllm_config.cache_config - num_gpu_blocks = cache_config.num_gpu_blocks or 0 - num_gpu_blocks += msg['num_gpu_blocks'] - cache_config.num_gpu_blocks = num_gpu_blocks - - start_pending[0 if local else 1] -= 1 - engine.state = CoreEngineState.READY - else: - raise RuntimeError(f"Unexpected {status} message for " - f"{'local' if local else 'remote'} engine " - f"{eng_index} in {engine.state} state.") - - logger.debug("%s from %s core engine process %s.", status, - "local" if local else "remote", eng_index) + wait_for_engine_startup( + handshake_socket, + addresses, + self.core_engines, + self.vllm_config.parallel_config, + self.vllm_config.cache_config, + self.resources.local_engine_manager, + coordinator.proc if coordinator else None, + ) def shutdown(self): # Terminate background resources. @@ -604,8 +558,8 @@ class SyncMPClient(MPClient): try: shutdown_socket.bind(shutdown_path) poller = zmq.Poller() - poller.register(shutdown_socket) - poller.register(out_socket) + poller.register(shutdown_socket, zmq.POLLIN) + poller.register(out_socket, zmq.POLLIN) while True: socks = poller.poll() if not socks: @@ -667,7 +621,7 @@ class SyncMPClient(MPClient): future: Future[Any] = Future() self.utility_results[call_id] = future self._send_input(EngineCoreRequestType.UTILITY, - (call_id, method, args)) + (0, call_id, method, args)) return future.result() @@ -729,15 +683,21 @@ class SyncMPClient(MPClient): class AsyncMPClient(MPClient): """Asyncio-compatible client for multi-proc EngineCore.""" - def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], - log_stats: bool): + def __init__(self, + vllm_config: VllmConfig, + executor_class: type[Executor], + log_stats: bool, + client_addresses: Optional[dict[str, str]] = None, + client_index: int = 0): super().__init__( asyncio_mode=True, vllm_config=vllm_config, executor_class=executor_class, log_stats=log_stats, + client_addresses=client_addresses, ) + self.client_index = client_index 
self.outputs_queue = asyncio.Queue[Union[EngineCoreOutputs, Exception]]() try: @@ -853,12 +813,13 @@ class AsyncMPClient(MPClient): future = asyncio.get_running_loop().create_future() self.utility_results[call_id] = future message = (EngineCoreRequestType.UTILITY.value, *self.encoder.encode( - (call_id, method, args))) + (self.client_index, call_id, method, args))) await self._send_input_message(message, engine, args) self._ensure_output_queue_task() return await future async def add_request_async(self, request: EngineCoreRequest) -> None: + request.client_index = self.client_index await self._send_input(EngineCoreRequestType.ADD, request) self._ensure_output_queue_task() @@ -920,17 +881,120 @@ class DPAsyncMPClient(AsyncMPClient): """Asyncio-compatible client for multi-proc, multi-engine (data parallel) EngineCore.""" - def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], - log_stats: bool): + def __init__(self, + vllm_config: VllmConfig, + executor_class: type[Executor], + log_stats: bool, + client_addresses: Optional[dict[str, str]] = None, + client_index: int = 0): self.current_wave = 0 self.engines_running = False + # To route aborts to the correct engine. self.reqs_in_flight: dict[str, CoreEngine] = {} - super().__init__(vllm_config, executor_class, log_stats) + super().__init__(vllm_config, executor_class, log_stats, + client_addresses, client_index) assert len(self.core_engines) > 1 + # List of [waiting, running] pair per engine. + self.lb_engines: list[list[int]] = [] + + self.first_req_sock_addr = get_open_zmq_inproc_path() + self.first_req_send_socket = self.resources.first_req_send_socket = ( + make_zmq_socket(self.ctx, + self.first_req_sock_addr, + zmq.PAIR, + bind=True)) + try: + # If we are running in an asyncio event loop, start the stats task. + # Otherwise, it will be started lazily. 
+ asyncio.get_running_loop() + self._ensure_stats_update_task() + except RuntimeError: + pass + + def _ensure_stats_update_task(self): + resources = self.resources + if resources.stats_update_task is not None: + return + + assert self.stats_update_address is not None + + async def run_engine_stats_update_task(): + with make_zmq_socket(self.ctx, self.stats_update_address, + zmq.XSUB) as socket, make_zmq_socket( + self.ctx, + self.first_req_sock_addr, + zmq.PAIR, + bind=False) as first_req_rcv_socket: + # Send subscription message. + await socket.send(b'\x01') + + poller = zmq.asyncio.Poller() + poller.register(socket, zmq.POLLIN) + poller.register(first_req_rcv_socket, zmq.POLLIN) + + while True: + events = await poller.poll() + if not self.engines_running and len(events) == 2 or ( + events[0][0] == first_req_rcv_socket): + # Send a message to notify the coordinator that + # we're sending a request while the engines are + # paused, so that it can wake the others up + # (to run dummy EP loop). + self.engines_running = True + buf = first_req_rcv_socket.recv( + flags=zmq.NOBLOCK).result() + target_eng_index = int.from_bytes(buf, "little") + msg = msgspec.msgpack.encode( + (target_eng_index, self.current_wave)) + await socket.send(msg) + + buf = None + while True: + # Drain all stats events (we only care about latest). + future: asyncio.Future[bytes] = socket.recv( + flags=zmq.NOBLOCK) + if isinstance(future.exception(), zmq.Again): + break + buf = future.result() + if buf is None: + continue + + # Update local load-balancing state. 
+ counts, wave, running = msgspec.msgpack.decode(buf) + self.current_wave = wave + self.engines_running = running + self.lb_engines = counts + + resources.stats_update_task = asyncio.create_task( + run_engine_stats_update_task()) + + def get_core_engine_for_request(self) -> CoreEngine: + if not self.lb_engines: + return self.core_engines[0] + # TODO use P2C alg for larger DP sizes + num_engines = len(self.lb_engines) + min_counts = [sys.maxsize, sys.maxsize] + eng_index = 0 + for i in range(num_engines): + # Start from client_index to help with balancing when engines + # are empty. + idx = (self.client_index + i) % num_engines + counts = self.lb_engines[idx] + if counts < min_counts: + min_counts = counts + eng_index = idx + # Adjust local counts for better balancing between stats updates + # from the coordinator (which happen every 100ms). + if min_counts[0]: + min_counts[0] += 1 + else: + min_counts[1] += 1 + return self.core_engines[eng_index] + async def call_utility_async(self, method: str, *args) -> Any: # Only the result from the first engine is returned. return (await asyncio.gather(*[ @@ -939,62 +1003,30 @@ class DPAsyncMPClient(AsyncMPClient): ]))[0] async def add_request_async(self, request: EngineCoreRequest) -> None: + self._ensure_stats_update_task() + request.current_wave = self.current_wave + request.client_index = self.client_index chosen_engine = self.get_core_engine_for_request() self.reqs_in_flight[request.request_id] = chosen_engine - chosen_engine.num_reqs_in_flight += 1 to_await = self._send_input(EngineCoreRequestType.ADD, request, chosen_engine) if not self.engines_running: - # Send request to chosen engine and dp start loop - # control message to all other engines. 
- self.engines_running = True - to_await = asyncio.gather( - to_await, # type: ignore[assignment] - *self._start_wave_coros(exclude_index=chosen_engine.index)) + # Notify coordinator that we're sending a request + await self.first_req_send_socket.send(chosen_engine.identity) await to_await self._ensure_output_queue_task() - def get_core_engine_for_request(self) -> CoreEngine: - return min(self.core_engines, key=lambda e: e.num_reqs_in_flight) - @staticmethod async def process_engine_outputs(self: "DPAsyncMPClient", outputs: EngineCoreOutputs): - if self.reqs_in_flight: - for req_id in outputs.finished_requests or (): - if engine := self.reqs_in_flight.pop(req_id, None): - engine.num_reqs_in_flight -= 1 - - if outputs.wave_complete is not None: - # Current wave is complete, move to next wave number - # and mark engines as paused. - if self.current_wave <= outputs.wave_complete: - self.current_wave = outputs.wave_complete + 1 - self.engines_running = False - - elif outputs.start_wave is not None and ( - outputs.start_wave > self.current_wave or - (outputs.start_wave == self.current_wave - and not self.engines_running)): - # Engine received request for a non-current wave so we must ensure - # that other engines progress to the next wave. 
- self.current_wave = outputs.start_wave - self.engines_running = True - await asyncio.gather(*self._start_wave_coros( - exclude_index=outputs.engine_index)) - - def _start_wave_coros(self, exclude_index: int) -> list[Awaitable[None]]: - logger.debug("Sending start DP wave %d.", self.current_wave) - return [ - self._send_input(EngineCoreRequestType.START_DP_WAVE, - self.current_wave, engine) - for engine in self.core_engines if engine.index != exclude_index - ] + if outputs.finished_requests and self.reqs_in_flight: + for req_id in outputs.finished_requests: + self.reqs_in_flight.pop(req_id, None) async def abort_requests_async(self, request_ids: list[str]) -> None: if not request_ids: diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 112896d6c7678..c856e2645a2c9 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -27,7 +27,10 @@ from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.parallel_sampling import ParentRequest from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor -from vllm.v1.metrics.loggers import StatLoggerFactory +from vllm.v1.metrics.loggers import (PrometheusStatLogger, StatLoggerBase, + StatLoggerFactory) +from vllm.v1.metrics.reader import Metric, get_metrics_snapshot +from vllm.v1.metrics.stats import IterationStats logger = init_logger(__name__) @@ -64,6 +67,11 @@ class LLMEngine: self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config + self.log_stats = log_stats + self.stat_logger: Optional[StatLoggerBase] = None + if self.log_stats: + self.stat_logger = PrometheusStatLogger(vllm_config) + # important: init dp group before init the engine_core # In the decoupled engine case this is handled in EngineCoreProc. parallel_config = vllm_config.parallel_config @@ -86,7 +94,7 @@ class LLMEngine: # OutputProcessor (convert EngineCoreOutputs --> RequestOutput). 
self.output_processor = OutputProcessor(self.tokenizer, - log_stats=False) + log_stats=self.log_stats) # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) self.engine_core = EngineCoreClient.make_client( @@ -94,7 +102,7 @@ class LLMEngine: asyncio_mode=False, vllm_config=vllm_config, executor_class=executor_class, - log_stats=False, # FIXME: implement + log_stats=self.log_stats, ) if not multiprocess_mode: @@ -223,12 +231,21 @@ class LLMEngine: outputs = self.engine_core.get_output() # 2) Process EngineCoreOutputs. + iteration_stats = IterationStats() if self.log_stats else None processed_outputs = self.output_processor.process_outputs( - outputs.outputs) + outputs.outputs, + engine_core_timestamp=outputs.timestamp, + iteration_stats=iteration_stats) # 3) Abort any reqs that finished due to stop strings. self.engine_core.abort_requests(processed_outputs.reqs_to_abort) + # 4) Record stats + if self.stat_logger is not None: + assert outputs.scheduler_stats is not None + self.stat_logger.record(scheduler_stats=outputs.scheduler_stats, + iteration_stats=iteration_stats) + return processed_outputs.request_outputs def get_vllm_config(self): @@ -260,6 +277,10 @@ class LLMEngine: def is_sleeping(self) -> bool: return self.engine_core.is_sleeping() + def get_metrics(self) -> list[Metric]: + assert self.log_stats, "Stat logging disabled" + return get_metrics_snapshot() + def get_tokenizer_group(self) -> TokenizerGroup: if self.tokenizer is None: raise ValueError("Unable to get tokenizer because " diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index a7a9b0e4a1613..293c291b43410 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -147,6 +147,7 @@ class RequestState: finish_reason: Optional[FinishReason], stop_reason: Union[int, str, None], kv_transfer_params: Optional[dict[str, Any]] = None, + num_cached_tokens: int = 0, ) -> Optional[RequestOutput]: finished = finish_reason is not 
None @@ -169,7 +170,7 @@ class RequestState: return None return self._new_request_output(request_id, outputs, finished, - kv_transfer_params) + kv_transfer_params, num_cached_tokens) def _new_request_output( self, @@ -177,6 +178,7 @@ class RequestState: outputs: list[CompletionOutput], finished: bool, kv_transfer_params: Optional[dict[str, Any]] = None, + num_cached_tokens: int = 0, ) -> RequestOutput: if self.output_kind == RequestOutputKind.DELTA: @@ -193,6 +195,7 @@ class RequestState: outputs=outputs, finished=finished, kv_transfer_params=kv_transfer_params, + num_cached_tokens=num_cached_tokens, ) def _new_completion_output( @@ -340,7 +343,7 @@ class OutputProcessor: finish_reason = engine_core_output.finish_reason stop_reason = engine_core_output.stop_reason kv_transfer_params = engine_core_output.kv_transfer_params - + num_cached_tokens = engine_core_output.num_cached_tokens req_state.is_prefilling = False # 2) Detokenize the token ids into text and perform stop checks. @@ -356,7 +359,7 @@ class OutputProcessor: # 4) Create and handle RequestOutput objects. if request_output := req_state.make_request_output( new_token_ids, finish_reason, stop_reason, - kv_transfer_params): + kv_transfer_params, num_cached_tokens): if req_state.queue is not None: # AsyncLLM: put into queue for handling by generate(). 
req_state.queue.put(request_output) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 74b226b45424f..eb5f9d4bfe004 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -38,7 +38,7 @@ logger = init_logger(__name__) POLLING_TIMEOUT_MS = 5000 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 -EXECUTE_MODEL_TIMEOUT_S = 40 +EXECUTE_MODEL_TIMEOUT_S = 300 class MultiprocExecutor(Executor): @@ -50,6 +50,7 @@ class MultiprocExecutor(Executor): self.is_failed = False self.shutdown_event = threading.Event() self.failure_callback: Optional[FailureCallback] = None + self.io_thread_pool: Optional[ThreadPoolExecutor] = None self.world_size = self.parallel_config.world_size tensor_parallel_size = self.parallel_config.tensor_parallel_size @@ -107,7 +108,6 @@ class MultiprocExecutor(Executor): # For pipeline parallel, we use a thread pool for asynchronous # execute_model. - self.io_thread_pool: Optional[ThreadPoolExecutor] = None if self.max_concurrent_batches > 1: # Note: must use only 1 IO thread to keep dequeue sequence # from the response queue diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 2b75a3a2ecbd3..665e5873d5891 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -12,13 +12,12 @@ from vllm.config import SupportsMetricsInfo, VllmConfig from vllm.logger import init_logger from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics from vllm.v1.engine import FinishReason +from vllm.v1.metrics.prometheus import unregister_vllm_metrics from vllm.v1.metrics.stats import IterationStats, SchedulerStats from vllm.v1.spec_decode.metrics import SpecDecodingLogging, SpecDecodingProm logger = init_logger(__name__) -_LOCAL_LOGGING_INTERVAL_SEC = 5.0 - StatLoggerFactory = Callable[[VllmConfig, int], "StatLoggerBase"] @@ -35,7 +34,7 @@ class StatLoggerBase(ABC): ... 
@abstractmethod - def record(self, scheduler_stats: SchedulerStats, + def record(self, scheduler_stats: Optional[SchedulerStats], iteration_stats: Optional[IterationStats]): ... @@ -78,20 +77,22 @@ class LoggingStatLogger(StatLoggerBase): # Compute summary metrics for tracked stats return float(np.sum(tracked_stats) / (now - self.last_log_time)) - def record(self, scheduler_stats: SchedulerStats, + def record(self, scheduler_stats: Optional[SchedulerStats], iteration_stats: Optional[IterationStats]): """Log Stats to standard output.""" if iteration_stats: self._track_iteration_stats(iteration_stats) - self.prefix_caching_metrics.observe(scheduler_stats.prefix_cache_stats) + if scheduler_stats is not None: + self.prefix_caching_metrics.observe( + scheduler_stats.prefix_cache_stats) - if scheduler_stats.spec_decoding_stats is not None: - self.spec_decoding_logging.observe( - scheduler_stats.spec_decoding_stats) + if scheduler_stats.spec_decoding_stats is not None: + self.spec_decoding_logging.observe( + scheduler_stats.spec_decoding_stats) - self.last_scheduler_stats = scheduler_stats + self.last_scheduler_stats = scheduler_stats def log(self): now = time.monotonic() @@ -131,10 +132,11 @@ class LoggingStatLogger(StatLoggerBase): self.spec_decoding_logging.log(log_fn=log_fn) def log_engine_initialized(self): - logger.info( - "vllm cache_config_info with initialization " \ - "after num_gpu_blocks is: %d", - self.vllm_config.cache_config.num_gpu_blocks) + if self.vllm_config.cache_config.num_gpu_blocks: + logger.info( + "Engine %03d: vllm cache_config_info with initialization " + "after num_gpu_blocks is: %d", self.engine_index, + self.vllm_config.cache_config.num_gpu_blocks) class PrometheusStatLogger(StatLoggerBase): @@ -144,7 +146,8 @@ class PrometheusStatLogger(StatLoggerBase): _spec_decoding_cls = SpecDecodingProm def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): - self._unregister_vllm_metrics() + + unregister_vllm_metrics() self.vllm_config = 
vllm_config self.engine_index = engine_index # Use this flag to hide metrics that were deprecated in @@ -169,11 +172,13 @@ class PrometheusStatLogger(StatLoggerBase): self.gauge_scheduler_running = self._gauge_cls( name="vllm:num_requests_running", documentation="Number of requests in model execution batches.", + multiprocess_mode="mostrecent", labelnames=labelnames).labels(*labelvalues) self.gauge_scheduler_waiting = self._gauge_cls( name="vllm:num_requests_waiting", documentation="Number of requests waiting to be processed.", + multiprocess_mode="mostrecent", labelnames=labelnames).labels(*labelvalues) # @@ -182,6 +187,7 @@ class PrometheusStatLogger(StatLoggerBase): self.gauge_gpu_cache_usage = self._gauge_cls( name="vllm:gpu_cache_usage_perc", documentation="GPU KV-cache usage. 1 means 100 percent usage.", + multiprocess_mode="mostrecent", labelnames=labelnames).labels(*labelvalues) self.counter_gpu_prefix_cache_queries = self._counter_cls( @@ -200,24 +206,24 @@ class PrometheusStatLogger(StatLoggerBase): # Counters # self.counter_num_preempted_reqs = self._counter_cls( - name="vllm:num_preemptions_total", + name="vllm:num_preemptions", documentation="Cumulative number of preemption from the engine.", labelnames=labelnames).labels(*labelvalues) self.counter_prompt_tokens = self._counter_cls( - name="vllm:prompt_tokens_total", + name="vllm:prompt_tokens", documentation="Number of prefill tokens processed.", labelnames=labelnames).labels(*labelvalues) self.counter_generation_tokens = self._counter_cls( - name="vllm:generation_tokens_total", + name="vllm:generation_tokens", documentation="Number of generation tokens processed.", labelnames=labelnames).labels(*labelvalues) self.counter_request_success: dict[FinishReason, prometheus_client.Counter] = {} counter_request_success_base = self._counter_cls( - name="vllm:request_success_total", + name="vllm:request_success", documentation="Count of successfully processed requests.", labelnames=labelnames + 
["finished_reason"]) for reason in FinishReason: @@ -242,6 +248,9 @@ class PrometheusStatLogger(StatLoggerBase): buckets=build_1_2_5_buckets(max_model_len), labelnames=labelnames).labels(*labelvalues) + # TODO: This metric might be incorrect in case of using multiple + # api_server counts which uses prometheus mp. + # See: https://github.com/vllm-project/vllm/pull/18053 self.histogram_iteration_tokens = \ self._histogram_cls( name="vllm:iteration_tokens_total", @@ -340,6 +349,9 @@ class PrometheusStatLogger(StatLoggerBase): # # LoRA metrics # + + # TODO: This metric might be incorrect in case of using multiple + # api_server counts which uses prometheus mp. self.gauge_lora_info: Optional[prometheus_client.Gauge] = None if vllm_config.lora_config is not None: self.labelname_max_lora = "max_lora" @@ -350,13 +362,16 @@ class PrometheusStatLogger(StatLoggerBase): self._gauge_cls( name="vllm:lora_requests_info", documentation="Running stats on lora requests.", + multiprocess_mode="sum", labelnames=[ self.labelname_max_lora, self.labelname_waiting_lora_adapters, self.labelname_running_lora_adapters, - ]) + ], + ) def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo): + metrics_info = config_obj.metrics_info() metrics_info["engine"] = self.engine_index @@ -372,25 +387,28 @@ class PrometheusStatLogger(StatLoggerBase): info_gauge = self._gauge_cls( name=name, documentation=documentation, - labelnames=metrics_info.keys()).labels(**metrics_info) + multiprocess_mode="mostrecent", + labelnames=metrics_info.keys(), + ).labels(**metrics_info) info_gauge.set(1) - def record(self, scheduler_stats: SchedulerStats, + def record(self, scheduler_stats: Optional[SchedulerStats], iteration_stats: Optional[IterationStats]): """Log to prometheus.""" - self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) - self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) + if scheduler_stats is not None: + 
self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) + self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) - self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage) + self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage) - self.counter_gpu_prefix_cache_queries.inc( - scheduler_stats.prefix_cache_stats.queries) - self.counter_gpu_prefix_cache_hits.inc( - scheduler_stats.prefix_cache_stats.hits) + self.counter_gpu_prefix_cache_queries.inc( + scheduler_stats.prefix_cache_stats.queries) + self.counter_gpu_prefix_cache_hits.inc( + scheduler_stats.prefix_cache_stats.hits) - if scheduler_stats.spec_decoding_stats is not None: - self.spec_decoding_prom.observe( - scheduler_stats.spec_decoding_stats) + if scheduler_stats.spec_decoding_stats is not None: + self.spec_decoding_prom.observe( + scheduler_stats.spec_decoding_stats) if iteration_stats is None: return @@ -445,13 +463,6 @@ class PrometheusStatLogger(StatLoggerBase): self.gauge_lora_info.labels(**lora_info_labels)\ .set_to_current_time() - @staticmethod - def _unregister_vllm_metrics(): - # Unregister any existing vLLM collectors (for CI/CD - for collector in list(prometheus_client.REGISTRY._collector_to_names): - if hasattr(collector, "_name") and "vllm" in collector._name: - prometheus_client.REGISTRY.unregister(collector) - def log_engine_initialized(self): self.log_metrics_info("cache_config", self.vllm_config.cache_config) diff --git a/vllm/v1/metrics/prometheus.py b/vllm/v1/metrics/prometheus.py new file mode 100644 index 0000000000000..a364b286d21b9 --- /dev/null +++ b/vllm/v1/metrics/prometheus.py @@ -0,0 +1,81 @@ +# SPDX-License-Identifier: Apache-2.0 + +import os +import tempfile +from typing import Optional + +from prometheus_client import REGISTRY, CollectorRegistry, multiprocess + +from vllm.logger import init_logger + +logger = init_logger(__name__) + +# Global temporary directory for prometheus multiprocessing +_prometheus_multiproc_dir: 
# Temporary directory backing PROMETHEUS_MULTIPROC_DIR when vLLM created it
# itself; stays None when the user supplied the directory via the environment.
_prometheus_multiproc_dir: Optional[tempfile.TemporaryDirectory] = None


def setup_multiprocess_prometheus():
    """Set up prometheus multiprocessing directory if not already configured.

    When ``PROMETHEUS_MULTIPROC_DIR`` is not set, a temporary directory is
    created, stored in the module-level ``_prometheus_multiproc_dir`` and
    exported through that environment variable. When the variable is already
    set, it is left untouched and a warning is logged instead.
    """
    global _prometheus_multiproc_dir

    if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
        # Make TemporaryDirectory for prometheus multiprocessing
        # Note: global TemporaryDirectory will be automatically
        # cleaned up upon exit.
        _prometheus_multiproc_dir = tempfile.TemporaryDirectory()
        os.environ["PROMETHEUS_MULTIPROC_DIR"] = _prometheus_multiproc_dir.name
        logger.debug("Created PROMETHEUS_MULTIPROC_DIR at %s",
                     _prometheus_multiproc_dir.name)
    else:
        logger.warning("Found PROMETHEUS_MULTIPROC_DIR was set by user. "
                       "This directory must be wiped between vLLM runs or "
                       "you will find inaccurate metrics. Unset the variable "
                       "and vLLM will properly handle cleanup.")


def get_prometheus_registry():
    """Get the appropriate prometheus registry based on multiprocessing
    configuration.

    Returns:
        A new ``CollectorRegistry`` with a ``MultiProcessCollector`` attached
        when ``PROMETHEUS_MULTIPROC_DIR`` is set, otherwise the global
        ``REGISTRY``.
    """
    if os.getenv("PROMETHEUS_MULTIPROC_DIR") is not None:
        logger.debug("Using multiprocess registry for prometheus metrics")
        registry = CollectorRegistry()
        multiprocess.MultiProcessCollector(registry)
        return registry

    return REGISTRY


def unregister_vllm_metrics():
    """Unregister any existing vLLM collectors from the prometheus registry.

    This is useful for testing and CI/CD where metrics may be registered
    multiple times across test runs.

    Also, in case of multiprocess, we need to unregister the metrics from the
    global registry.
    """
    registry = REGISTRY
    # Iterate over a copy: unregister() mutates the registry's collector map.
    for collector in list(registry._collector_to_names):
        if hasattr(collector, "_name") and "vllm" in collector._name:
            registry.unregister(collector)


def shutdown_prometheus():
    """Mark this process's prometheus multiprocess metrics as dead.

    Does nothing when ``_prometheus_multiproc_dir`` is ``None``, i.e. when
    ``setup_multiprocess_prometheus`` did not create the directory. Cleanup is
    best-effort: any failure is logged and not propagated.
    """
    multiproc_dir = _prometheus_multiproc_dir
    if multiproc_dir is None:
        return
    try:
        pid = os.getpid()
        # The module global holds the TemporaryDirectory *object*;
        # mark_process_dead() needs the directory path, so pass ``.name``.
        multiprocess.mark_process_dead(pid, multiproc_dir.name)
        logger.debug("Marked Prometheus metrics for process %d as dead", pid)
    except Exception as e:
        logger.error("Error during metrics cleanup: %s", str(e))
A '+Inf' key always + exists. + + The count property is the total count across all + buckets, identical to the count of the '+Inf' bucket. + + The sum property is the total sum of all observed + values. + """ + count: int + sum: float + buckets: dict[str, int] + + +def get_metrics_snapshot() -> list[Metric]: + """An API for accessing in-memory Prometheus metrics. + + Example: + >>> for metric in llm.get_metrics(): + ... if isinstance(metric, Counter): + ... print(f"{metric} = {metric.value}") + ... elif isinstance(metric, Gauge): + ... print(f"{metric} = {metric.value}") + ... elif isinstance(metric, Histogram): + ... print(f"{metric}") + ... print(f" sum = {metric.sum}") + ... print(f" count = {metric.count}") + ... for bucket_le, value in metrics.buckets.items(): + ... print(f" {bucket_le} = {value}") + """ + collected: list[Metric] = [] + for metric in REGISTRY.collect(): + if not metric.name.startswith("vllm:"): + continue + if metric.type == "gauge": + samples = _get_samples(metric) + for s in samples: + collected.append( + Gauge(name=metric.name, labels=s.labels, value=s.value)) + elif metric.type == "counter": + samples = _get_samples(metric, "_total") + if metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos": + # + # Ugly vllm:num_accepted_tokens_per_pos special case. + # + # This metric is a vector of counters - for each spec + # decoding token position, we observe the number of + # accepted tokens using a Counter labeled with 'position'. + # We convert these into a vector of integer values. + # + for labels, values in _digest_num_accepted_by_pos_samples( + samples): + collected.append( + Vector(name=metric.name, labels=labels, values=values)) + else: + for s in samples: + collected.append( + Counter(name=metric.name, + labels=s.labels, + value=int(s.value))) + + elif metric.type == "histogram": + # + # A histogram has a number of '_bucket' samples where + # the 'le' label represents the upper limit of the bucket. 
def _get_samples(metric: PromMetric,
                 suffix: Optional[str] = None) -> list[Sample]:
    """Return the samples of ``metric`` whose name is ``metric.name + suffix``.

    With ``suffix=None`` only samples named exactly ``metric.name`` are
    returned.
    """
    name = (metric.name + suffix) if suffix is not None else metric.name
    return [s for s in metric.samples if s.name == name]


def _strip_label(labels: dict[str, str], key_to_remove: str) -> dict[str, str]:
    """Return a copy of ``labels`` without ``key_to_remove``.

    The input dict is not modified. Raises ``KeyError`` when the key is
    absent.
    """
    labels_copy = labels.copy()
    labels_copy.pop(key_to_remove)
    return labels_copy


def _digest_histogram(
    bucket_samples: list[Sample], count_samples: list[Sample],
    sum_samples: list[Sample]
) -> list[tuple[dict[str, str], dict[str, int], int, float]]:
    """Group histogram samples into one entry per distinct label set.

    Args:
        bucket_samples: the ``_bucket`` samples; each carries an ``le`` label
            in addition to the labels identifying the series.
        count_samples: the ``_count`` samples, one per series.
        sum_samples: the ``_sum`` samples, one per series.

    Returns:
        One ``(labels, buckets, count, sum)`` tuple per label set, in order
        of first appearance in ``bucket_samples``. ``labels`` excludes ``le``
        and ``buckets`` maps each ``le`` value to its integer count.

    Raises:
        ValueError: if the three sample lists do not cover exactly the same
            label sets.
    """
    #
    # In the case of DP, we have an indigestible
    # per-bucket-per-engine count as a list of labelled
    # samples, along with total and sum samples
    #
    # bucket_samples (in):
    #   labels = {le: 100,  idx: 0}, value = 2
    #   labels = {le: 200,  idx: 0}, value = 4
    #   labels = {le: +Inf, idx: 0}, value = 10
    #   labels = {le: 100,  idx: 1}, value = 1
    #   labels = {le: 200,  idx: 1}, value = 5
    #   labels = {le: +Inf, idx: 1}, value = 7
    # count_samples (in):
    #   labels = {idx: 0}, value = 10
    #   labels = {idx: 1}, value = 7
    # sum_samples (in):
    #   labels = {idx: 0}, value = 2000
    #   labels = {idx: 1}, value = 1200
    #
    # output: [
    #   {idx: 0}, {"100": 2, "200": 4, "+Inf": 10}, 10, 2000
    #   {idx: 1}, {"100": 1, "200": 5, "+Inf": 7}, 7, 1200
    # ]
    buckets_by_labels: dict[frozenset[tuple[str, str]], dict[str, int]] = {}
    for s in bucket_samples:
        bucket = s.labels["le"]
        labels_key = frozenset(_strip_label(s.labels, "le").items())
        buckets_by_labels.setdefault(labels_key, {})[bucket] = int(s.value)

    counts_by_labels: dict[frozenset[tuple[str, str]], int] = {
        frozenset(s.labels.items()): int(s.value)
        for s in count_samples
    }
    sums_by_labels: dict[frozenset[tuple[str, str]], float] = {
        frozenset(s.labels.items()): s.value
        for s in sum_samples
    }

    # Explicit check instead of ``assert`` so that it still runs under -O.
    if not (buckets_by_labels.keys() == counts_by_labels.keys() ==
            sums_by_labels.keys()):
        raise ValueError(
            "Histogram bucket, count and sum samples do not share the same "
            "label sets")

    return [(dict(labels_key), buckets, counts_by_labels[labels_key],
             sums_by_labels[labels_key])
            for labels_key, buckets in buckets_by_labels.items()]


def _digest_num_accepted_by_pos_samples(
        samples: list[Sample]) -> list[tuple[dict[str, str], list[int]]]:
    """Turn per-position counter samples into one value vector per label set.

    Each sample carries a ``position`` label. The result holds one
    ``(labels, values)`` pair per distinct set of remaining labels, where
    ``values[position]`` is the integer sample value. Every vector has length
    ``max position seen across all samples + 1``; positions without a sample
    are ``0``.
    """
    #
    # In the case of DP, we have an indigestible
    # per-position-per-engine count as a list of
    # labelled samples
    #
    # samples (in):
    #   labels = {pos: 0, idx: 0}, value = 10
    #   labels = {pos: 1, idx: 0}, value = 7
    #   labels = {pos: 2, idx: 0}, value = 2
    #   labels = {pos: 0, idx: 1}, value = 5
    #   labels = {pos: 1, idx: 1}, value = 3
    #   labels = {pos: 2, idx: 1}, value = 1
    #
    # output: [
    #   {idx: 0}, [10, 7, 2]
    #   {idx: 1}, [5, 3, 1]
    # ]
    #
    max_pos = 0
    values_by_labels: dict[frozenset[tuple[str, str]], dict[int, int]] = {}

    for s in samples:
        position = int(s.labels["position"])
        max_pos = max(max_pos, position)

        labels_key = frozenset(_strip_label(s.labels, "position").items())
        values_by_labels.setdefault(labels_key, {})[position] = int(s.value)

    output = []
    for labels_key, values_by_position in values_by_labels.items():
        values = [0] * (max_pos + 1)
        for pos, val in values_by_position.items():
            values[pos] = val
        output.append((dict(labels_key), values))
    return output
+ self.num_cached_tokens = -1 + @classmethod def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": if request.mm_inputs is not None: @@ -86,13 +91,13 @@ class Request: return cls( request_id=request.request_id, + client_index=request.client_index, prompt_token_ids=request.prompt_token_ids, multi_modal_inputs=request.mm_inputs, multi_modal_hashes=request.mm_hashes, multi_modal_placeholders=request.mm_placeholders, sampling_params=request.sampling_params, eos_token_id=request.eos_token_id, - arrival_time=request.arrival_time, lora_request=request.lora_request, structured_output_request=StructuredOutputRequest( sampling_params=request.sampling_params), diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index 5d8b3f423b025..4a5fbb10d408b 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -89,18 +89,18 @@ class TopKTopPSampler(nn.Module): p: Optional[torch.Tensor], ) -> torch.Tensor: """More optimized implementation for top-k and top-p sampling.""" - probs = logits.softmax(dim=-1, dtype=torch.float32) if k is None and p is None: # We prefer `random_sample` over `flashinfer_sample` when sorting is # not needed. This is because `random_sample` does not require # CPU-GPU synchronization while `flashinfer_sample` does. + probs = logits.softmax(dim=-1, dtype=torch.float32) return random_sample(probs, generators) if generators: logger.warning("FlashInfer 0.2.3+ does not support " "per-request generators. 
Falling back to " "PyTorch-native implementation.") return self.forward_native(logits, generators, k, p) - return flashinfer_sample(probs, k, p, generators) + return flashinfer_sample(logits, k, p, generators) def forward_tpu( self, @@ -254,17 +254,17 @@ def random_sample( def flashinfer_sample( - probs: torch.Tensor, + logits: torch.Tensor, k: Optional[torch.Tensor], p: Optional[torch.Tensor], generators: dict[int, torch.Generator], ) -> torch.Tensor: - """Sample from the probabilities using FlashInfer. + """Sample from the logits using FlashInfer. Statistically, this function is equivalent to the `random_sample` function. However, this function is faster because it avoids sorting the logits tensor via rejection sampling. - + NOTE: The outputs of this function do not necessarily match the outputs of the `random_sample` function. It only guarantees that the outputs are statistically equivalent. @@ -274,18 +274,19 @@ def flashinfer_sample( the synchronization overhead. """ assert not (k is None and p is None) - if k is None: # Top-p only. + probs = logits.softmax(dim=-1, dtype=torch.float32) next_token_ids = flashinfer.sampling.top_p_sampling_from_probs( probs, p, deterministic=True) elif p is None: # Top-k only. + probs = logits.softmax(dim=-1, dtype=torch.float32) next_token_ids = flashinfer.sampling.top_k_sampling_from_probs( probs, k, deterministic=True) else: # Both top-k and top-p. 
- next_token_ids = (flashinfer.sampling.top_k_top_p_sampling_from_probs( - probs, k, p, deterministic=True)) + next_token_ids = flashinfer.sampling.top_k_top_p_sampling_from_logits( + logits, k, p, deterministic=True) return next_token_ids.view(-1) diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 0dcf02113f5a8..78f37c1e8b218 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -158,10 +158,8 @@ class MsgpackEncoder: self, obj: torch.Tensor ) -> tuple[str, tuple[int, ...], Union[int, memoryview]]: assert self.aux_buffers is not None - # this creates a copy of the tensor if it's not already contiguous - obj = obj.contiguous() - # view the tensor as a 1D array of bytes - arr = obj.view((obj.numel(), )).view(torch.uint8).numpy() + # view the tensor as a contiguous 1D array of bytes + arr = obj.flatten().contiguous().view(torch.uint8).numpy() if obj.nbytes < self.size_threshold: # Smaller tensors are encoded inline, just like ndarrays. data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, arr.data) @@ -169,7 +167,7 @@ class MsgpackEncoder: # Otherwise encode index of backing buffer to avoid copy. data = len(self.aux_buffers) self.aux_buffers.append(arr.data) - dtype = str(obj.dtype)[6:] # remove 'torch.' prefix + dtype = str(obj.dtype).removeprefix("torch.") return dtype, obj.shape, data def _encode_nested_tensors(self, nt: NestedTensors) -> Any: @@ -245,7 +243,7 @@ class MsgpackDecoder: # zero-copy decode. We assume the ndarray will not be kept around, # as it now locks the whole received message buffer in memory. buffer = self.aux_buffers[data] if isinstance(data, int) else data - return np.ndarray(buffer=buffer, dtype=np.dtype(dtype), shape=shape) + return np.frombuffer(buffer, dtype=dtype).reshape(shape) def _decode_tensor(self, arr: Any) -> torch.Tensor: dtype, shape, data = arr @@ -254,12 +252,15 @@ class MsgpackDecoder: # not complain about a readonly memoryview. 
buffer = self.aux_buffers[data] if isinstance(data, int) \ else bytearray(data) - # Create numpy wrapper around the bytes - arr = np.ndarray(buffer=buffer, dtype=np.uint8, shape=(len(buffer), )) torch_dtype = getattr(torch, dtype) assert isinstance(torch_dtype, torch.dtype) + if not buffer: # torch.frombuffer doesn't like empty buffers + assert 0 in shape + return torch.empty(shape, dtype=torch_dtype) + # Create uint8 array + arr = torch.frombuffer(buffer, dtype=torch.uint8) # Convert back to proper shape & type - return torch.from_numpy(arr).view(torch_dtype).view(shape) + return arr.view(torch_dtype).view(shape) def _decode_mm_items(self, obj: list) -> list[MultiModalKwargsItem]: decoded_items = [] diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 5b84bc1f5ec39..1ca8564231659 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -4,17 +4,18 @@ import torch.nn as nn from vllm.attention.layer import Attention from vllm.config import (CompilationLevel, VllmConfig, - get_layers_from_vllm_config, set_current_vllm_config) + get_layers_from_vllm_config) from vllm.distributed.parallel_state import get_pp_group from vllm.forward_context import set_forward_context from vllm.logger import init_logger -from vllm.model_executor.model_loader import get_model_loader -from vllm.model_executor.model_loader.utils import set_default_torch_dtype -from vllm.model_executor.models import ModelRegistry +from vllm.model_executor.model_loader import get_model +from vllm.model_executor.models import supports_multimodal from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM -from vllm.triton_utils import tl, triton -from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata +from vllm.v1.attention.backends.flash_attn import (CommonAttentionMetadata, + FlashAttentionMetadata) +from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.sample.metadata import SamplingMetadata +from 
vllm.v1.spec_decode.utils import prepare_eagle_input_kernel logger = init_logger(__name__) @@ -27,12 +28,15 @@ class EagleProposer: self, vllm_config: VllmConfig, device: torch.device, + runner=None, ): self.vllm_config = vllm_config self.speculative_config = vllm_config.speculative_config self.draft_model_config = self.speculative_config.draft_model_config self.method = self.speculative_config.method + self.runner = runner + self.dtype = vllm_config.model_config.dtype self.max_model_len = vllm_config.model_config.max_model_len self.block_size = vllm_config.cache_config.block_size @@ -108,24 +112,51 @@ class EagleProposer: # FA requires seq_len to have dtype int32. seq_lens = (target_positions[last_token_indices] + 1).int() - # FIXME(woosuk): The below two ops cause synchronization. Optimize. - max_seq_len = seq_lens.max().item() - max_num_tokens = (cu_num_tokens[1:] - cu_num_tokens[:-1]).max().item() - attn_metadata = FlashAttentionMetadata( - num_actual_tokens=num_tokens, - max_query_len=max_num_tokens, - query_start_loc=cu_num_tokens, - max_seq_len=max_seq_len, - seq_lens=seq_lens, - block_table=block_table, - slot_mapping=target_slot_mapping, - # TODO(woosuk): Support cascade attention. - use_cascade=False, - common_prefix_len=0, - cu_prefix_query_lens=None, - prefix_kv_lens=None, - suffix_kv_lens=None, - ) + if self.method in ["eagle", "eagle3"]: + # FIXME(woosuk): The below two ops cause synchronization. Optimize. + max_seq_len = seq_lens.max().item() + max_num_tokens = (cu_num_tokens[1:] - + cu_num_tokens[:-1]).max().item() + attn_metadata = FlashAttentionMetadata( + num_actual_tokens=num_tokens, + max_query_len=max_num_tokens, + query_start_loc=cu_num_tokens, + max_seq_len=max_seq_len, + seq_lens=seq_lens, + block_table=block_table, + slot_mapping=target_slot_mapping, + # TODO(woosuk): Support cascade attention. 
+ use_cascade=False, + common_prefix_len=0, + cu_prefix_query_lens=None, + prefix_kv_lens=None, + suffix_kv_lens=None, + ) + elif self.method == "deepseek_mtp": + query_lens = cu_num_tokens[1:] - cu_num_tokens[:-1] + max_query_len = query_lens.max().item() + + common_attn_metadata = CommonAttentionMetadata( + query_start_loc=cu_num_tokens, seq_lens=seq_lens) + + assert self.runner is not None + + # FIXME: need to consider multiple kv_cache_groups + attn_metadata = self.runner.attn_metadata_builder.build( + num_reqs=batch_size, + num_actual_tokens=num_tokens, + max_query_len=max_query_len, + common_prefix_len=0, + common_attn_metadata=common_attn_metadata, + ) + else: + raise ValueError(f"Unsupported method: {self.method}") + + # At this moment, we assume all eagle layers belong to the same KV + # cache group, thus using the same attention metadata. + per_layer_attn_metadata = {} + for layer_name in self.attn_layer_names: + per_layer_attn_metadata[layer_name] = attn_metadata if self.use_cuda_graph and \ num_tokens <= self.cudagraph_batch_sizes[-1]: num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens) @@ -135,14 +166,18 @@ class EagleProposer: self.positions[:num_tokens] = target_positions self.hidden_states[:num_tokens] = target_hidden_states - with set_forward_context(attn_metadata, + with set_forward_context(per_layer_attn_metadata, self.vllm_config, num_tokens=num_input_tokens): - last_hidden_states, hidden_states = self.model( - input_ids=self.input_ids[:num_input_tokens], - positions=self.positions[:num_input_tokens], - hidden_states=self.hidden_states[:num_input_tokens], + ret_hidden_states = self.model( + self.input_ids[:num_input_tokens], + self.positions[:num_input_tokens], + self.hidden_states[:num_input_tokens], ) + if self.method == "deepseek_mtp": + last_hidden_states = ret_hidden_states + else: + last_hidden_states, hidden_states = ret_hidden_states sample_hidden_states = last_hidden_states[last_token_indices] logits = 
self.model.compute_logits(sample_hidden_states, None) draft_token_ids = logits.argmax(dim=-1) @@ -152,6 +187,10 @@ class EagleProposer: # [batch_size, 1] return draft_token_ids.view(-1, 1) + # TODO: Currently, MTP module released by deepseek only has + # one layer. Adapt this code to support multiple layers once + # there's a multi-layer MTP module. + # Generate the remaining draft tokens. draft_token_ids_list = [draft_token_ids] @@ -213,13 +252,13 @@ class EagleProposer: self.hidden_states[:batch_size] = hidden_states # Run the model. - with set_forward_context(attn_metadata, + with set_forward_context(per_layer_attn_metadata, self.vllm_config, num_tokens=input_batch_size): last_hidden_states, hidden_states = self.model( - input_ids=self.input_ids[:input_batch_size], - positions=self.positions[:input_batch_size], - hidden_states=self.hidden_states[:input_batch_size], + self.input_ids[:input_batch_size], + self.positions[:input_batch_size], + self.hidden_states[:input_batch_size], ) hidden_states = hidden_states[:batch_size] logits = self.model.compute_logits(last_hidden_states[:batch_size], @@ -239,6 +278,7 @@ class EagleProposer: cu_target_query_lens: torch.Tensor, # [batch_size] num_rejected_tokens: torch.Tensor, + num_tokens: int, ) -> tuple[torch.Tensor, torch.Tensor]: # cu_target_query_lens: [0, a, a + b, a + b + c] # num_rejected_tokens: [n1, n2, n3] @@ -256,21 +296,16 @@ class EagleProposer: # [a - n1, b - n2, c - n3] -> # [0, a - n1, a + b - n1 - n2, a + b + c - n1 - n2 - n3] - cu_num_tokens = torch.empty_like(cu_target_query_lens) + cu_num_tokens = torch.zeros_like(cu_target_query_lens) torch.cumsum(num_tokens_per_req, dim=0, out=cu_num_tokens[1:]) - cu_num_tokens[0] = 0 - - # FIXME(woosuk): Avoid synchronization. 
- num_tokens = cu_num_tokens[-1].item() token_indices = torch.empty( num_tokens, dtype=torch.int32, - device=cu_num_tokens.device, + device=cu_target_query_lens.device, ) - batch_size = num_rejected_tokens.shape[0] BLOCK_SIZE = 1024 - prepare_input_kernel[(batch_size, )]( + prepare_eagle_input_kernel[(batch_size, )]( token_indices, cu_target_query_lens, cu_num_tokens, @@ -279,48 +314,28 @@ class EagleProposer: return cu_num_tokens, token_indices def load_model(self, target_model: nn.Module) -> None: - loader = get_model_loader(self.vllm_config.load_config) - target_layer_num = self.vllm_config.model_config.get_num_layers( - self.vllm_config.parallel_config) + draft_model_config = \ + self.vllm_config.speculative_config.draft_model_config target_attn_layer_names = set( get_layers_from_vllm_config(self.vllm_config, Attention).keys()) - draft_model_config = \ - self.vllm_config.speculative_config.draft_model_config - # FIXME(lily): This does not handle with distributed inference. - target_device = self.vllm_config.device_config.device - # We need to set the vllm_config here to register attention - # layers in the forward context. 
- with set_default_torch_dtype( - draft_model_config.dtype), set_current_vllm_config( - self.vllm_config): - draft_model_cls, arch = ModelRegistry.resolve_model_cls( - draft_model_config.architectures) - self.model = draft_model_cls( - vllm_config=self.vllm_config, - start_layer_id=target_layer_num).to(target_device) + self.model = get_model(vllm_config=self.vllm_config, + model_config=draft_model_config) draft_attn_layer_names = ( get_layers_from_vllm_config(self.vllm_config, Attention).keys() - target_attn_layer_names) - assert len(draft_attn_layer_names) == 1 - self.attn_layer_name = next(iter(draft_attn_layer_names)) - loaded_weights = self.model.load_weights( - loader.get_all_weights(draft_model_config, self.model)) + + self.attn_layer_names = list(draft_attn_layer_names) # share embed_tokens with the target model if needed if get_pp_group().world_size == 1: - assert "model.embed_tokens.weight" not in loaded_weights, \ - "For PP = 1, Eagle draft should share embed with target model" logger.info( "The EAGLE head shares the same vocab embedding" \ " with the target model." ) self.model.model.embed_tokens = target_model.model.embed_tokens else: - assert "model.embed_tokens.weight" in loaded_weights, \ - "For PP > 1, Eagle draft checkpoint should its own copy of " - " the model.embed_tokens.weight" logger.info( "Since PP > 1, the EAGLE head loaded its own vocab embedding" \ " weights instead of sharing them with the target model." 
@@ -332,7 +347,10 @@ class EagleProposer: if self.vllm_config.speculative_config.method != "eagle3" and \ hasattr(target_model, "lm_head"): logger.info("Loading EAGLE LM head weights from the target model.") - self.model.lm_head = target_model.lm_head + if supports_multimodal(target_model): + self.model.lm_head = target_model.get_language_model().lm_head + else: + self.model.lm_head = target_model.lm_head @torch.inference_mode() def dummy_run( @@ -342,11 +360,30 @@ class EagleProposer: with set_forward_context(None, self.vllm_config, num_tokens=num_tokens): self.model( - input_ids=self.input_ids[:num_tokens], - positions=self.positions[:num_tokens], - hidden_states=self.hidden_states[:num_tokens], + self.input_ids[:num_tokens], + self.positions[:num_tokens], + self.hidden_states[:num_tokens], ) + def validate_same_kv_cache_group(self, + kv_cache_config: KVCacheConfig) -> None: + """ + Validate that all eagle layers belong to the same KVCacheGroup. + Need this assumption to ensure all eagle layers can use the + same AttentionMetadata. + May extend to multiple AttentionMetadata in the future. + """ + kv_cache_groups: dict[str, int] = {} + for id, kv_cache_group in enumerate(kv_cache_config.kv_cache_groups): + for layer_name in kv_cache_group.layer_names: + kv_cache_groups[layer_name] = id + assert len( + set([ + kv_cache_groups[layer_name] + for layer_name in self.attn_layer_names + ]) + ) == 1, "All eagle layers should belong to the same kv cache group" + # NOTE(woosuk): Currently, the below code is not used and we always use argmax # to sample the draft tokens. 
We will use this after we find a way to manage @@ -389,29 +426,3 @@ def compute_probs_and_sample_next_token( next_token_ids, ) return next_token_ids, probs - - -@triton.jit -def prepare_input_kernel( - out_ptr, - cu_query_lens_ptr, - cu_num_tokens_ptr, - BLOCK_SIZE: tl.constexpr, -): - pid = tl.program_id(0) - - # [start_pos, end_pos) - start_pos = tl.load(cu_num_tokens_ptr + pid) - end_pos = tl.load(cu_num_tokens_ptr + pid + 1) - num_tokens = end_pos - start_pos - - index_start = tl.load(cu_query_lens_ptr + pid) - - num_blocks = tl.cdiv(num_tokens, BLOCK_SIZE) - for i in tl.range(num_blocks): - offset = i * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) - tl.store( - out_ptr + start_pos + offset, - index_start + offset, - mask=offset < num_tokens, - ) diff --git a/vllm/v1/spec_decode/medusa.py b/vllm/v1/spec_decode/medusa.py index 14bc9c9e0d1a3..fdac2ef64c3f7 100644 --- a/vllm/v1/spec_decode/medusa.py +++ b/vllm/v1/spec_decode/medusa.py @@ -3,12 +3,10 @@ import torch import torch.nn as nn -from vllm.config import VllmConfig, set_current_vllm_config +from vllm.config import VllmConfig from vllm.forward_context import set_forward_context from vllm.logger import init_logger -from vllm.model_executor.model_loader import get_model_loader -from vllm.model_executor.model_loader.utils import set_default_torch_dtype -from vllm.model_executor.models.medusa import Medusa +from vllm.model_executor.model_loader import get_model from vllm.v1.sample.metadata import SamplingMetadata # Initialize logger @@ -49,20 +47,9 @@ class MedusaProposer: return [list(row) for row in zip(*draft_tokens)] def load_model(self, target_model: nn.Module) -> None: - # Get model loader and config - loader = get_model_loader(self.vllm_config.load_config) - draft_config = self.vllm_config.speculative_config.draft_model_config - - # Load model with proper dtype and config - with set_default_torch_dtype(draft_config.dtype), \ - set_current_vllm_config(self.vllm_config): - self.model = Medusa( - 
vllm_config=self.vllm_config.speculative_config).to( - self.device) - - # Load model weights - weights = loader.get_all_weights(draft_config, self.model) - self.model.load_weights(weights) + self.model = get_model(vllm_config=self.vllm_config, + model_config=self.vllm_config. + speculative_config.draft_model_config) @torch.inference_mode() def dummy_run(self, num_tokens: int) -> None: diff --git a/vllm/v1/spec_decode/metrics.py b/vllm/v1/spec_decode/metrics.py index 899aa9200e85e..36091bef28959 100644 --- a/vllm/v1/spec_decode/metrics.py +++ b/vllm/v1/spec_decode/metrics.py @@ -134,17 +134,17 @@ class SpecDecodingProm: self.counter_spec_decode_num_drafts = \ self._counter_cls( - name="vllm:spec_decode_num_drafts_total", + name="vllm:spec_decode_num_drafts", documentation="Number of spec decoding drafts.", labelnames=labelnames).labels(*labelvalues) self.counter_spec_decode_num_draft_tokens = \ self._counter_cls( - name="vllm:spec_decode_num_draft_tokens_total", + name="vllm:spec_decode_num_draft_tokens", documentation="Number of draft tokens.", labelnames=labelnames,).labels(*labelvalues) self.counter_spec_decode_num_accepted_tokens = \ self._counter_cls( - name="vllm:spec_decode_num_accepted_tokens_total", + name="vllm:spec_decode_num_accepted_tokens", documentation="Number of accepted tokens.", labelnames=labelnames).labels(*labelvalues) diff --git a/vllm/v1/spec_decode/utils.py b/vllm/v1/spec_decode/utils.py index ce81a40ee3ae1..334258e7f87ae 100644 --- a/vllm/v1/spec_decode/utils.py +++ b/vllm/v1/spec_decode/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +from vllm.triton_utils import tl, triton from vllm.v1.worker.gpu_input_batch import InputBatch @@ -16,3 +17,29 @@ def is_spec_decode_supported(req_id: str, input_batch: InputBatch) -> bool: return False return True + + +@triton.jit +def prepare_eagle_input_kernel( + out_ptr, + cu_query_lens_ptr, + cu_num_tokens_ptr, + BLOCK_SIZE: tl.constexpr, +): + pid = tl.program_id(0) + + # [start_pos, 
end_pos) + start_pos = tl.load(cu_num_tokens_ptr + pid) + end_pos = tl.load(cu_num_tokens_ptr + pid + 1) + num_tokens = end_pos - start_pos + + index_start = tl.load(cu_query_lens_ptr + pid) + + num_blocks = tl.cdiv(num_tokens, BLOCK_SIZE) + for i in tl.range(num_blocks): + offset = i * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + tl.store( + out_ptr + start_pos + offset, + index_start + offset, + mask=offset < num_tokens, + ) diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index c701ab1d35a58..07b422814e13a 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -149,31 +149,37 @@ class StructuredOutputManager: # NOTE: This outer loop can likely be parallelized to improve # performance of bitmask generation for large batches. for req_id, _ in ordered_seq: - request = requests[req_id].structured_output_request - if TYPE_CHECKING: - assert request is not None - assert request.grammar is not None + request = requests[req_id] + structured_output_request = request.structured_output_request - apply_bitmask = ( - request.reasoning_ended if self.reasoner is not None else True - ) # noqa: E501 + if TYPE_CHECKING: + assert structured_output_request is not None + assert structured_output_request.grammar is not None + apply_bitmask: bool = True + if self.reasoner is not None: + if structured_output_request.reasoning_ended is None: + structured_output_request.reasoning_ended = \ + self.reasoner.is_reasoning_end(request.prompt_token_ids) + apply_bitmask = structured_output_request.reasoning_ended state_advancements = 0 req_tokens = scheduled_spec_decode_tokens.get(req_id, []) + [None] for i, token in enumerate(req_tokens): - if apply_bitmask and not request.grammar.is_terminated(): - request.grammar.fill_bitmask(bitmask_tensor, - cumulative_index) + if apply_bitmask and not \ + structured_output_request.grammar.is_terminated(): + structured_output_request.grammar.fill_bitmask( + bitmask_tensor, 
cumulative_index) if token is not None: # In order to generate the correct bitmask for each # position in the speculative sequence, we advance # the FSM state for each speculative token and rollback # to restore the previous state when we are finished. - assert request.grammar.accept_tokens(req_id, [token]) + assert structured_output_request.grammar.accept_tokens( + req_id, [token]) state_advancements += 1 cumulative_index += 1 if state_advancements > 0: - request.grammar.rollback(state_advancements) + structured_output_request.grammar.rollback(state_advancements) if cumulative_index < bitmask_tensor.shape[0]: bitmask_tensor = bitmask_tensor[:cumulative_index] diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py index c16320b9e74c6..9a7e30d41aaa8 100644 --- a/vllm/v1/structured_output/request.py +++ b/vllm/v1/structured_output/request.py @@ -20,7 +20,7 @@ class StructuredOutputRequest: sampling_params: SamplingParams _grammar: Optional[Union[Future[StructuredOutputGrammar], StructuredOutputGrammar]] = None - reasoning_ended: bool = False + reasoning_ended: Optional[bool] = None def _check_grammar_completion(self) -> bool: # NOTE: We have to lazy import to gate circular imports diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py index f33f4972e1032..111e92dc0990d 100644 --- a/vllm/v1/structured_output/utils.py +++ b/vllm/v1/structured_output/utils.py @@ -2,7 +2,7 @@ from __future__ import annotations -import re +import regex as re def grammar_is_likely_lark(grammar_str: str) -> bool: diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 0758747a83cc6..a26794561a526 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -1,31 +1,41 @@ # SPDX-License-Identifier: Apache-2.0 -import os +import argparse +import multiprocessing import time import weakref from collections import defaultdict from collections.abc import Sequence +from dataclasses import dataclass +from enum import Enum, auto from 
multiprocessing import Process, connection -from typing import (TYPE_CHECKING, Callable, Generic, Optional, TypeVar, Union, - overload) +from multiprocessing.process import BaseProcess +from typing import (TYPE_CHECKING, Any, Callable, Generic, Optional, TypeVar, + Union, overload) +import msgspec import torch +import zmq -from vllm.config import VllmConfig +from vllm.config import CacheConfig, ParallelConfig, VllmConfig from vllm.logger import init_logger from vllm.model_executor.models.utils import extract_layer_index from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, usage_message) -from vllm.utils import get_mp_context, kill_process_tree +from vllm.utils import (get_mp_context, get_open_port, get_open_zmq_ipc_path, + get_tcp_uri, kill_process_tree) from vllm.v1.executor.abstract import Executor if TYPE_CHECKING: from vllm.attention.layer import Attention + from vllm.v1.engine.coordinator import DPCoordinator logger = init_logger(__name__) T = TypeVar("T") +STARTUP_POLL_PERIOD_MS = 10000 + class ConstantList(Generic[T], Sequence): @@ -95,6 +105,78 @@ class ConstantList(Generic[T], Sequence): return f"ConstantList({self._x})" +def get_engine_client_zmq_addr(local_only: bool, + host: str, + port: int = 0) -> str: + return get_open_zmq_ipc_path() if local_only else (get_tcp_uri( + host, port or get_open_port())) + + +class APIServerProcessManager: + """Manages a group of API server processes. + + Handles creation, monitoring, and termination of API server worker + processes. Also monitors extra processes to check if they are healthy. + """ + + def __init__( + self, + target_server_fn: Callable, + listen_address: str, + sock: Any, + args: argparse.Namespace, + num_servers: int, + input_addresses: list[str], + output_addresses: list[str], + stats_update_address: Optional[str] = None, + ): + """Initialize and start API server worker processes. 
+ + Args: + target_server_fn: Function to call for each API server process + listen_address: Address to listen for client connections + sock: Socket for client connections + args: Command line arguments + num_servers: Number of API server processes to start + input_addresses: Input addresses for each API server + output_addresses: Output addresses for each API server + stats_update_address: Optional stats update address + """ + self.listen_address = listen_address + self.sock = sock + self.args = args + + # Start API servers + spawn_context = multiprocessing.get_context("spawn") + self.processes: list[BaseProcess] = [] + + for i, in_addr, out_addr in zip(range(num_servers), input_addresses, + output_addresses): + client_config = { + "input_address": in_addr, + "output_address": out_addr, + "client_index": i + } + if stats_update_address is not None: + client_config["stats_update_address"] = stats_update_address + + proc = spawn_context.Process(target=target_server_fn, + name=f"ApiServer_{i}", + args=(listen_address, sock, args, + client_config)) + self.processes.append(proc) + proc.start() + + logger.info("Started %d API server processes", len(self.processes)) + + # Shutdown only the API server processes on garbage collection + # The extra processes are managed by their owners + self._finalizer = weakref.finalize(self, shutdown, self.processes) + + def close(self) -> None: + self._finalizer() + + class CoreEngineProcManager: """ Utility class to handle creation, readiness, and shutdown @@ -109,7 +191,7 @@ class CoreEngineProcManager: local_start_index: int, vllm_config: VllmConfig, on_head_node: bool, - input_address: str, + handshake_address: str, executor_class: type[Executor], log_stats: bool, ): @@ -117,12 +199,12 @@ class CoreEngineProcManager: common_kwargs = { "vllm_config": vllm_config, "on_head_node": on_head_node, - "input_address": input_address, + "handshake_address": handshake_address, "executor_class": executor_class, "log_stats": log_stats, } - 
self.processes: list[Process] = [] + self.processes: list[BaseProcess] = [] for index in range(local_engine_count): local_index = local_start_index + index global_index = start_index + index @@ -135,8 +217,7 @@ class CoreEngineProcManager: "local_dp_rank": local_index, })) - self._finalizer = weakref.finalize(self, shutdown, self.processes, - input_address) + self._finalizer = weakref.finalize(self, shutdown, self.processes) try: for proc in self.processes: proc.start() @@ -164,9 +245,199 @@ class CoreEngineProcManager: } +class CoreEngineState(Enum): + NEW = auto() + CONNECTED = auto() + READY = auto() + + +class CoreEngine: + """One per data parallel rank.""" + + def __init__(self, index: int = 0, local: bool = True): + self.local = local + self.index = index + self.identity = index.to_bytes(2, "little") + + self.state = CoreEngineState.NEW + + +@dataclass +class EngineZmqAddresses: + # ZMQ input socket addresses for each front-end client (requests) + inputs: list[str] + # ZMQ output socket addresses for each front-end client (responses) + outputs: list[str] + # ZMQ input socket address of DP coordinator if applicable + coordinator_input: Optional[str] = None + # ZMQ output socket address of DP coordinator if applicable + coordinator_output: Optional[str] = None + + +@dataclass +class EngineHandshakeMetadata: + """Metadata sent to each engine process during startup handshake, + including addresses of the front-end ZMQ queues that they should + connect to. + """ + addresses: EngineZmqAddresses + parallel_config: dict[str, Union[int, str]] + + +def wait_for_engine_startup( + handshake_socket: zmq.Socket, + addresses: EngineZmqAddresses, + core_engines: list[CoreEngine], + parallel_config: ParallelConfig, + cache_config: CacheConfig, + proc_manager: Optional[CoreEngineProcManager], + coord_process: Optional[Process], +): + + # Wait for engine core process(es) to send ready messages. 
+ local_count = parallel_config.data_parallel_size_local + remote_count = len(core_engines) - local_count + # [local, remote] counts + conn_pending, start_pending = [local_count, remote_count], [0, 0] + poller = zmq.Poller() + poller.register(handshake_socket, zmq.POLLIN) + + if proc_manager is not None: + for sentinel in proc_manager.sentinels(): + poller.register(sentinel, zmq.POLLIN) + if coord_process is not None: + poller.register(coord_process.sentinel, zmq.POLLIN) + while any(conn_pending) or any(start_pending): + events = poller.poll(STARTUP_POLL_PERIOD_MS) + if not events: + if any(conn_pending): + logger.debug( + "Waiting for %d local, %d remote core engine proc(s) " + "to connect.", *conn_pending) + if any(start_pending): + logger.debug( + "Waiting for %d local, %d remote core engine proc(s) " + "to start.", *start_pending) + continue + if len(events) > 1 or events[0][0] != handshake_socket: + # One of the local core processes exited. + finished = proc_manager.finished_procs() if proc_manager else {} + if coord_process is not None and coord_process.exitcode is not None: + finished[coord_process.name] = coord_process.exitcode + raise RuntimeError("Engine core initialization failed. " + "See root cause above. " + f"Failed core proc(s): {finished}") + + # Receive HELLO and READY messages from the input socket. 
+ eng_identity, ready_msg_bytes = handshake_socket.recv_multipart() + eng_index = int.from_bytes(eng_identity, "little") + engine = next((e for e in core_engines if e.identity == eng_identity), + None) + if engine is None: + raise RuntimeError(f"Message from engine with unexpected data " + f"parallel rank: {eng_index}") + msg = msgspec.msgpack.decode(ready_msg_bytes) + status, local = msg["status"], msg["local"] + if local != engine.local: + raise RuntimeError(f"{status} message from " + f"{'local' if local else 'remote'} " + f"engine {eng_index}, expected it to be " + f"{'local' if engine.local else 'remote'}") + + if status == "HELLO" and engine.state == CoreEngineState.NEW: + + # Send init message with DP config info. + init_message = msgspec.msgpack.encode( + EngineHandshakeMetadata( + addresses=addresses, + parallel_config={ + "data_parallel_master_ip": + parallel_config.data_parallel_master_ip, + "data_parallel_master_port": + parallel_config.data_parallel_master_port, + "data_parallel_size": + parallel_config.data_parallel_size, + })) + handshake_socket.send_multipart((eng_identity, init_message), + copy=False) + conn_pending[0 if local else 1] -= 1 + start_pending[0 if local else 1] += 1 + engine.state = CoreEngineState.CONNECTED + elif status == "READY" and (engine.state == CoreEngineState.CONNECTED): + # Setup KV cache config with initialization state from + # engine core process. Sum values from all engines in DP case. 
+ num_gpu_blocks = cache_config.num_gpu_blocks or 0 + num_gpu_blocks += msg["num_gpu_blocks"] + cache_config.num_gpu_blocks = num_gpu_blocks + + start_pending[0 if local else 1] -= 1 + engine.state = CoreEngineState.READY + else: + raise RuntimeError(f"Unexpected {status} message for " + f"{'local' if local else 'remote'} engine " + f"{eng_index} in {engine.state} state.") + + logger.debug("%s from %s core engine process %s.", status, + "local" if local else "remote", eng_index) + + +def wait_for_completion_or_failure( + api_server_manager: APIServerProcessManager, + local_engine_manager: Optional[CoreEngineProcManager] = None, + coordinator: Optional["DPCoordinator"] = None) -> None: + """Wait for all processes to complete or detect if any fail. + + Raises an exception if any process exits with a non-zero status. + """ + + try: + logger.info("Waiting for API servers to complete ...") + # Create a mapping of sentinels to their corresponding processes + # for efficient lookup + sentinel_to_proc: dict[Any, BaseProcess] = { + proc.sentinel: proc + for proc in api_server_manager.processes + } + + if coordinator: + sentinel_to_proc[coordinator.proc.sentinel] = coordinator.proc + + if local_engine_manager: + for proc in local_engine_manager.processes: + sentinel_to_proc[proc.sentinel] = proc + + # Check if any process terminates + while sentinel_to_proc: + # Wait for any process to terminate + ready_sentinels: list[Any] = connection.wait(sentinel_to_proc) + + # Process any terminated processes + for sentinel in ready_sentinels: + proc = sentinel_to_proc.pop(sentinel) + + # Check if process exited with error + if proc.exitcode != 0: + raise RuntimeError( + f"Process {proc.name} (PID: {proc.pid}) " + f"died with exit code {proc.exitcode}") + except KeyboardInterrupt: + logger.info("Received KeyboardInterrupt, shutting down API servers...") + except Exception as e: + logger.exception("Exception occurred while running API servers: %s", + str(e)) + raise + finally: + 
logger.info("Terminating remaining processes ...") + api_server_manager.close() + if coordinator: + coordinator.close() + if local_engine_manager: + local_engine_manager.close() + + # Note(rob): shutdown function cannot be a bound method, -# else the gc cannot collect the objedecoupct. -def shutdown(procs: list[Process], input_address: str): +# else the gc cannot collect the object. +def shutdown(procs: list[BaseProcess]): # Shutdown the process. for proc in procs: if proc.is_alive(): @@ -185,12 +456,6 @@ def shutdown(procs: list[Process], input_address: str): if proc.is_alive() and (pid := proc.pid) is not None: kill_process_tree(pid) - # Remove zmq ipc socket files. - if input_address.startswith("ipc://"): - socket_file = input_address[len("ipc://"):] - if os and os.path.exists(socket_file): - os.remove(socket_file) - def bind_kv_cache( kv_caches: dict[str, torch.Tensor], diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 0c3341691509f..576086ebeb7f7 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -5,7 +5,6 @@ import torch from vllm.logger import init_logger from vllm.utils import cdiv -from vllm.v1.kv_cache_interface import KVCacheConfig logger = init_logger(__name__) @@ -105,15 +104,10 @@ class MultiGroupBlockTable: def __init__(self, max_num_reqs: int, max_model_len: int, max_num_batched_tokens: int, pin_memory: bool, - device: torch.device, kv_cache_config: KVCacheConfig) -> None: - max_num_blocks_per_req = [ - cdiv(max_model_len, g.kv_cache_spec.block_size) - for g in kv_cache_config.kv_cache_groups - ] + device: torch.device, block_size: int) -> None: self.block_tables = [ - BlockTable(max_num_reqs, max_num_blocks_per_req[i], + BlockTable(max_num_reqs, cdiv(max_model_len, block_size), max_num_batched_tokens, pin_memory, device) - for i in range(len(kv_cache_config.kv_cache_groups)) ] def append_row(self, block_ids: list[list[int]], row_idx: int) -> None: diff --git 
a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 570de9bddd290..b3e65917d3cc2 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -11,7 +11,6 @@ from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.sampling_params import SamplingParams, SamplingType from vllm.utils import swap_dict_values -from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.outputs import LogprobsTensors from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.utils import copy_slice @@ -63,7 +62,7 @@ class InputBatch: device: torch.device, pin_memory: bool, vocab_size: int, - kv_cache_config: KVCacheConfig, + block_size: int, ): self.max_num_reqs = max_num_reqs self.max_model_len = max_model_len @@ -105,7 +104,7 @@ class InputBatch: max_num_batched_tokens=max_num_batched_tokens, pin_memory=pin_memory, device=device, - kv_cache_config=kv_cache_config, + block_size=block_size, ) # Sampling-related. 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 15edc0d67fba9..4e4e34a16bce3 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -26,19 +26,20 @@ from vllm.distributed.parallel_state import ( get_pp_group, get_tp_group, graph_capture, prepare_communication_buffer_for_model) from vllm.forward_context import (create_forward_context, get_forward_context, - override_forward_context, + override_forward_context, DPMetadata, set_forward_context) from vllm.logger import init_logger from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.model_loader import get_model +from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.multimodal.utils import group_mm_inputs_by_modality from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, - GiB_bytes, LazyLoader, cdiv, check_use_alibi, - current_stream, is_pin_memory_available) + GiB_bytes, LazyLoader, async_tensor_h2d, cdiv, + check_use_alibi, is_pin_memory_available, + current_stream) from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.core.encoder_cache_manager import compute_encoder_budget @@ -67,6 +68,7 @@ from .utils import (gather_mm_placeholders, sanity_check_mm_encoder_outputs, if TYPE_CHECKING: import xgrammar as xgr + from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.v1.core.sched.output import SchedulerOutput else: xgr = LazyLoader("xgr", globals(), "xgrammar") @@ -157,31 +159,41 @@ class GPUModelRunner(LoRAModelRunnerMixin): # req_id -> (input_id -> encoder_output) self.encoder_cache: dict[str, dict[int, 
torch.Tensor]] = {} - # Set up speculative decoding. - self.use_spec_decode = False self.use_aux_hidden_state_outputs = False - if self.speculative_config: - self.use_spec_decode = True - if get_pp_group().is_last_rank: - if self.speculative_config.method == "ngram": - self.drafter = NgramProposer(self.vllm_config) - elif self.speculative_config.use_eagle(): - self.drafter = EagleProposer(self.vllm_config, - self.device) # type: ignore - if self.speculative_config.method == "eagle3": - self.use_aux_hidden_state_outputs = True - elif self.speculative_config.method == "medusa": - self.drafter = MedusaProposer( - vllm_config=self.vllm_config, - device=self.device) # type: ignore - else: - raise ValueError("Unknown speculative decoding method: " - f"{self.speculative_config.method}") - self.rejection_sampler = RejectionSampler() + # Set up speculative decoding. + # NOTE(Jiayi): currently we put the entire draft model on + # the last PP rank. This is not ideal if there are many + # layers in the draft model. + if self.speculative_config and get_pp_group().is_last_rank: + if self.speculative_config.method == "ngram": + self.drafter = NgramProposer(self.vllm_config) + elif self.speculative_config.use_eagle(): + self.drafter = EagleProposer(self.vllm_config, self.device, + self) # type: ignore + if self.speculative_config.method == "eagle3": + self.use_aux_hidden_state_outputs = True + elif self.speculative_config.method == "medusa": + self.drafter = MedusaProposer( + vllm_config=self.vllm_config, + device=self.device) # type: ignore + else: + raise ValueError("Unknown speculative decoding method: " + f"{self.speculative_config.method}") + self.rejection_sampler = RejectionSampler() # Request states. 
self.requests: dict[str, CachedRequestState] = {} + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + max_model_len=self.max_model_len, + max_num_batched_tokens=self.max_num_tokens, + device=self.device, + pin_memory=self.pin_memory, + vocab_size=self.model_config.get_vocab_size(), + block_size=self.cache_config.block_size, + ) + self.use_cuda_graph = (self.vllm_config.compilation_config.level == CompilationLevel.PIECEWISE and not self.model_config.enforce_eager) @@ -278,7 +290,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> bool: """ Update the order of requests in the batch based on the attention - backend's needs. For example, some attention backends (namely MLA) may + backend's needs. For example, some attention backends (namely MLA) may want to separate requests based on if the attention computation will be compute-bound or memory-bound. @@ -537,6 +549,26 @@ class GPUModelRunner(LoRAModelRunnerMixin): total_num_scheduled_tokens))] return None + def _get_cumsum_and_arange( + self, + num_tokens: np.ndarray, + cumsum_dtype: Optional[np.dtype] = None, + ) -> tuple[np.ndarray, np.ndarray]: + """Get the cumulative sum and batched arange of the given array. + # E.g., [2, 5, 3] -> ([2, 7, 10], [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]) + # Equivalent to but faster than: + # np.concatenate([np.arange(n) for n in num_tokens]) + """ + # Step 1. [2, 5, 3] -> [2, 7, 10] + cu_num_tokens = np.cumsum(num_tokens, dtype=cumsum_dtype) + total_num_tokens = cu_num_tokens[-1] + # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7] + cumsums_offsets = np.repeat(cu_num_tokens - num_tokens, num_tokens) + # Step 3. 
[0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + arange = self.arange_np[:total_num_tokens] - cumsums_offsets + + return cu_num_tokens, arange + def _prepare_inputs( self, scheduler_output: "SchedulerOutput" ) -> tuple[PerLayerAttnMetadata, torch.Tensor, @@ -561,17 +593,10 @@ class GPUModelRunner(LoRAModelRunnerMixin): req_indices = np.repeat(self.arange_np[:num_reqs], num_scheduled_tokens) - # Get batched arange. - # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] - # Equivalent to but faster than: - # np.concatenate([np.arange(n) for n in num_scheduled_tokens]) - # Step 1. [2, 5, 3] -> [2, 7, 10] - cu_num_tokens = np.cumsum(num_scheduled_tokens) - # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7] - cumsums_offsets = np.repeat(cu_num_tokens - num_scheduled_tokens, - num_scheduled_tokens) - # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] - arange = self.arange_np[:total_num_scheduled_tokens] - cumsums_offsets + # cu_num_tokens: [2, 5, 3] -> [2, 7, 10] + # arange: [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + cu_num_tokens, arange = self._get_cumsum_and_arange( + num_scheduled_tokens) # Get positions. positions_np = self.positions_np[:total_num_scheduled_tokens] @@ -913,32 +938,25 @@ class GPUModelRunner(LoRAModelRunnerMixin): # Compute the logits indices. # [4, 1, 3, 1, 2] num_sampled_tokens = num_draft_tokens + 1 - # Step 1. [4, 5, 8, 9, 11] - cu_num_sampled_tokens = np.cumsum(num_sampled_tokens, dtype=np.int32) - total_num_sampled_tokens = cu_num_sampled_tokens[-1] - # Step 2. [0, 0, 0, 0, 4, 5, 5, 5, 8, 9, 9] - cumsums_offsets = np.repeat(cu_num_sampled_tokens - num_sampled_tokens, - num_sampled_tokens) - # Step 3. [0, 1, 2, 3, 0, 0, 1, 2, 0, 0, 1] - arange = self.arange_np[:total_num_sampled_tokens] - cumsums_offsets - # Step 4. [0, 0, 0, 0, 103, 104, 104, 104, 206, 207, 207] + + # Step 1. 
cu_num_sampled_tokens: [4, 5, 8, 9, 11] + # arange: [0, 1, 2, 3, 0, 0, 1, 2, 0, 0, 1] + cu_num_sampled_tokens, arange = self._get_cumsum_and_arange( + num_sampled_tokens, cumsum_dtype=np.int32) + # Step 2. [0, 0, 0, 0, 103, 104, 104, 104, 206, 207, 207] logits_indices = np.repeat( cu_num_scheduled_tokens - num_sampled_tokens, num_sampled_tokens) - # Step 5. [0, 1, 2, 3, 103, 104, 105, 106, 206, 207, 208] + # Step 3. [0, 1, 2, 3, 103, 104, 105, 106, 206, 207, 208] logits_indices += arange # Compute the bonus logits indices. bonus_logits_indices = cu_num_sampled_tokens - 1 # Compute the draft logits indices. - # [3, 3, 5, 5, 6] - cu_num_draft_tokens = np.cumsum(num_draft_tokens, dtype=np.int32) - total_num_draft_tokens = cu_num_draft_tokens[-1] - # [0, 0, 0, 3, 3, 5] - cumsums_offsets = np.repeat(cu_num_draft_tokens - num_draft_tokens, - num_draft_tokens) - # [0, 1, 2, 0, 1, 0] - arange = self.arange_np[:total_num_draft_tokens] - cumsums_offsets + # cu_num_draft_tokens: [3, 3, 5, 5, 6] + # arange: [0, 1, 2, 0, 1, 0] + cu_num_draft_tokens, arange = self._get_cumsum_and_arange( + num_draft_tokens, cumsum_dtype=np.int32) # [0, 0, 0, 5, 5, 9] target_logits_indices = np.repeat( cu_num_sampled_tokens - num_sampled_tokens, num_draft_tokens) @@ -998,8 +1016,11 @@ class GPUModelRunner(LoRAModelRunnerMixin): encoder_outputs = [] for grouped_mm_inputs in grouped_mm_inputs_list: batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs) - batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs, - device=self.device) + batched_mm_inputs = MultiModalKwargs.as_kwargs( + batched_mm_inputs, + dtype=self.model_config.dtype, + device=self.device, + ) # Run the encoder. 
# `curr_group_outputs` is either of the following: @@ -1384,6 +1405,31 @@ class GPUModelRunner(LoRAModelRunnerMixin): return model_output + def get_dp_padding(self, + num_tokens: int) -> tuple[int, Optional[torch.Tensor]]: + dp_size = self.vllm_config.parallel_config.data_parallel_size + dp_rank = self.vllm_config.parallel_config.data_parallel_rank + + # For DP: Don't pad when setting enforce_eager. + # This lets us set enforce_eager on the prefiller in a P/D setup and + # still use CUDA graphs (enabled by this padding) on the decoder. + # + # TODO(tms) : There are many cases where padding is enabled for + # prefills, causing unnecessary and excessive padding of activations. + + if dp_size == 1 or self.vllm_config.model_config.enforce_eager: + # Early exit. + return 0, None + + num_tokens_across_dp = DPMetadata.num_tokens_across_dp( + num_tokens, dp_size, dp_rank) + max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp).item() + num_tokens_after_padding = torch.tensor([max_tokens_across_dp_cpu] * + dp_size, + device="cpu", + dtype=torch.int32) + return max_tokens_across_dp_cpu - num_tokens, num_tokens_after_padding + @torch.inference_mode() def execute_model( self, @@ -1416,6 +1462,87 @@ class GPUModelRunner(LoRAModelRunnerMixin): self.maybe_wait_for_kv_save() finished_sending, finished_recving = ( self.get_finished_kv_transfers(scheduler_output)) + # if (self.use_cuda_graph + # and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): + # # Use piecewise CUDA graphs. + # # Add padding to the batch size. + # num_input_tokens = self.vllm_config.pad_for_cudagraph( + # num_scheduled_tokens) + # else: + # # Eager mode. + # # Pad tokens to multiple of tensor_parallel_size when + # # enabled collective fusion for SP + # tp_size = self.vllm_config.parallel_config.tensor_parallel_size + # if self.vllm_config.compilation_config.pass_config. 
\ + # enable_sequence_parallelism and tp_size > 1: + # from vllm.utils import round_up + # num_input_tokens = round_up(num_scheduled_tokens, tp_size) + # else: + # num_input_tokens = num_scheduled_tokens + + # # Padding for DP + # num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens) + # num_input_tokens += num_pad + + # # _prepare_inputs may reorder the batch, so we must gather multi + # # modal outputs after that to ensure the correct order + # if self.is_multimodal_model: + # # Run the multimodal encoder if any. + # self._execute_mm_encoder(scheduler_output) + # mm_embeds = self._gather_mm_embeddings(scheduler_output) + # else: + # mm_embeds = [] + + # if self.is_multimodal_model and get_pp_group().is_first_rank: + # # NOTE(woosuk): To unify token ids and soft tokens (vision + # # embeddings), we always use embeddings (rather than token ids) + # # as input to the multimodal model, even when the input is text. + # input_ids = self.input_ids[:num_scheduled_tokens] + # if mm_embeds: + # inputs_embeds = self.model.get_input_embeddings( + # input_ids, mm_embeds) + # else: + # inputs_embeds = self.model.get_input_embeddings(input_ids) + # # TODO(woosuk): Avoid the copy. Optimize. + # self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds) + # inputs_embeds = self.inputs_embeds[:num_input_tokens] + # input_ids = None + # else: + # # For text-only models, we use token ids as input. + # # While it is possible to use embeddings as input just like the + # # multimodal models, it is not desirable for performance since + # # then the embedding layer is not included in the CUDA graph. 
+ # input_ids = self.input_ids[:num_input_tokens] + # inputs_embeds = None + # if self.uses_mrope: + # positions = self.mrope_positions[:, :num_input_tokens] + # else: + # positions = self.positions[:num_input_tokens] + + # if get_pp_group().is_first_rank: + # intermediate_tensors = None + # else: + # intermediate_tensors = self.sync_and_slice_intermediate_tensors( + # num_input_tokens, intermediate_tensors, True) + + # # Run the decoder. + # # Use persistent buffers for CUDA graphs. + # with set_forward_context(attn_metadata, + # self.vllm_config, + # num_tokens=num_input_tokens, + # num_tokens_across_dp=num_tokens_across_dp): + # self.maybe_setup_kv_connector(scheduler_output) + + # model_output = self.model( + # input_ids=input_ids, + # positions=positions, + # intermediate_tensors=intermediate_tensors, + # inputs_embeds=inputs_embeds, + # ) + + # self.maybe_wait_for_kv_save() + # finished_sending, finished_recving = ( + # self.get_finished_kv_transfers(scheduler_output)) if self.use_aux_hidden_state_outputs: hidden_states, aux_hidden_states = model_output @@ -1531,7 +1658,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): for i in discard_sampled_tokens_req_indices: valid_sampled_token_ids[i].clear() - if not self.use_spec_decode: + if not self.speculative_config: # Speculative decoding is not enabled. spec_token_ids = None elif self.speculative_config.method == "ngram": @@ -1579,7 +1706,16 @@ class GPUModelRunner(LoRAModelRunnerMixin): next_token_ids = torch.tensor(next_token_ids, dtype=torch.int32, device=self.device) - eagle_attn_metadata = attn_metadata[self.drafter.attn_layer_name] + # At this moment, we assume all eagle layers belong to the same KV + # cache group, thus using the same attention metadata. 
+ eagle_attn_metadata = attn_metadata[ + self.drafter.attn_layer_names[0]] + + # NOTE: deepseek_mtp uses MLA which does not have `block_table` + if hasattr(eagle_attn_metadata, "block_table"): + block_table = eagle_attn_metadata.block_table + else: + block_table = None if spec_decode_metadata is None: # input_ids can be None for multimodal models. @@ -1601,14 +1737,16 @@ class GPUModelRunner(LoRAModelRunnerMixin): n + 1 - len(valid_sampled_token_ids[i]) if n > 0 else 0 for i, n in enumerate(num_draft_tokens) ] - num_rejected_tokens = torch.tensor( + num_rejected_tokens_tensor = async_tensor_h2d( num_rejected_tokens, dtype=torch.int32, - device=self.device, - ) + target_device=self.device, + pin_memory=True) + num_tokens = num_scheduled_tokens - sum(num_rejected_tokens) cu_num_tokens, token_indices = self.drafter.prepare_inputs( eagle_attn_metadata.query_start_loc, - num_rejected_tokens, + num_rejected_tokens_tensor, + num_tokens, ) target_token_ids = self.input_ids[token_indices] #TODO(sage) make sure this works with mrope @@ -1620,7 +1758,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): target_hidden_states = hidden_states[token_indices] target_slot_mapping = eagle_attn_metadata.slot_mapping[ token_indices] - draft_token_ids = self.drafter.propose( target_token_ids=target_token_ids, target_positions=target_positions, @@ -1628,7 +1765,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): target_slot_mapping=target_slot_mapping, next_token_ids=next_token_ids, cu_num_tokens=cu_num_tokens, - block_table=eagle_attn_metadata.block_table, + block_table=block_table, sampling_metadata=sampling_metadata, ) spec_token_ids = draft_token_ids.tolist() @@ -1737,7 +1874,18 @@ class GPUModelRunner(LoRAModelRunnerMixin): logger.info("Starting to load model %s...", self.model_config.model) with DeviceMemoryProfiler() as m: # noqa: SIM117 time_before_load = time.perf_counter() - self.model = get_model(vllm_config=self.vllm_config) + model_loader = get_model_loader(self.load_config) + if 
not hasattr(self, "model"): + logger.info("Loading model from scratch...") + self.model = model_loader.load_model( + vllm_config=self.vllm_config, + model_config=self.model_config) + else: + logger.info( + "Model was already initialized. Loading weights inplace..." + ) + model_loader.load_weights(self.model, + model_config=self.model_config) if self.lora_config: self.model = self.load_lora_model(self.model, self.model_config, @@ -1757,6 +1905,15 @@ class GPUModelRunner(LoRAModelRunnerMixin): time_after_load - time_before_load) prepare_communication_buffer_for_model(self.model) + def save_tensorized_model( + self, + tensorizer_config: "TensorizerConfig", + ) -> None: + TensorizerLoader.save_model( + self.model, + tensorizer_config=tensorizer_config, + ) + def _get_prompt_logprobs_dict( self, hidden_states: torch.Tensor, @@ -1862,6 +2019,10 @@ class GPUModelRunner(LoRAModelRunnerMixin): allow_microbatching: bool = False, ) -> torch.Tensor: + # Padding for DP + num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens) + num_tokens += num_pad + # Set num_scheduled_tokens based on num_tokens and max_num_seqs # for dummy run with LoRA so that the num_reqs collectively # has num_tokens in total. 
@@ -1915,13 +2076,48 @@ class GPUModelRunner(LoRAModelRunnerMixin): if not should_microbatch else dummy_microbatches, is_dummy_run=True, ) + # model = self.model + # if self.is_multimodal_model: + # input_ids = None + # inputs_embeds = self.inputs_embeds[:num_tokens] + # else: + # input_ids = self.input_ids[:num_tokens] + # inputs_embeds = None + # if self.uses_mrope: + # positions = self.mrope_positions[:, :num_tokens] + # else: + # positions = self.positions[:num_tokens] + + # if get_pp_group().is_first_rank: + # intermediate_tensors = None + # else: + # if self.intermediate_tensors is None: + # self.intermediate_tensors = ( + # self.model.make_empty_intermediate_tensors( + # batch_size=self.max_num_tokens, + # dtype=self.model_config.dtype, + # device=self.device)) + + # intermediate_tensors = self.sync_and_slice_intermediate_tensors( + # num_tokens, None, False) + + # with set_forward_context( + # attn_metadata, + # self.vllm_config, + # num_tokens=num_tokens, + # num_tokens_across_dp=num_tokens_across_dp): + # outputs = model( + # input_ids=input_ids, + # positions=positions, + # intermediate_tensors=intermediate_tensors, + # inputs_embeds=inputs_embeds, + # ) if self.use_aux_hidden_state_outputs: hidden_states, _ = outputs else: hidden_states = outputs - if self.use_spec_decode and \ - self.speculative_config.method in ('eagle', 'eagle3'): + if self.speculative_config and self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) self.drafter.dummy_run(num_tokens) @@ -1933,6 +2129,10 @@ class GPUModelRunner(LoRAModelRunnerMixin): self, hidden_states: torch.Tensor, ) -> torch.Tensor: + # The dummy hidden states may contain special values, + # like `inf` or `nan`. + # To avoid breaking the sampler, we use a random tensor here instead. 
+ hidden_states = torch.rand_like(hidden_states) logits = self.model.compute_logits(hidden_states, None) num_reqs = logits.size(0) @@ -1972,7 +2172,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): "initializing the engine.") from e else: raise e - if self.use_spec_decode: + if self.speculative_config: draft_token_ids = [[0] for _ in range(num_reqs)] dummy_spec_decode_metadata = SpecDecodeMetadata.make_dummy( draft_token_ids, self.device) @@ -2054,7 +2254,10 @@ class GPUModelRunner(LoRAModelRunnerMixin): batched_dummy_mm_inputs = MultiModalKwargs.batch( [dummy_mm_kwargs] * max_num_mm_items) batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs( - batched_dummy_mm_inputs, device=self.device) + batched_dummy_mm_inputs, + dtype=self.model_config.dtype, + device=self.device, + ) # Run multimodal encoder. dummy_encoder_outputs = self.model.get_multimodal_embeddings( @@ -2164,16 +2367,11 @@ class GPUModelRunner(LoRAModelRunnerMixin): kv_cache_config: Configuration for the KV cache, including the KV cache size of each layer """ + if len(kv_cache_config.kv_cache_groups) > 1: + raise NotImplementedError( + "Hybrid models with more than one KV cache type are not " + "supported yet.") self.kv_cache_config = kv_cache_config - self.input_batch = InputBatch( - max_num_reqs=self.max_num_reqs, - max_model_len=self.max_model_len, - max_num_batched_tokens=self.max_num_tokens, - device=self.device, - pin_memory=self.pin_memory, - vocab_size=self.model_config.get_vocab_size(), - kv_cache_config=kv_cache_config, - ) self.initialize_attn_backend(kv_cache_config) kv_caches: dict[str, torch.Tensor] = {} @@ -2197,14 +2395,40 @@ class GPUModelRunner(LoRAModelRunnerMixin): num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) dtype = kv_cache_spec.dtype - kv_caches[layer_name] = torch.zeros(kv_cache_shape, - dtype=dtype, - device=self.device) + try: + kv_cache_stride_order = self.attn_backends[ + i].get_kv_cache_stride_order() + assert 
len(kv_cache_stride_order) == len( + kv_cache_shape) + except (AttributeError, NotImplementedError): + kv_cache_stride_order = tuple( + range(len(kv_cache_shape))) + # The allocation respects the backend-defined stride order + # to ensure the semantic remains consistent for each + # backend. We first obtain the generic kv cache shape and + # then permute it according to the stride order which could + # result in a non-contiguous tensor. + kv_cache_shape = tuple(kv_cache_shape[i] + for i in kv_cache_stride_order) + # Maintain original KV shape view. + inv_order = [ + kv_cache_stride_order.index(i) + for i in range(len(kv_cache_stride_order)) + ] + kv_caches[layer_name] = torch.zeros( + kv_cache_shape, dtype=dtype, + device=self.device).permute(*inv_order) else: # TODO: add new branches when introducing more types of # KV cache specs. raise ValueError("Unknown KV cache spec type.") + if self.speculative_config and self.speculative_config.use_eagle(): + assert isinstance(self.drafter, EagleProposer) + # validate all draft model layers belong to the same kv cache + # group + self.drafter.validate_same_kv_cache_group(kv_cache_config) + bind_kv_cache( kv_caches, self.vllm_config.compilation_config.static_forward_context, diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 2ce07acbb8938..883e0a2ee4c31 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -31,6 +31,7 @@ from vllm.v1.worker.worker_base import WorkerBase logger = init_logger(__name__) if TYPE_CHECKING: + from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.v1.core.sched.output import SchedulerOutput @@ -171,10 +172,9 @@ class Worker(WorkerBase): Then, it calculate the free memory that can be used for KV cache in bytes. - :::{tip} - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. 
- ::: + Tip: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. """ torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() @@ -292,6 +292,8 @@ class Worker(WorkerBase): self.profiler.start() else: self.profiler.stop() + print(self.profiler.key_averages().table( + sort_by="self_cuda_time_total")) def execute_dummy_batch(self) -> None: # TODO: adding allow_microbatching will break non-gpu backends @@ -327,6 +329,13 @@ class Worker(WorkerBase): max_size=max_size, ) + def save_tensorized_model( + self, + tensorizer_config: "TensorizerConfig", + ) -> None: + self.model_runner.save_tensorized_model( + tensorizer_config=tensorizer_config, ) + def init_worker_distributed_environment( vllm_config: VllmConfig, @@ -342,8 +351,7 @@ def init_worker_distributed_environment( distributed_init_method, local_rank) ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size, - parallel_config.enable_expert_parallel) + parallel_config.pipeline_parallel_size) ensure_kv_transfer_initialized(vllm_config) diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index 3cbab840e9693..eb8ed622161d5 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -80,8 +80,38 @@ class LoRAModelRunnerMixin: lora_requests) @contextmanager - def maybe_dummy_run_with_lora(self, lora_config: LoRAConfig, - num_scheduled_tokens: np.ndarray): + def maybe_setup_dummy_loras(self, lora_config): + if lora_config is None: + yield + else: + # __enter__ code + assert self.lora_manager is not None, "LoRA is not enabled" + + num_loras = lora_config.max_loras + + # Make dummy lora requests + lora_requests: set[LoRARequest] = { + LoRARequest(lora_name=f"warmup_{lora_id}", + lora_int_id=lora_id, + lora_path="/not/a/real/path") + for lora_id in range(1, num_loras + 1) + } + + with self.lora_manager.dummy_lora_cache(): + # Add the 
dummy LoRAs here so _set_active_loras doesn't try to + # load from disk. + for lr in lora_requests: + self.lora_manager.add_dummy_lora( + lr, rank=self.LORA_WARMUP_RANK) + + yield + + # __exit__ code + self.lora_manager.remove_all_adapters() + + @contextmanager + def maybe_select_dummy_loras(self, lora_config: LoRAConfig, + num_scheduled_tokens: np.ndarray): if lora_config is None: yield else: @@ -108,21 +138,18 @@ class LoRAModelRunnerMixin: for lora_id in range(1, num_loras + 1) } - with self.lora_manager.dummy_lora_cache(): - # Add the dummy LoRAs here so _set_active_loras doesn't try to - # load from disk. - for lr in lora_requests: - self.lora_manager.add_dummy_lora( - lr, rank=self.LORA_WARMUP_RANK) + self._set_active_loras(tuple(prompt_lora_mapping), + tuple(token_lora_mapping), lora_requests) - self._set_active_loras(tuple(prompt_lora_mapping), - tuple(token_lora_mapping), - lora_requests) + yield - yield - - # __exit__ code - self.lora_manager.remove_all_adapters() + @contextmanager + def maybe_dummy_run_with_lora(self, lora_config: LoRAConfig, + num_scheduled_tokens: np.ndarray): + with self.maybe_setup_dummy_loras( + lora_config), self.maybe_select_dummy_loras( + lora_config, num_scheduled_tokens): + yield def add_lora(self, lora_request: LoRARequest) -> bool: if not self.lora_manager: diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 2da99696445ee..5de92351e24ba 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -20,7 +20,8 @@ from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.forward_context import set_forward_context from vllm.logger import init_logger -from vllm.model_executor.model_loader import get_model +from vllm.lora.layers import BaseLayerWithLoRA +from vllm.model_executor.model_loader import get_model_loader from vllm.multimodal import MULTIMODAL_REGISTRY from 
vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargs, PlaceholderRange) @@ -152,6 +153,9 @@ class TPUModelRunner(LoRAModelRunnerMixin): self.hidden_size = model_config.get_hidden_size() self.vocab_size = model_config.get_vocab_size() + if self.lora_config is not None: + self.vocab_size += self.lora_config.lora_extra_vocab_size + # Multi-modal data support self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope @@ -167,15 +171,25 @@ class TPUModelRunner(LoRAModelRunnerMixin): self.encoder_cache_size = encoder_cache_size # Lazy initialization - # self.model: nn.Module # Set after load_model + self.model: nn.Module # Set after load_model self.kv_caches: list[torch.Tensor] = [] # req_id -> (input_id -> encoder_output) self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {} - # self.input_batch: InputBatch # Persistent batch. # Request states. self.requests: dict[str, CachedRequestState] = {} + # Initialize input batch early to avoid AttributeError in _update_states + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + max_model_len=self.max_model_len, + max_num_batched_tokens=self.max_num_tokens, + device=self.device, + pin_memory=self.pin_memory, + vocab_size=self.model_config.get_vocab_size(), + block_size=self.block_size, + ) + # Cached torch/numpy tensor # The pytorch tensor and numpy array share the same buffer. # Sometimes the numpy op is faster so we create both. 
@@ -405,7 +419,6 @@ class TPUModelRunner(LoRAModelRunnerMixin): return len(unscheduled_req_ids) > 0 or len(req_ids_to_add) > 0 def get_model(self) -> nn.Module: - assert self.model is not None return self.model def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: @@ -591,6 +604,17 @@ class TPUModelRunner(LoRAModelRunnerMixin): logits_indices = self.query_start_loc_cpu[1:padded_num_reqs + 1] - 1 logits_indices = logits_indices.to(self.device) + if self.lora_config is not None: + # We need to respect padding when activating LoRA adapters + padded_num_scheduled_tokens_per_req = np.copy( + num_scheduled_tokens_per_req + ) # Copying to avoid accidental state corruption bugs + padded_num_scheduled_tokens_per_req[-1] += \ + padded_total_num_scheduled_tokens - total_num_scheduled_tokens + + self.set_active_loras(self.input_batch, + padded_num_scheduled_tokens_per_req) + layer_names = get_layers_from_vllm_config(self.vllm_config, Attention).keys() per_layer_attn_metadata = { @@ -652,8 +676,11 @@ class TPUModelRunner(LoRAModelRunnerMixin): encoder_outputs = [] for grouped_mm_inputs in grouped_mm_inputs_list: batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs) - batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs, - device=self.device) + batched_mm_inputs = MultiModalKwargs.as_kwargs( + batched_mm_inputs, + dtype=self.model_config.dtype, + device=self.device, + ) # Run the encoder. # `curr_group_outputs` is either of the following: @@ -908,17 +935,30 @@ class TPUModelRunner(LoRAModelRunnerMixin): "vllm.model_executor.layers.vocab_parallel_embedding." 
"get_tensor_model_parallel_rank", return_value=xm_tp_rank): - model = get_model(vllm_config=self.vllm_config) + # model = get_model(vllm_config=self.vllm_config) + model_loader = get_model_loader(self.load_config) + if not hasattr(self, "model"): + logger.info("Loading model from scratch...") + model = model_loader.load_model(vllm_config=self.vllm_config, + model_config=self.model_config) + else: + logger.info( + "Model was already initialized. Loading weights inplace..." + ) + model_loader.load_weights(self.model, + model_config=self.model_config) if self.lora_config is not None: model = self.load_lora_model(model, self.model_config, self.scheduler_config, self.lora_config, self.device) + replace_set_lora(model) # Sync all pending XLA execution during model initialization and weight # loading. xm.mark_step() xm.wait_device_ops() - self.model = model + if not hasattr(self, "model"): + self.model = model self.sampler = TPUSampler() @torch.no_grad() @@ -977,7 +1017,7 @@ class TPUModelRunner(LoRAModelRunnerMixin): for layer_name in layer_names } - with self.maybe_dummy_run_with_lora( + with self.maybe_select_dummy_loras( self.lora_config, np.array([num_tokens], dtype=np.int32)), set_forward_context( per_layer_attn_metadata, self.vllm_config, 0): @@ -986,6 +1026,13 @@ class TPUModelRunner(LoRAModelRunnerMixin): inputs_embeds=inputs_embeds) self._hidden_states_dtype = out.dtype + def _set_active_loras(self, prompt_lora_mapping, token_lora_mapping, + lora_requests) -> None: + xm.mark_step() # Captures input updates + super()._set_active_loras(prompt_lora_mapping, token_lora_mapping, + lora_requests) + xm.mark_step() # Captures metadata updates + def _precompile_mm_encoder(self) -> None: # Pre-compile MM encoder for all supported data modalities. 
hf_config = self.vllm_config.model_config.hf_config @@ -1148,7 +1195,10 @@ class TPUModelRunner(LoRAModelRunnerMixin): generate_params_if_all_greedy, )) sampling_metadata.all_greedy = all_greedy - self.sample_from_logits(dummy_logits, sampling_metadata) + with self.maybe_select_dummy_loras( + self.lora_config, np.array([num_reqs], + dtype=np.int32)): + self.sample_from_logits(dummy_logits, sampling_metadata) logger.info(" -- num_seqs: %d", num_reqs) xm.wait_device_ops() end = time.perf_counter() @@ -1164,7 +1214,9 @@ class TPUModelRunner(LoRAModelRunnerMixin): dtype=self._hidden_states_dtype) dummy_tokens = torch.zeros((num_reqs, 1), dtype=torch.int64).to(self.device) - self.gather_logprobs(dummy_logits, dummy_tokens) + with self.maybe_select_dummy_loras( + self.lora_config, np.array([num_reqs], dtype=np.int32)): + self.gather_logprobs(dummy_logits, dummy_tokens) logger.info(" -- num_seqs: %d", num_reqs) xm.wait_device_ops() end = time.perf_counter() @@ -1175,13 +1227,14 @@ class TPUModelRunner(LoRAModelRunnerMixin): """ Precompile all the subgraphs with possible input shapes. 
""" - self._precompile_mm_encoder() - self._precompile_backbone() - self._precompile_select_hidden_states() - self._precompile_compute_logits() - self._precompile_structured_decoding() - self._precompile_sample_from_logits() - self._precompile_gather_logprobs() + with self.maybe_setup_dummy_loras(self.lora_config): + self._precompile_mm_encoder() + self._precompile_backbone() + self._precompile_select_hidden_states() + self._precompile_compute_logits() + self._precompile_structured_decoding() + self._precompile_sample_from_logits() + self._precompile_gather_logprobs() def profile_run( self, @@ -1254,15 +1307,19 @@ class TPUModelRunner(LoRAModelRunnerMixin): "Hybrid models with more than one KV cache type are not " "supported yet.") - self.input_batch = InputBatch( - max_num_reqs=self.max_num_reqs, - max_model_len=self.max_model_len, - max_num_batched_tokens=self.max_num_tokens, - device=self.device, - pin_memory=self.pin_memory, - vocab_size=self.model_config.get_vocab_size(), - kv_cache_config=kv_cache_config, - ) + if kv_cache_config.kv_cache_groups[ + 0].kv_cache_spec.block_size != self.block_size: + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + max_model_len=self.max_model_len, + max_num_batched_tokens=self.max_num_tokens, + device=self.device, + pin_memory=self.pin_memory, + vocab_size=self.model_config.get_vocab_size(), + block_size=kv_cache_config.kv_cache_groups[0].kv_cache_spec. 
+ block_size, + ) + # Verify dtype compatibility between block_table_cpu and input_batch assert self.block_table_cpu.dtype == self.input_batch.block_table[ 0].get_cpu_tensor().dtype @@ -1434,8 +1491,11 @@ class TPUModelRunner(LoRAModelRunnerMixin): batched_dummy_mm_inputs = MultiModalKwargs.batch([dummy_mm_kwargs] * batch_size) - return MultiModalKwargs.as_kwargs(batched_dummy_mm_inputs, - device=self.device) + return MultiModalKwargs.as_kwargs( + batched_dummy_mm_inputs, + dtype=self.model_config.dtype, + device=self.device, + ) def _get_req_paddings(min_req_size: int, max_req_size: int) -> list[int]: @@ -1460,11 +1520,11 @@ def _get_token_paddings(min_token_size: int, max_token_size: int, padding_gap: int) -> list[int]: """Generate a list of padding size, starting from min_token_size, ending with a number that can cover max_token_size - + If padding_gap == 0 then: increase 2X each time (exponential) else: - first increase the size to twice, + first increase the size to twice, then increase the padding size by padding_gap. """ # assert min_token_size is power of 2 @@ -1501,3 +1561,32 @@ def _get_padded_token_len(paddings: list[int], x: int) -> int: index = bisect.bisect_left(paddings, x) assert index < len(paddings) return paddings[index] + + +def replace_set_lora(model): + + def _tpu_set_lora( + self, + index: int, + lora_a: torch.Tensor, + lora_b: torch.Tensor, + embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, + ): + # TODO: The integer index leads to a recompilation, but converting it + # to a tensor doesn't seem to work anymore. This might be fixed with a + # later release of torch_xla. 
+ self._original_set_lora(index, lora_a, lora_b, embeddings_tensor, bias) + xm.mark_step() + + def _tpu_reset_lora(self, index: int): + self._original_reset_lora(index) + xm.mark_step() + + for _, module in model.named_modules(): + if isinstance(module, BaseLayerWithLoRA): + module._original_set_lora = module.set_lora + module._original_reset_lora = module.reset_lora + module.set_lora = _tpu_set_lora.__get__(module, module.__class__) + module.reset_lora = _tpu_reset_lora.__get__( + module, module.__class__) diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index ae3735ab0255f..0707e17afe7a7 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -83,10 +83,6 @@ class TPUWorker: if self.model_config.seed is None: self.model_config.seed = 0 - if vllm_config.lora_config is not None: - raise NotImplementedError( - "The V1 TPU backend doesn't support LoRA serving") - def init_device(self): os.environ["PJRT_DEVICE"] = "TPU" # Note: Currently the XLA compiler wrongly uses 2D ring strategy on 1D @@ -166,7 +162,8 @@ class TPUWorker: runner_kv_caches) # `max_num_tokens >= max_num_batched_tokens` due to padding. - self.model_runner.profile_run(self.model_runner.max_num_tokens) + with self.model_runner.maybe_setup_dummy_loras(self.lora_config): + self.model_runner.profile_run(self.model_runner.max_num_tokens) # Synchronize before measuring the memory usage. 
xm.wait_device_ops() @@ -265,8 +262,7 @@ def init_tpu_worker_distributed_environment( backend="gloo", ) ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size, - parallel_config.enable_expert_parallel) + parallel_config.pipeline_parallel_size) try: diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 267754036b317..91548a52cfc70 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -10,7 +10,7 @@ def sanity_check_mm_encoder_outputs( ) -> None: """ Perform sanity checks for the result of - {meth}`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`. + [`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`][]. """ assert isinstance(mm_embeddings, (list, tuple, torch.Tensor)), ( "Expected multimodal embeddings to be a list/tuple of 2D tensors, " @@ -39,7 +39,7 @@ def scatter_mm_placeholders( Scatter the multimodal embeddings into a contiguous tensor that represents the placeholder tokens. - {class}`vllm.multimodal.processing.PromptUpdateDetails.is_embed`. + [`vllm.multimodal.processing.PromptUpdateDetails.is_embed`][]. Args: embeds: The multimodal embeddings. @@ -66,7 +66,7 @@ def gather_mm_placeholders( """ Reconstructs the embeddings from the placeholder tokens. - This is the operation of {func}`scatter_mm_placeholders`. + This is the operation of [scatter_mm_placeholders][]. 
""" if is_embed is None: return placeholders diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index c2120c035175a..82eeeb570d222 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -297,8 +297,11 @@ class CPUEncoderDecoderModelRunner( model_input.encoder_input_tokens, "encoder_positions": model_input.encoder_input_positions, - **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, - device=self.device), + **MultiModalKwargs.as_kwargs( + model_input.multi_modal_kwargs or {}, + dtype=self.model_config.dtype, + device=self.device, + ), "intermediate_tensors": intermediate_tensors, } diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 710ca1a13b0c5..fb436a079f878 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -628,7 +628,10 @@ class CPUModelRunner(CPUModelRunnerBase[ModelInputForCPUWithSamplingMetadata]): multimodal_kwargs = {} if model_input.multi_modal_kwargs is not None: multimodal_kwargs = MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs, device=self.device) + model_input.multi_modal_kwargs, + dtype=self.model_config.dtype, + device=self.device, + ) execute_model_kwargs = {} if previous_hidden_states is not None: execute_model_kwargs.update( diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py index 1ceb2557c6b3d..2a60e51261ad6 100644 --- a/vllm/worker/cpu_pooling_model_runner.py +++ b/vllm/worker/cpu_pooling_model_runner.py @@ -50,8 +50,11 @@ class CPUPoolingModelRunner( model_input.input_tokens, "positions": model_input.input_positions, - **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, - device=self.device), + **MultiModalKwargs.as_kwargs( + model_input.multi_modal_kwargs or {}, + dtype=self.model_config.dtype, + device=self.device, + ), **cross_enc_kwargs, "intermediate_tensors": intermediate_tensors, diff --git 
a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index a92cf1e5a3b3c..1436a404335a0 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -390,8 +390,7 @@ class CPUWorker(LocalOrDistributedWorkerBase): ensure_model_parallel_initialized( parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size, - parallel_config.enable_expert_parallel) + parallel_config.pipeline_parallel_size) def get_cache_block_size_bytes(self) -> int: """Return the size in bytes of a single KV cache block. diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 4864163b0de2a..3957e5608524f 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -202,9 +202,13 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): encoder_input_ids=model_input.encoder_input_tokens, encoder_positions=model_input.encoder_input_positions, intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs(multi_modal_kwargs, - device=self.device), - **seqlen_agnostic_kwargs) + **MultiModalKwargs.as_kwargs( + multi_modal_kwargs, + dtype=self.model_config.dtype, + device=self.device, + ), + **seqlen_agnostic_kwargs, + ) logits = self.model.compute_logits(hidden_or_intermediate_states, model_input.sampling_metadata) diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 42882992f2da2..533fead0e669e 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -201,10 +201,9 @@ class HPUWorker(LocalOrDistributedWorkerBase): Then, it calculate the maximum possible number of GPU and CPU blocks that can be allocated with the remaining free memory. - :::{tip} - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - ::: + Tip: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. 
""" # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. @@ -416,8 +415,7 @@ def init_worker_distributed_environment( backend='hccl') ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size, - parallel_config.enable_expert_parallel) + parallel_config.pipeline_parallel_size) if torch.distributed.is_initialized(): torch_world_size = torch.distributed.get_world_size() @@ -443,8 +441,7 @@ def init_worker_distributed_environment( torch.distributed.all_reduce(dummy_tensor_hpu) assert dummy_tensor_hpu.item() == parallel_config.world_size ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size, - parallel_config.enable_expert_parallel) + parallel_config.pipeline_parallel_size) def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 15f40bcef8969..8c968faa78101 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -730,6 +730,11 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): seq_group_metadata, range(positions[0], positions[0] + len(positions))) + # M-RoPE requires mrope_positions even for plain text; return early + # when mm_kwargs is empty only if inter_data.is_prompt is False. 
+ if not mm_kwargs and not inter_data.is_prompt: + return + inter_data.multi_modal_kwargs = mm_kwargs inter_data.multi_modal_placeholder_maps = placeholder_maps @@ -1840,8 +1845,11 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): inputs_embeds=model_input.inputs_embeds, positions=model_input.input_positions, intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs(multi_modal_kwargs, - device=self.device), + **MultiModalKwargs.as_kwargs( + multi_modal_kwargs, + dtype=self.model_config.dtype, + device=self.device, + ), **seqlen_agnostic_kwargs, **model_kwargs, ) diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 0825abbed1437..f8d5acf586c51 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -733,12 +733,13 @@ def _pythonize_sampler_output( logprobs_tensor: Optional[torch.Tensor], cache: Optional[PythonizationCache], ) -> None: - """ This function is only called when the output tensors are ready. - See {class}`ModelOutput`. - - Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place, + """ This function is only called when the output tensors are ready. + See [`ModelOutput`][vllm.worker.multi_step_model_runner.ModelOutput]. + + Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place, adding a Pythonized output data structure - ({class}`CompletionSequenceGroupOutput`) for each {class}`SequenceGroup`. + ([`CompletionSequenceGroupOutput`][vllm.sequence.CompletionSequenceGroupOutput]) + for each [`SequenceGroup`][vllm.sequence.SequenceGroup]. 
Args: model_input @@ -824,7 +825,7 @@ def _pythonize_sampler_output( for sgdx, (seq_group, sample_result) in enumerate(zip(seq_groups, samples_list)): - # Reminder: Please update docs/source/features/compatibility_matrix.md + # Reminder: Please update docs/features/compatibility_matrix.md # If the feature combo become valid # (Check for Guided Decoding) if seq_group.sampling_params.logits_processors: diff --git a/vllm/worker/multi_step_neuron_model_runner.py b/vllm/worker/multi_step_neuron_model_runner.py index 9618a4b49ff89..aafb7ab7cfb8d 100644 --- a/vllm/worker/multi_step_neuron_model_runner.py +++ b/vllm/worker/multi_step_neuron_model_runner.py @@ -70,8 +70,11 @@ class MultiStepNeuronModelRunner(NeuronModelRunner): input_ids=model_input.input_tokens, positions=model_input.input_positions, input_block_ids=model_input.input_block_ids, - **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, - device=self.device), + **MultiModalKwargs.as_kwargs( + model_input.multi_modal_kwargs or {}, + dtype=self.model_config.dtype, + device=self.device, + ), ) output = self.model.sample( diff --git a/vllm/worker/multi_step_neuronx_distributed_model_runner.py b/vllm/worker/multi_step_neuronx_distributed_model_runner.py index b6a3492a493bb..3a9c0993e004f 100644 --- a/vllm/worker/multi_step_neuronx_distributed_model_runner.py +++ b/vllm/worker/multi_step_neuronx_distributed_model_runner.py @@ -49,8 +49,11 @@ class MultiStepNeuronxDistributedModelRunner(NeuronxDistributedModelRunner): positions=model_input.input_positions, input_block_ids=model_input.input_block_ids, sampling_params=sampling_params, - **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, - device=self.device), + **MultiModalKwargs.as_kwargs( + model_input.multi_modal_kwargs or {}, + dtype=self.model_config.dtype, + device=self.device, + ), ) output = self.model.sample( diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index e97adf757cc12..3aff3e01aef16 
100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -2,13 +2,15 @@ import os from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union import torch from torch import nn from vllm.config import DeviceConfig, VllmConfig from vllm.logger import init_logger +from vllm.lora.layers import LoRAMapping +from vllm.lora.request import LoRARequest from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.neuron import get_neuron_model @@ -36,6 +38,7 @@ class ModelInputForNeuron(ModelRunnerInputBase): input_block_ids: Optional[torch.Tensor] = None sampling_metadata: SamplingMetadata = None multi_modal_kwargs: BatchedTensorInputs = None + adapter_ids: Optional[str] = None def as_broadcastable_tensor_dict( self) -> Dict[str, Union[int, torch.Tensor]]: @@ -80,6 +83,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): "The model will run without sliding window.") self.device_config = (self.device_config if self.device_config is not None else DeviceConfig()) + self.lora_config = vllm_config.lora_config self.device = self.device_config.device self.pin_memory = is_pin_memory_available() @@ -165,6 +169,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): mm_kwargs = seq_group_metadata.multi_modal_data if mm_kwargs: + mm_kwargs = self.process_multi_modal_data_neuron(mm_kwargs) multi_modal_kwargs_list.append(mm_kwargs) max_seq_len = max(seq_lens) @@ -270,6 +275,14 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): sampling_params.top_p = top_p sampling_params.temperature = temperature + # we need multi_modal_data for later tokens as well + multi_modal_kwargs_list: List[MultiModalKwargs] = [] + for seq_group_metadata in seq_group_metadata_list: + mm_data = 
seq_group_metadata.multi_modal_data + if mm_data: + multi_modal_kwargs_list.append(mm_data) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) + sampling_metadata = SamplingMetadata.prepare( seq_group_metadata_list, seq_lens, @@ -378,9 +391,12 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): positions=model_input.input_positions, input_block_ids=model_input.input_block_ids, sampling_params=sampling_params, - **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs - or {}, - device=self.device), + adapter_ids=model_input.adapter_ids, + **MultiModalKwargs.as_kwargs( + model_input.multi_modal_kwargs or {}, + dtype=self.model_config.dtype, + device=self.device, + ), ) elif current_platform.use_transformers_neuronx(): # [TODO] validate on-device sampling @@ -389,9 +405,11 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): input_ids=model_input.input_tokens, positions=model_input.input_positions, input_block_ids=model_input.input_block_ids, - **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs - or {}, - device=self.device), + **MultiModalKwargs.as_kwargs( + model_input.multi_modal_kwargs or {}, + dtype=self.model_config.dtype, + device=self.device, + ), ) # Compute the logits only if the on-device sampling is turned off as @@ -412,3 +430,32 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): @property def vocab_size(self) -> int: return self.model_config.get_vocab_size() + + def process_multi_modal_data_neuron(self, mm_data): + # this is a no-op for NeuronModelRunner + return mm_data + + def remove_all_loras(self): + raise NotImplementedError( + "LoRAs are not supported for Transformers NeuronX framework") + + def set_active_loras(self, lora_requests: Set[LoRARequest], + lora_mapping: LoRAMapping) -> None: + raise NotImplementedError( + "LoRAs are not supported for Transformers NeuronX framework") + + def add_lora(self, lora_request: LoRARequest): + raise NotImplementedError( + "LoRAs are not 
supported for Transformers NeuronX framework") + + def remove_lora(self, lora_id: int) -> bool: + raise NotImplementedError( + "LoRAs are not supported for Transformers NeuronX framework") + + def pin_lora(self, lora_id: int) -> bool: + raise NotImplementedError( + "LoRAs are not supported for Transformers NeuronX framework") + + def list_loras(self) -> Set[int]: + raise NotImplementedError( + "LoRAs are not supported for Transformers NeuronX framework") diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index aa8e39613eec8..64daee31bbdf5 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """A Neuron worker class.""" import os -from typing import List, Optional, Tuple +from typing import List, Optional, Set, Tuple import torch.distributed @@ -9,19 +9,19 @@ from vllm.config import VllmConfig from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.logger import init_logger +from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed from vllm.platforms import current_platform from vllm.platforms.neuron import NeuronFramework from vllm.sequence import ExecuteModelRequest from vllm.worker.neuron_model_runner import NeuronModelRunner -from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, - LoRANotSupportedWorkerBase, WorkerBase, +from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, WorkerInput) logger = init_logger(__name__) -class NeuronWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase): +class NeuronWorker(LocalOrDistributedWorkerBase): """A worker class that executes the model on a group of neuron cores. 
""" @@ -38,6 +38,7 @@ class NeuronWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase): self.rank = rank self.distributed_init_method = distributed_init_method self.is_driver_worker = is_driver_worker + self.lora_config = vllm_config.lora_config if self.model_config.trust_remote_code: # note: lazy import to avoid importing torch before initializing @@ -59,6 +60,9 @@ class NeuronWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase): "[transformers-neuronx, neuronx-distributed-inference]") def get_tnx_model_runner(self, vllm_config): + assert (self.lora_config + is None), ("LoRA is not supported for TransformersNeuronX " + "framework.") from vllm.worker.multi_step_neuron_model_runner import ( MultiStepNeuronModelRunner) if self.speculative_config is not None: @@ -72,6 +76,8 @@ class NeuronWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase): from vllm.worker.neuronx_distributed_model_runner import ( NeuronxDistributedModelRunner) if self.speculative_config is not None: + assert (self.lora_config + is None), "LoRA is not supported for Speculative Decoding" return MultiStepNeuronxDistributedModelRunner( vllm_config=vllm_config) else: @@ -156,3 +162,31 @@ class NeuronWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase): 1, 1, ) + + def add_lora(self, lora_request: LoRARequest) -> bool: + if current_platform.use_transformers_neuronx(): + raise NotImplementedError( + f"{type(self)} does not support LoRA with Neuron Framework " + f"Transformers NeuronX") + return self.model_runner.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + if current_platform.use_transformers_neuronx(): + raise NotImplementedError( + f"{type(self)} does not support LoRA with Neuron Framework " + f"Transformers NeuronX") + return self.model_runner.remove_lora(lora_id) + + def pin_lora(self, lora_id: int) -> bool: + if current_platform.use_transformers_neuronx(): + raise NotImplementedError( + f"{type(self)} does not support LoRA 
with Neuron Framework " + f"Transformers NeuronX") + return self.model_runner.pin_lora(lora_id) + + def list_loras(self) -> Set[int]: + if current_platform.use_transformers_neuronx(): + raise NotImplementedError( + f"{type(self)} does not support LoRA with Neuron Framework " + f"Transformers NeuronX") + return self.model_runner.list_loras() diff --git a/vllm/worker/neuronx_distributed_model_runner.py b/vllm/worker/neuronx_distributed_model_runner.py index 4e784e5e0302d..9cd4f88d32f06 100644 --- a/vllm/worker/neuronx_distributed_model_runner.py +++ b/vllm/worker/neuronx_distributed_model_runner.py @@ -1,17 +1,26 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional +from typing import List, Optional, Set import torch +from neuronx_distributed_inference.models.mllama.aspect_ratio_utils import ( + get_all_supported_aspect_ratios) from neuronx_distributed_inference.modules.generation.sampling import ( prepare_sampling_params) +from neuronx_distributed_inference.modules.lora_serving import ( + LoraCheckpoint, LoraServingConfig) from vllm.config import VllmConfig +from vllm.entrypoints.openai.serving_models import LoRAModulePath from vllm.logger import init_logger +from vllm.lora.layers import LoRAMapping +from vllm.lora.request import LoRARequest +from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.neuronx_distributed import ( _get_model_architecture, get_neuron_model) -from vllm.sequence import IntermediateTensors +from vllm.multimodal import MultiModalKwargs +from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.worker.neuron_model_runner import (ModelInputForNeuron, NeuronModelRunner) @@ -25,11 +34,44 @@ class NeuronxDistributedModelRunner(NeuronModelRunner): vllm_config: VllmConfig, ): super().__init__(vllm_config) + self.lora_checkpoint = None + self.model = None + self.lora_serving_config = None + + @staticmethod + def 
_get_lora_paths_strings(lora_modules: List[LoRAModulePath]): + if not lora_modules: + return None + return {_.get("name"): _.get("path") for _ in lora_modules} + + def _get_nxdi_lora_config(self): + override_neuron_config = self.model_config.override_neuron_config + lora_modules = override_neuron_config.pop("lora_modules", None) + target_modules = override_neuron_config.pop("target_modules", None) + lora_ckpt_paths = self._get_lora_paths_strings(lora_modules) + if self.lora_config.max_loras < len(lora_ckpt_paths): + raise ValueError( + "Number of LoRAs (%s) exceeds maximum " + "allowed (%s)", len(lora_ckpt_paths), + self.lora_config.max_loras) + + return LoraServingConfig( + max_loras=self.lora_config.max_loras, + max_lora_rank=self.lora_config.max_lora_rank, + target_modules=target_modules, + lora_ckpt_paths=lora_ckpt_paths, + ) def load_model(self) -> None: - self.model = get_neuron_model(self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config) + # Update LoRA config + if self.lora_config is not None: + self.lora_serving_config = self._get_nxdi_lora_config() + self.lora_checkpoint = LoraCheckpoint(self.lora_serving_config) + self.model = get_neuron_model( + self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + lora_serving_config=self.lora_serving_config) def get_nxd_sampling_params(self, sampling_metadata): if self.model.config.neuron_config.on_device_sampling_config: @@ -81,42 +123,28 @@ class NeuronxDistributedModelRunner(NeuronModelRunner): sampling_params = self.get_nxd_sampling_params( model_input.sampling_metadata) - if model_input.multi_modal_kwargs.get('image') is not None: - pixel_values = [] - aspect_ratios = [] - num_chunks = [] - has_image = [] - for multi_modal_input in model_input.multi_modal_kwargs.get( - 'image'): - image_tensors = self.get_multi_modal_data_neuron( - multi_modal_input.squeeze(0)) - pixel_values.append(image_tensors[0]) - 
aspect_ratios.append(image_tensors[1]) - num_chunks.append(image_tensors[2]) - has_image.append(image_tensors[3]) - - pixel_values = torch.cat(pixel_values, dim=0) - aspect_ratios = torch.cat(aspect_ratios, dim=0) - num_chunks = torch.cat(num_chunks, dim=0) - has_image = torch.cat(has_image, dim=0) - + if model_input.multi_modal_kwargs.get('pixel_values') is not None: hidden_states = self.model( input_ids=model_input.input_tokens, positions=model_input.input_positions, seq_ids=model_input.input_block_ids, - pixel_values=pixel_values, - aspect_ratios=aspect_ratios, + pixel_values=model_input.multi_modal_kwargs.get( + 'pixel_values'), + aspect_ratios=model_input.multi_modal_kwargs.get( + 'aspect_ratios'), sampling_params=sampling_params, - num_chunks=num_chunks, - has_image=has_image, + num_chunks=model_input.multi_modal_kwargs.get('num_chunks'), + has_image=model_input.multi_modal_kwargs.get( + 'has_image').squeeze(1), ) else: - empty_pixel_values = torch.zeros([1, 1, 4, 3, 560, 560], + bs = model_input.input_tokens.shape[0] if (model_input.input_tokens + is not None) else 1 + empty_pixel_values = torch.zeros([bs, 1, 4, 3, 560, 560], dtype=torch.bfloat16) - empty_aspect_ratios = torch.ones([1, 1, 2], dtype=torch.int64) - num_chunks = torch.tensor([[1] - ]) # dummy num_chunks, will not be used - has_image = torch.tensor([0]) + empty_aspect_ratios = torch.ones([bs, 1, 2], dtype=torch.int64) + num_chunks = torch.zeros((bs, 1), dtype=torch.int32) + has_image = torch.zeros([bs], dtype=torch.int32) hidden_states = self.model( input_ids=model_input.input_tokens, positions=model_input.input_positions, @@ -134,3 +162,132 @@ class NeuronxDistributedModelRunner(NeuronModelRunner): ) return [output] + + def process_multi_modal_data_neuron(self, mm_data): + # Neuron uses aspect_ratios instead of aspect_ratio_ids + all_supported_aspect_ratios = get_all_supported_aspect_ratios( + self.model.config.vision_config.max_num_tiles) + aspect_ratio_ids = mm_data.get("aspect_ratio_ids") + 
mm_data["aspect_ratios"] = torch.tensor( + all_supported_aspect_ratios[aspect_ratio_ids]).unsqueeze(0) + + # Neuron's num_chunks is HF's num_tiles + mm_data["num_chunks"] = mm_data.get("num_tiles") + + # Input has an image if it has pixel_values + bs = mm_data["num_chunks"].shape[0] + pixel_values = mm_data.get("pixel_values") + if pixel_values is not None and not torch.all(pixel_values == 0): + mm_data["has_image"] = torch.ones(bs) + + else: + mm_data["has_image"] = torch.zeros(bs) + return mm_data + + def _get_lora_adapter_ids(self, seq_group_metadata_list): + # set LoRA adapter IDs for multi-lora serving + batch_size = len(seq_group_metadata_list) + if self.lora_checkpoint is not None: + # "0" indicates NxDI to use the base model for inference + adapter_ids = ["0"] * batch_size + for idx, seq_group_metadata in enumerate(seq_group_metadata_list): + if seq_group_metadata.lora_request is not None: + adapter_ids[ + idx] = seq_group_metadata.lora_request.lora_name + + # convert adapter_ids from strings to integers + adapter_ids = self.lora_checkpoint.convert_adapter_ids_to_indices( + adapter_ids, batch_size) + else: + adapter_ids = torch.zeros((batch_size), dtype=torch.int32) + + return adapter_ids + + def prepare_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + virtual_engine: int = 0, + finished_requests_ids: Optional[List[str]] = None + ) -> ModelInputForNeuron: + # NOTE: We assume that all sequences in the group are all prompts or + # all decodes. + is_prompt = seq_group_metadata_list[0].is_prompt + # Prepare input tensors. 
+ if is_prompt: + (input_tokens, input_positions, input_block_ids, seq_lens, + multi_modal_kwargs + ) = self._prepare_prompt(seq_group_metadata_list) + else: + (input_tokens, input_positions, + input_block_ids) = self._prepare_decode(seq_group_metadata_list) + seq_lens = None + + if not self._on_device_sampling_disabled: + for seq_group_metadata in seq_group_metadata_list: + sampling_params = seq_group_metadata.sampling_params + top_k, top_p, temperature = ( + self._convert_to_neuron_sampling_params(sampling_params)) + sampling_params.top_k = top_k + sampling_params.top_p = top_p + sampling_params.temperature = temperature + + # we need multi_modal_data for later tokens as well + multi_modal_kwargs_list: List[MultiModalKwargs] = [] + for seq_group_metadata in seq_group_metadata_list: + mm_data = seq_group_metadata.multi_modal_data + if mm_data: + multi_modal_kwargs_list.append(mm_data) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) + + lora_adapter_ids = self._get_lora_adapter_ids(seq_group_metadata_list) + + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + # query_lens is not needed if chunked prefill is not + # supported. Since neuron worker doesn't support chunked prefill + # just use seq_lens instead. 
+ seq_lens, + self.device, + self.pin_memory, + generators=self.get_generators(finished_requests_ids)) + + return ModelInputForNeuron(input_tokens=input_tokens, + input_positions=input_positions, + input_block_ids=input_block_ids, + sampling_metadata=sampling_metadata, + multi_modal_kwargs=multi_modal_kwargs, + adapter_ids=lora_adapter_ids) + + def remove_all_loras(self): + raise NotImplementedError( + "Managing LoRAs is only supported through the " + "lora_modules parameter in override_neuron_config") + + def set_active_loras(self, lora_requests: Set[LoRARequest], + lora_mapping: LoRAMapping) -> None: + raise NotImplementedError( + "Managing LoRAs is only supported through the " + "lora_modules parameter in override_neuron_config") + + def add_lora(self, lora_request: LoRARequest): + logger.warning( + "Adding LoRAs is only supported through the " + "lora_modules parameter in override_neuron_config. If you supplied " + "the parameter, you can ignore this warning. Ignoring " + "lora request: %s", lora_request) + + def remove_lora(self, lora_id: int) -> bool: + raise NotImplementedError( + "Managing LoRAs is only supported through the " + "lora_modules parameter in override_neuron_config") + + def pin_lora(self, lora_id: int) -> bool: + raise NotImplementedError( + "Managing LoRAs is only supported through the " + "lora_modules parameter in override_neuron_config") + + def list_loras(self) -> Set[int]: + raise NotImplementedError( + "Managing LoRAs is only supported through the " + "lora_modules parameter in override_neuron_config") diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py index fdb7353f2f9ce..912e04c435f54 100644 --- a/vllm/worker/pooling_model_runner.py +++ b/vllm/worker/pooling_model_runner.py @@ -119,10 +119,14 @@ class PoolingModelRunner( input_ids=model_input.input_tokens, positions=model_input.input_positions, intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs(multi_modal_kwargs, -
device=self.device), + **MultiModalKwargs.as_kwargs( + multi_modal_kwargs, + dtype=self.model_config.dtype, + device=self.device, + ), **cross_enc_kwargs, - **seqlen_agnostic_kwargs) + **seqlen_agnostic_kwargs, + ) if (self.observability_config is not None and self.observability_config.collect_model_forward_time): diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 891ed66599dca..4bb9bea022f99 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -76,8 +76,7 @@ class TPUWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase): ) ensure_model_parallel_initialized( self.parallel_config.tensor_parallel_size, - self.parallel_config.pipeline_parallel_size, - self.parallel_config.enable_expert_parallel) + self.parallel_config.pipeline_parallel_size) # Device initialization should happen after initializing the distributed # runtime. diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py index d925f088357b5..e2854bcb37cef 100644 --- a/vllm/worker/utils.py +++ b/vllm/worker/utils.py @@ -14,7 +14,7 @@ def assert_enc_dec_mr_supported_scenario( a supported scenario. 
''' - # Reminder: Please update docs/source/features/compatibility_matrix.md + # Reminder: Please update docs/features/compatibility_matrix.md # If the feature combo become valid if enc_dec_mr.cache_config.enable_prefix_caching: diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 41546462e5c4b..2a43172719342 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -128,6 +128,8 @@ class Worker(LocalOrDistributedWorkerBase): if self.profiler is None: raise RuntimeError("Profiler is not enabled.") self.profiler.stop() + print( + self.profiler.key_averages().table(sort_by="self_cuda_time_total")) def sleep(self, level: int = 1) -> None: free_bytes_before_sleep = torch.cuda.mem_get_info()[0] @@ -234,10 +236,9 @@ class Worker(LocalOrDistributedWorkerBase): Then, it calculate the maximum possible number of GPU and CPU blocks that can be allocated with the remaining free memory. - :::{tip} - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - ::: + Tip: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. """ # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. 
@@ -530,8 +531,7 @@ def init_worker_distributed_environment( init_distributed_environment(parallel_config.world_size, rank, distributed_init_method, local_rank) ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size, - parallel_config.enable_expert_parallel) + parallel_config.pipeline_parallel_size) ensure_kv_transfer_initialized(vllm_config) diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 7042b575aa787..79fa7d2c73e88 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -562,9 +562,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): input_ids=model_input.input_tokens, positions=model_input.input_positions, intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs - or {}, - device=self.device)) + **MultiModalKwargs.as_kwargs( + model_input.multi_modal_kwargs or {}, + dtype=self.model_config.dtype, + device=self.device, + ), + ) # Compute the logits in the last pipeline stage. if not get_pp_group().is_last_rank: return hidden_or_intermediate_states diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index 65085f80f97ae..a5109a982cbfe 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -93,10 +93,9 @@ class XPUWorker(LoRANotSupportedWorkerBase, Worker): Then, it calculate the maximum possible number of GPU and CPU blocks that can be allocated with the remaining free memory. - :::{tip} - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - ::: + Tip: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. """ # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. 
@@ -176,8 +175,7 @@ class XPUWorker(LoRANotSupportedWorkerBase, Worker): ensure_model_parallel_initialized( parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size, - parallel_config.enable_expert_parallel) + parallel_config.pipeline_parallel_size) # global all_reduce needed for overall oneccl warm up torch.distributed.all_reduce(torch.zeros(1).xpu())