diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh
index 56bb5cedaa0a9..df805e0850806 100755
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -23,8 +23,8 @@ To download the wheel (by version):
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
 \`\`\`
 
 To download and upload the image:
@@ -45,9 +45,10 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker push vllm/vllm-openai:latest-aarch64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 
-docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+docker manifest rm vllm/vllm-openai:latest
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker manifest push vllm/vllm-openai:latest
 docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
 \`\`\`
-EOF
\ No newline at end of file
+EOF
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
new file mode 100755
index 0000000000000..d0036f24c8d04
--- /dev/null
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# This script builds the CPU docker image and runs the offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# Allow binding to different cores
+CORE_RANGE=${CORE_RANGE:-0-16}
+OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
+NUMA_NODE=${NUMA_NODE:-0}
+
+export CMAKE_BUILD_PARALLEL_LEVEL=32
+
+# Setup cleanup
+remove_docker_container() {
+  set -e;
+  docker rm -f cpu-test-"$NUMA_NODE" || true;
+}
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Try building the docker image
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+
+# Run the image, setting --shm-size=4g for tensor parallelism.
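+# The container stays up in detached mode; the test steps below exec into it by name.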
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+
+function cpu_tests() {
+  set -e
+  export NUMA_NODE=$2
+
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pip list"
+
+  # offline inference
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+
+  # Run kernel tests
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -x -v -s tests/kernels/test_onednn.py
+    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
+
+  # basic online serving
+  docker exec cpu-test-"$NUMA_NODE" bash -c '
+    set -e
+    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve meta-llama/Llama-3.2-3B-Instruct --max-model-len 2048 &
+    server_pid=$!
+    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+    vllm bench serve \
+      --backend vllm \
+      --dataset-name random \
+      --model meta-llama/Llama-3.2-3B-Instruct \
+      --num-prompts 20 \
+      --endpoint /v1/completions
+    kill -s SIGTERM $server_pid &'
+}
+
+# All CPU tests are expected to finish in under 40 minutes.
+export -f cpu_tests
+timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
index 39ea180173081..3728f73fa2a36 100755
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -25,20 +25,22 @@ function cpu_tests() {
 
   # offline inference
   podman exec -it "$container_id" bash -c "
+    export TORCH_COMPILE_DISABLE=1
     set -xve
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
 
   # Run basic model test
   podman exec -it "$container_id" bash -c "
+    export TORCH_COMPILE_DISABLE=1
     set -evx
     pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
-    pip install sentence-transformers datamodel_code_generator
+    pip install sentence-transformers datamodel_code_generator tblib
 
     # Note: disable Bart until supports V1
     # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
-    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
-    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
-    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
+    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-openai-community/gpt2]
+    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-facebook/opt-125m]
+    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
     pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
     # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 7479c43977d78..2267718f75ca5 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -73,12 +73,11 @@ function cpu_tests() { pytest -x -s -v \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs" - # Note: disable it until supports V1 - # Run AWQ test - # docker exec cpu-test-"$NUMA_NODE" bash -c " - # set -e - # pytest -x -s -v \ - # tests/quantization/test_ipex_quant.py" + # Run AWQ/GPTQ test + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pytest -x -s -v \ + tests/quantization/test_cpu_wna16.py" # Run multi-lora tests docker exec cpu-test-"$NUMA_NODE" bash -c " diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh index 5302f524a0ae4..8106f50f18f66 100644 --- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh +++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh @@ -17,7 +17,17 @@ wait_for_server() { } MODEL="deepseek-ai/DeepSeek-V2-lite" -BACKENDS=("deepep_high_throughput" "deepep_low_latency") + +# Set BACKENDS based on platform +if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then + # ROCm platform + BACKENDS=("allgather_reducescatter") + # Disable MOE padding for ROCm since it is causing eplb to fail + export VLLM_ROCM_MOE_PADDING=0 +else + # Non-ROCm platform (CUDA/other) + BACKENDS=("deepep_high_throughput" "deepep_low_latency") +fi cleanup() { if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then diff --git a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh index a5135299297e2..0d06f53a183d0 100644 --- a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh +++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh @@ -17,7 +17,16 @@ wait_for_server() { } MODEL="QWen/Qwen3-30B-A3B-FP8" -BACKENDS=("deepep_high_throughput" "deepep_low_latency") +# Set BACKENDS based on platform +if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then + # ROCm platform + BACKENDS=("allgather_reducescatter") + # Disable MOE padding for ROCm since it is causing eplb to fail + export VLLM_ROCM_MOE_PADDING=0 +else + # Non-ROCm platform (CUDA/other) + BACKENDS=("deepep_high_throughput" "deepep_low_latency") +fi cleanup() { if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 2471b509a9fff..f098e23866eb3 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -61,7 +61,7 @@ steps: - pytest -v -s -m 'not cpu_test' multimodal - pytest -v -s utils_ -- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins +- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins timeout_in_minutes: 10 mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 @@ -73,6 +73,7 @@ steps: - tests/multimodal - tests/standalone_tests/lazy_imports.py - tests/transformers_utils + - tests/config no_gpu: true commands: - python3 standalone_tests/lazy_imports.py @@ -80,6 +81,7 @@ steps: - 
pytest -v -s test_outputs.py - pytest -v -s -m 'cpu_test' multimodal - pytest -v -s transformers_utils + - pytest -v -s config - label: Python-only Installation Test # 10min timeout_in_minutes: 20 @@ -187,7 +189,7 @@ steps: - tests/distributed/test_utils - tests/distributed/test_pynccl - tests/distributed/test_events - - tests/compile/test_basic_correctness + - tests/compile/fullgraph/test_basic_correctness.py - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py - tests/examples/offline_inference/data_parallel.py @@ -215,7 +217,7 @@ steps: - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s distributed/test_utils.py - - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_events.py - pytest -v -s distributed/test_symm_mem_allreduce.py @@ -390,6 +392,15 @@ steps: commands: - pytest -v -s v1/attention +- label: V1 Test attention (B200) # 10min + timeout_in_minutes: 30 + gpu: b200 + source_file_dependencies: + - vllm/v1/attention + - tests/v1/attention + commands: + - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this + - label: V1 Test others (CPU) # 5 mins mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 @@ -493,17 +504,12 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_pass_manager.py - - pytest -v -s compile/test_fusion.py - - pytest -v -s compile/test_fusion_attn.py - - pytest -v -s compile/test_functionalization.py - - pytest -v -s compile/test_silu_mul_quant_fusion.py - # - pytest -v -s compile/test_sequence_parallelism.py - # - pytest -v -s compile/test_async_tp.py - - pytest -v -s compile/test_fusion_all_reduce.py - - pytest -v -s compile/test_decorator.py - - pytest -v -s compile/test_noop_elimination.py - - pytest -v -s compile/test_aot_compile.py + # Run unit tests defined directly under compile/, + # not including subdirectories, which are usually heavier + # tests covered elsewhere. + # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -515,9 +521,11 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_basic_correctness.py - - pytest -v -s compile/test_multimodal_compile.py - - pytest -v -s compile/piecewise/ + # Run smoke tests under fullgraph directory, except test_full_graph.py + # as it is a heavy test that is covered in other steps. 
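+    # (test_full_graph.py runs separately in the "PyTorch Fullgraph Test" step.)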
+ # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Test # 27min timeout_in_minutes: 40 @@ -529,10 +537,10 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' # Limit to no custom ops to reduce running time # Wrap with quotes to escape yaml and avoid starting -k string with a - - - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'" + - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" - label: Cudagraph test timeout_in_minutes: 20 @@ -697,7 +705,7 @@ steps: - vllm/model_executor/models/whisper.py commands: # LMEval # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442 - - pytest -s entrypoints/openai/correctness/ --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py + - pytest -s entrypoints/openai/correctness/ - label: OpenAI-Compatible Tool Use # 23 min timeout_in_minutes: 35 @@ -746,6 +754,7 @@ steps: torch_nightly: true source_file_dependencies: - vllm/model_executor/models/ + - vllm/transformers_utils/ - tests/models/test_initialization.py commands: # Only when vLLM model source is modified - test initialization of a large @@ -998,12 +1007,12 @@ steps: optional: true commands: - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py + - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' - pytest -v -s tests/models/test_transformers.py - - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py + # - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - python3 examples/offline_inference/basic/chat.py - - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper @@ -1048,7 +1057,7 @@ steps: - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_flashinfer.py -- label: Blackwell Fusion Tests # 30 min +- label: Blackwell Fusion and Compile Tests # 30 min timeout_in_minutes: 40 working_dir: "/vllm-workspace/" gpu: b200 @@ -1066,10 +1075,12 @@ steps: - pytest -v -s tests/compile/test_fusion_attn.py - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py # this runner has 2 GPUs available even though num_gpus=2 is not set - - pytest -v -s tests/compile/test_fusion_all_reduce.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py # Limit to Inductor partition, no custom ops, and allreduce & 
attn fusion to reduce running time # Wrap with quotes to escape yaml - - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'" + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + - pytest -v -s tests/compile/distributed/test_full_graph.py::test_fp8_kv_scale_compile - label: Blackwell Fusion E2E Tests # 30 min timeout_in_minutes: 40 @@ -1086,20 +1097,18 @@ steps: - vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/test_fusions_e2e.py - - tests/compile/test_full_graph.py + - tests/compile/distributed/test_fusions_e2e.py + - tests/compile/fullgraph/test_full_graph.py commands: - nvidia-smi # Run all e2e fusion tests - pytest -v -s tests/compile/test_fusions_e2e.py - # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile - label: ROCm GPT-OSS Eval timeout_in_minutes: 60 working_dir: "/vllm-workspace/" agent_pool: mi325_1 - mirror_hardwares: [amdproduction] + mirror_hardwares: [amdexperimental, amdproduction] optional: true # run on nightlies source_file_dependencies: - tests/evals/gpt_oss @@ -1198,7 +1207,7 @@ steps: - vllm/worker/worker_base.py - vllm/v1/engine/ - vllm/v1/worker/ - - tests/compile/test_basic_correctness.py + - tests/compile/fullgraph/test_basic_correctness.py - tests/compile/test_wrapper.py - tests/distributed/ - tests/entrypoints/llm/test_collective_rpc.py @@ -1211,7 +1220,7 @@ steps: - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py - - pytest -v -s ./compile/test_basic_correctness.py + - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' @@ -1311,7 +1320,10 @@ steps: - pytest -v -s -x lora/test_llama_tp.py - pytest -v -s -x lora/test_llm_with_multi_loras.py - pytest -v -s -x lora/test_olmoe_tp.py - - pytest -v -s -x lora/test_gptoss_tp.py + + # Disabled for now because MXFP4 backend on non-cuda platform + # doesn't support LoRA yet + #- pytest -v -s -x lora/test_gptoss_tp.py - label: Weight Loading Multiple GPU Test # 33min @@ -1326,7 +1338,7 @@ steps: - vllm/ - tests/weight_loading commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt - label: Weight Loading Multiple GPU Test - Large Models # optional mirror_hardwares: [amdexperimental] @@ -1334,13 +1346,12 @@ steps: # grade: Blocking working_dir: "/vllm-workspace/tests" num_gpus: 2 - gpu: a100 optional: true source_file_dependencies: - vllm/ - tests/weight_loading commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt + - bash weight_loading/run_model_weight_loading_test.sh -c 
weight_loading/models-large-amd.txt - label: NixlConnector PD accuracy tests (Distributed) # 30min mirror_hardwares: [amdexperimental] @@ -1417,10 +1428,12 @@ steps: working_dir: "/vllm-workspace/" num_gpus: 2 commands: - - pytest -v -s tests/compile/test_async_tp.py - - pytest -v -s tests/compile/test_sequence_parallelism.py - - pytest -v -s tests/compile/test_fusion_all_reduce.py - - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + - pytest -v -s tests/compile/distributed/test_async_tp.py + - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" + - pytest -v -s tests/compile/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - pytest -v -s tests/v1/distributed/test_dbo.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 4ac76aba67b9c..7a46e919f93bf 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -167,7 +167,7 @@ steps: - tests/distributed/test_utils - tests/distributed/test_pynccl - tests/distributed/test_events - - tests/compile/test_basic_correctness + - tests/compile/fullgraph/test_basic_correctness.py - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py - tests/examples/offline_inference/data_parallel.py @@ -197,7 +197,7 @@ steps: - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s distributed/test_utils.py - - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_events.py - pytest -v -s distributed/test_symm_mem_allreduce.py @@ -346,6 +346,18 @@ steps: commands: - pytest -v -s v1/attention +- label: Batch Invariance Tests (H100) # 10min + timeout_in_minutes: 25 + gpu: h100 + source_file_dependencies: + - vllm/ + - tests/v1/determinism/ + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pip install pytest-timeout pytest-forked + - pytest -v -s v1/determinism/test_batch_invariance.py + - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py + - label: V1 Test attention (B200) # 10min timeout_in_minutes: 30 gpu: b200 @@ -445,18 +457,12 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_graph_partition.py - - pytest -v -s compile/test_config.py - - pytest -v -s compile/test_pass_manager.py - - pytest -v -s compile/test_fusion.py - - pytest -v -s compile/test_fusion_attn.py - - pytest -v -s compile/test_functionalization.py - - pytest -v -s compile/test_silu_mul_quant_fusion.py - - pytest -v -s compile/test_fusion_all_reduce.py - - pytest -v -s compile/test_decorator.py - - pytest -v -s compile/test_noop_elimination.py - - pytest -v -s compile/test_aot_compile.py - - pytest -v -s compile/test_qk_norm_rope_fusion.py + # Run unit tests defined directly under compile/, + # not including subdirectories, which are usually heavier + # 
tests covered elsewhere. + # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -466,9 +472,11 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_basic_correctness.py - - pytest -v -s compile/test_multimodal_compile.py - - pytest -v -s compile/piecewise/ + # Run smoke tests under fullgraph directory, except test_full_graph.py + # as it is a heavy test that is covered in other steps. + # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Test # 27min timeout_in_minutes: 40 @@ -479,10 +487,10 @@ steps: - tests/compile commands: # fp8 kv scales not supported on sm89, tested on Blackwell instead - - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' # Limit to no custom ops to reduce running time # Wrap with quotes to escape yaml and avoid starting -k string with a - - - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" + - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" - label: Cudagraph test timeout_in_minutes: 20 @@ -554,6 +562,25 @@ steps: commands: - pytest -v -s kernels/mamba +- label: Kernels DeepGEMM Test (H100) + timeout_in_minutes: 45 + gpu: h100 + num_gpus: 1 + source_file_dependencies: + - tools/install_deepgemm.sh + - vllm/utils/deep_gemm.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization + - tests/kernels/quantization/test_block_fp8.py + - tests/kernels/moe/test_deepgemm.py + - tests/kernels/moe/test_batched_deepgemm.py + - tests/kernels/attention/test_deepgemm_attention.py + commands: + - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm + - pytest -v -s kernels/moe/test_deepgemm.py + - pytest -v -s kernels/moe/test_batched_deepgemm.py + - pytest -v -s kernels/attention/test_deepgemm_attention.py + - label: Model Executor Test # 23min timeout_in_minutes: 35 torch_nightly: true @@ -664,6 +691,7 @@ steps: torch_nightly: true source_file_dependencies: - vllm/model_executor/models/ + - vllm/transformers_utils/ - tests/models/test_initialization.py commands: # Only when vLLM model source is modified - test initialization of a large @@ -876,12 +904,12 @@ steps: optional: true commands: - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' + - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)' - pytest -v -s tests/models/test_transformers.py # - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s 
tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' + - pytest -v -s tests/models/multimodal/test_mapping.py - python3 examples/offline_inference/basic/chat.py - # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper @@ -925,6 +953,7 @@ steps: - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_flashinfer.py + - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - label: Blackwell Fusion and Compile Tests # 30 min timeout_in_minutes: 40 @@ -934,22 +963,29 @@ steps: - csrc/quantization/fp4/ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/worker/ + - vllm/v1/cudagraph_dispatcher.py - vllm/compilation/ # can affect pattern matching - vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/test_fusion_attn.py + - tests/compile/test_silu_mul_quant_fusion.py + - tests/compile/distributed/test_fusion_all_reduce.py + - tests/compile/distributed/test_fusions_e2e.py + - tests/compile/fullgraph/test_full_graph.py commands: - nvidia-smi - pytest -v -s tests/compile/test_fusion_attn.py - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py # this runner has 2 GPUs available even though num_gpus=2 is not set - - pytest -v -s tests/compile/test_fusion_all_reduce.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # Wrap with quotes to escape yaml - - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile + - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - label: Blackwell Fusion E2E Tests # 30 min timeout_in_minutes: 40 @@ -966,12 +1002,11 @@ steps: - vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/test_fusions_e2e.py - - tests/compile/test_full_graph.py + - tests/compile/distributed/test_fusions_e2e.py commands: - nvidia-smi # Run all e2e fusion tests - - pytest -v -s tests/compile/test_fusions_e2e.py + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py - label: Blackwell GPT-OSS Eval timeout_in_minutes: 60 @@ -1069,7 +1104,7 @@ steps: - vllm/worker/worker_base.py - vllm/v1/engine/ - vllm/v1/worker/ - - tests/compile/test_basic_correctness.py + - tests/compile/fullgraph/test_basic_correctness.py - tests/compile/test_wrapper.py - tests/distributed/ - tests/entrypoints/llm/test_collective_rpc.py @@ -1084,7 +1119,7 @@ steps: - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s 
entrypoints/llm/test_collective_rpc.py - - pytest -v -s ./compile/test_basic_correctness.py + - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' @@ -1264,10 +1299,10 @@ steps: working_dir: "/vllm-workspace/" num_gpus: 2 commands: - - pytest -v -s tests/compile/test_async_tp.py - - pytest -v -s tests/compile/test_sequence_parallelism.py - - pytest -v -s tests/compile/test_fusion_all_reduce.py - - "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'" + - pytest -v -s tests/compile/distributed/test_async_tp.py + - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 6e178bb690c56..3247408e1163e 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -9,6 +9,7 @@ /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety /vllm/model_executor/layers/mamba @tdoublep /vllm/model_executor/model_loader @22quinn +/vllm/model_executor/layers/batch_invariant.py @yewentao256 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa /vllm/vllm_flash_attn @LucasWilkinson /vllm/lora @jeejeelee @@ -35,6 +36,9 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /vllm/v1/kv_cache_interface.py @heheda12345 /vllm/v1/offloading @ApostaC +# Model runner V2 +/vllm/v1/worker/gpu @WoosukKwon + # Test ownership /.buildkite/lm-eval-harness @mgoin /tests/distributed/test_multi_node_assignment.py @youkaichao @@ -56,6 +60,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/v1/kv_connector/nixl_integration @NickLucche /tests/v1/kv_connector @ApostaC /tests/v1/offloading @ApostaC +/tests/v1/determinism @yewentao256 # Transformers modeling backend /vllm/model_executor/models/transformers @hmellor diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml index 8d40aa587bf00..a183033c9adde 100644 --- a/.github/workflows/macos-smoke-test.yml +++ b/.github/workflows/macos-smoke-test.yml @@ -1,12 +1,15 @@ name: macOS Apple Silicon Smoke Test on: + push: + branches: + - main workflow_dispatch: # Manual trigger jobs: macos-m1-smoke-test: runs-on: macos-latest - timeout-minutes: 20 + timeout-minutes: 30 steps: - uses: actions/checkout@v4 @@ -19,28 +22,29 @@ jobs: pyproject.toml python-version: '3.12' - - name: Install dependencies + - name: Create virtual environment run: | - uv pip install -r requirements/cpu-build.txt - uv pip install -r requirements/cpu.txt + uv venv + echo "$GITHUB_WORKSPACE/.venv/bin" >> "$GITHUB_PATH" - - name: Build vLLM - run: uv pip install -v -e . + - name: Install dependencies and build vLLM + run: | + uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match + uv pip install -e . 
env: CMAKE_BUILD_PARALLEL_LEVEL: 4 - name: Verify installation run: | python -c "import vllm; print(f'vLLM version: {vllm.__version__}')" - python -c "import torch; print(f'PyTorch: {torch.__version__}')" - name: Smoke test vllm serve - timeout-minutes: 10 run: | # Start server in background vllm serve Qwen/Qwen3-0.6B \ - --max-model-len=2048 \ + --max-model-len=2K \ --load-format=dummy \ + --hf-overrides '{"num_hidden_layers": 2}' \ --enforce-eager \ --port 8000 & diff --git a/.gitignore b/.gitignore index 50070d7898fe6..7cda86478664f 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,9 @@ # vllm-flash-attn built from source vllm/vllm_flash_attn/* +# OpenAI triton kernels copied from source +vllm/third_party/triton_kernels/* + # triton jit .triton diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a37040edbf1a..a4cf51d17e982 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -307,7 +307,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. - set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use") + set(CUTLASS_REVISION "v4.2.1") # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) @@ -512,9 +512,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x) # require CUDA 12.8 or later if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") else() - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) set(SRCS @@ -619,9 +619,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # FP4 Archs and flags if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) - cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") else() - cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) set(SRCS @@ -695,7 +695,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") else() - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu") @@ -741,9 +741,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") else() - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS 
"10.0a;10.1a;10.3a" "${CUDA_ARCHS}") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu") @@ -1030,6 +1030,11 @@ if(VLLM_GPU_LANG STREQUAL "HIP") WITH_SOABI) endif() +# For CUDA and HIP builds also build the triton_kernels external package. +if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP") + include(cmake/external_projects/triton_kernels.cmake) +endif() + # For CUDA we also build and ship some external projects. if (VLLM_GPU_LANG STREQUAL "CUDA") include(cmake/external_projects/flashmla.cmake) diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index 904f805349148..d072c03c440b2 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -5,11 +5,12 @@ import argparse import asyncio import logging import os +import time +import uuid +from urllib.parse import urlparse import aiohttp from quart import Quart, Response, make_response, request -from rate_limiter import RateLimiter -from request_queue import RequestQueue # Configure logging logging.basicConfig(level=logging.INFO) @@ -24,26 +25,8 @@ def parse_args(): parser.add_argument( "--timeout", type=float, - default=300, - help="Timeout for backend service requests in seconds (default: 300)", - ) - parser.add_argument( - "--max-concurrent", - type=int, - default=100, - help="Maximum concurrent requests to backend services (default: 100)", - ) - parser.add_argument( - "--queue-size", - type=int, - default=500, - help="Maximum number of requests in the queue (default: 500)", - ) - parser.add_argument( - "--rate-limit", - type=int, - default=40, - help="Maximum requests per second (default: 40)", + default=6 * 60 * 60, + help="Timeout for backend service requests in seconds (default: 21600)", ) parser.add_argument( "--port", @@ -54,14 +37,32 @@ def parse_args(): parser.add_argument( "--prefill-url", type=str, - default="http://localhost:8100/v1/completions", - help="Prefill service endpoint URL", + default="http://localhost:8100", + help="Prefill service base URL (protocol + host[:port])", ) parser.add_argument( "--decode-url", type=str, - default="http://localhost:8200/v1/completions", - help="Decode service endpoint URL", + default="http://localhost:8200", + help="Decode service base URL (protocol + host[:port])", + ) + parser.add_argument( + "--kv-host", + type=str, + default="localhost", + help="Hostname or IP used by KV transfer (default: localhost)", + ) + parser.add_argument( + "--prefill-kv-port", + type=int, + default=14579, + help="Prefill KV port (default: 14579)", + ) + parser.add_argument( + "--decode-kv-port", + type=int, + default=14580, + help="Decode KV port (default: 14580)", ) return parser.parse_args() @@ -73,70 +74,129 @@ def main(): # Initialize configuration using command line parameters AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout) - MAX_CONCURRENT_REQUESTS = args.max_concurrent - REQUEST_QUEUE_SIZE = args.queue_size - RATE_LIMIT = args.rate_limit PREFILL_SERVICE_URL = args.prefill_url DECODE_SERVICE_URL = args.decode_url PORT = args.port + PREFILL_KV_ADDR = f"{args.kv_host}:{args.prefill_kv_port}" + DECODE_KV_ADDR = f"{args.kv_host}:{args.decode_kv_port}" + + logger.info( + "Proxy resolved KV addresses -> prefill: %s, decode: %s", + PREFILL_KV_ADDR, + DECODE_KV_ADDR, + ) + app = Quart(__name__) - # Initialize 
the rate limiter and request queue - rate_limiter = RateLimiter(RATE_LIMIT) - request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE) - - # Attach the configuration object to the application instance + # Attach the configuration object to the application instance so helper + # coroutines can read the resolved backend URLs and timeouts without using + # globals. app.config.update( { "AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT, - "rate_limiter": rate_limiter, - "request_queue": request_queue, "PREFILL_SERVICE_URL": PREFILL_SERVICE_URL, "DECODE_SERVICE_URL": DECODE_SERVICE_URL, + "PREFILL_KV_ADDR": PREFILL_KV_ADDR, + "DECODE_KV_ADDR": DECODE_KV_ADDR, } ) - # Start queue processing on app startup - @app.before_serving - async def startup(): - """Start request processing task when app starts serving""" - asyncio.create_task(request_queue.process()) + def _normalize_base_url(url: str) -> str: + """Remove any trailing slash so path joins behave predictably.""" + return url.rstrip("/") - async def forward_request(url, data): - """Forward request to backend service with rate limiting and error handling""" - headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + def _get_host_port(url: str) -> str: + """Return the hostname:port portion for logging and KV headers.""" + parsed = urlparse(url) + host = parsed.hostname or "localhost" + port = parsed.port + if port is None: + port = 80 if parsed.scheme == "http" else 443 + return f"{host}:{port}" - # Use rate limiter as context manager - async with ( - rate_limiter, - aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, - ): - try: - async with session.post( - url=url, json=data, headers=headers - ) as response: - if response.status == 200: - # Stream response chunks - async for chunk_bytes in response.content.iter_chunked(1024): - yield chunk_bytes - else: - # Handle backend service errors - error_text = await response.text() - logger.error( - "Backend service error: %s - %s", - response.status, - error_text, - ) - yield b'{"error": "Backend service error"}' - except aiohttp.ClientError as e: - # Handle connection errors - logger.error("Connection error to %s: %s", url, str(e)) - yield b'{"error": "Service unavailable"}' - except asyncio.TimeoutError: - # Handle timeout errors - logger.error("Timeout connecting to %s", url) - yield b'{"error": "Service timeout"}' + PREFILL_BASE = _normalize_base_url(PREFILL_SERVICE_URL) + DECODE_BASE = _normalize_base_url(DECODE_SERVICE_URL) + KV_TARGET = _get_host_port(DECODE_SERVICE_URL) + + def _build_headers(request_id: str) -> dict[str, str]: + """Construct the headers expected by vLLM's P2P disagg connector.""" + headers: dict[str, str] = {"X-Request-Id": request_id, "X-KV-Target": KV_TARGET} + api_key = os.environ.get("OPENAI_API_KEY") + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + return headers + + async def _run_prefill( + request_path: str, + payload: dict, + headers: dict[str, str], + request_id: str, + ): + url = f"{PREFILL_BASE}{request_path}" + start_ts = time.perf_counter() + logger.info("[prefill] start request_id=%s url=%s", request_id, url) + try: + async with ( + aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, + session.post(url=url, json=payload, headers=headers) as resp, + ): + if resp.status != 200: + error_text = await resp.text() + raise RuntimeError( + f"Prefill backend error {resp.status}: {error_text}" + ) + await resp.read() + logger.info( + "[prefill] done request_id=%s status=%s elapsed=%.2fs", + request_id, + resp.status, + 
time.perf_counter() - start_ts, + ) + except asyncio.TimeoutError as exc: + raise RuntimeError(f"Prefill service timeout at {url}") from exc + except aiohttp.ClientError as exc: + raise RuntimeError(f"Prefill service unavailable at {url}") from exc + + async def _stream_decode( + request_path: str, + payload: dict, + headers: dict[str, str], + request_id: str, + ): + url = f"{DECODE_BASE}{request_path}" + # Stream tokens from the decode service once the prefill stage has + # materialized KV caches on the target workers. + logger.info("[decode] start request_id=%s url=%s", request_id, url) + try: + async with ( + aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, + session.post(url=url, json=payload, headers=headers) as resp, + ): + if resp.status != 200: + error_text = await resp.text() + logger.error( + "Decode backend error %s - %s", resp.status, error_text + ) + err_msg = ( + '{"error": "Decode backend error ' + str(resp.status) + '"}' + ) + yield err_msg.encode() + return + logger.info( + "[decode] streaming response request_id=%s status=%s", + request_id, + resp.status, + ) + async for chunk_bytes in resp.content.iter_chunked(1024): + yield chunk_bytes + logger.info("[decode] finished streaming request_id=%s", request_id) + except asyncio.TimeoutError: + logger.error("Decode service timeout at %s", url) + yield b'{"error": "Decode service timeout"}' + except aiohttp.ClientError as exc: + logger.error("Decode service error at %s: %s", url, exc) + yield b'{"error": "Decode service unavailable"}' async def process_request(): """Process a single request through prefill and decode stages""" @@ -146,13 +206,27 @@ def main(): # Create prefill request (max_tokens=1) prefill_request = original_request_data.copy() prefill_request["max_tokens"] = 1 + if "max_completion_tokens" in prefill_request: + prefill_request["max_completion_tokens"] = 1 # Execute prefill stage - async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request): - continue + # The request id encodes both KV socket addresses so the backend can + # shuttle tensors directly via NCCL once the prefill response + # completes. + request_id = ( + f"___prefill_addr_{PREFILL_KV_ADDR}___decode_addr_" + f"{DECODE_KV_ADDR}_{uuid.uuid4().hex}" + ) + + headers = _build_headers(request_id) + await _run_prefill(request.path, prefill_request, headers, request_id) # Execute decode stage and stream response - generator = forward_request(DECODE_SERVICE_URL, original_request_data) + # Pass the unmodified user request so the decode phase can continue + # sampling with the already-populated KV cache. 
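+        # The same request_id headers are reused here; with the default CLI
+        # flags the id looks like (illustrative):
+        #   ___prefill_addr_localhost:14579___decode_addr_localhost:14580_<hex>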
+ generator = _stream_decode( + request.path, original_request_data, headers, request_id + ) response = await make_response(generator) response.timeout = None # Disable timeout for streaming response return response @@ -168,23 +242,10 @@ def main(): @app.route("/v1/completions", methods=["POST"]) async def handle_request(): """Handle incoming API requests with concurrency and rate limiting""" - # Create task for request processing - task = asyncio.create_task(process_request()) - - # Enqueue request or reject if queue is full - if not await request_queue.enqueue(task): - return Response( - response=b'{"error": "Server busy, try again later"}', - status=503, - content_type="application/json", - ) - try: - # Return the response from the processing task - return await task + return await process_request() except asyncio.CancelledError: - # Handle task cancellation (timeout or queue full) - logger.warning("Request cancelled due to timeout or queue full") + logger.warning("Request cancelled") return Response( response=b'{"error": "Request cancelled"}', status=503, diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py index 027f67ad4db69..e07d6c776bc00 100644 --- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py +++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py @@ -255,8 +255,8 @@ def bench_run( torch.cuda.synchronize() # Timing - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) latencies = [] for _ in range(num_iters): diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index c99951aa27826..a1af0b8aec3d0 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -185,8 +185,8 @@ def benchmark_config( graph.replay() torch.cuda.synchronize() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) latencies: list[float] = [] for i in range(num_iters): diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index efa5a7386027e..b8913a217c608 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -105,8 +105,8 @@ def benchmark_permute( graph.replay() torch.cuda.synchronize() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) latencies: list[float] = [] for i in range(num_iters): @@ -241,8 +241,8 @@ def benchmark_unpermute( graph.replay() torch.cuda.synchronize() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) latencies: list[float] = [] for i in range(num_iters): diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py index cb848d2bf579e..83bd91917508f 100644 --- a/benchmarks/kernels/benchmark_mrope.py +++ b/benchmarks/kernels/benchmark_mrope.py @@ -6,7 +6,7 @@ # # The CSV file (named with current date/time) contains these columns: # model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position, -# 
rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99, +# is_neox_style, rope_parameters, dtype, torch_mean, torch_median, torch_p99, # torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max, # speedup # @@ -86,9 +86,8 @@ def benchmark_mrope( num_heads: int, num_kv_heads: int, max_position: int = 8192, - rope_theta: float = 10000, is_neox_style: bool = True, - rope_scaling: dict[str, Any] = None, + rope_parameters: dict[str, Any] | None = None, dtype: torch.dtype = torch.bfloat16, seed: int = 0, warmup_iter: int = 10, @@ -102,9 +101,8 @@ def benchmark_mrope( head_size=head_dim, rotary_dim=head_dim, max_position=max_position, - base=rope_theta, is_neox_style=is_neox_style, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dtype=dtype, ).to(device=device) @@ -203,9 +201,8 @@ def benchmark_mrope( num_kv_heads, head_dim, max_position, - rope_theta, is_neox_style, - str(rope_scaling), + str(rope_parameters), str(dtype).split(".")[-1], torch_stats["mean"], torch_stats["median"], @@ -255,9 +252,8 @@ if __name__ == "__main__": "num_kv_heads", "head_dim", "max_position", - "rope_theta", "is_neox_style", - "rope_scaling", + "rope_parameters", "dtype", "torch_mean", "torch_median", @@ -303,7 +299,7 @@ if __name__ == "__main__": q_size = num_heads * head_dim kv_size = num_kv_heads * head_dim is_neox_style = True - rope_theta = config.rope_theta + rope_parameters = config.rope_parameters max_position = config.max_position_embeddings for num_tokens in num_tokens_list: @@ -315,9 +311,8 @@ if __name__ == "__main__": num_heads=num_heads, num_kv_heads=num_kv_heads, max_position=max_position, - rope_theta=rope_theta, is_neox_style=is_neox_style, - rope_scaling=config.rope_scaling, + rope_parameters=rope_parameters, dtype=getattr(torch, args.dtype), seed=args.seed, warmup_iter=args.warmup_iter, diff --git a/benchmarks/kernels/benchmark_per_token_group_quant.py b/benchmarks/kernels/benchmark_per_token_group_quant.py index bdc1eb733084e..eba4d510258b6 100644 --- a/benchmarks/kernels/benchmark_per_token_group_quant.py +++ b/benchmarks/kernels/benchmark_per_token_group_quant.py @@ -30,8 +30,8 @@ def _time_cuda( fn() torch.cuda.synchronize() - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) + start = torch.Event(enable_timing=True) + end = torch.Event(enable_timing=True) start.record() for _ in range(bench_iters): diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py index a5887aafd30d6..de01ff197eab7 100644 --- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py +++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py @@ -253,8 +253,8 @@ def benchmark( ) torch.cuda.synchronize() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) # Benchmark latencies: list[float] = [] diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py index 29ce18234dfa0..1d0d6fbb9a470 100644 --- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py @@ -127,8 +127,8 @@ def benchmark_decode( def time_fn(fn, warmup=10, trials=20): torch.cuda.synchronize() - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) + start = torch.Event(enable_timing=True) + 
end = torch.Event(enable_timing=True) times = [] for i in range(warmup): fn() diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py index 2a25d03748112..84bde723abf7f 100644 --- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py @@ -139,8 +139,8 @@ def benchmark_prefill( def time_fn(fn, warmup=10, trials=20): torch.cuda.synchronize() - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) + start = torch.Event(enable_timing=True) + end = torch.Event(enable_timing=True) times = [] for i in range(warmup): fn() diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index ab54f81985bc2..b52500c8c5217 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -183,8 +183,8 @@ def benchmark_config( run() torch.cuda.synchronize() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) latencies: list[float] = [] for i in range(num_iters): diff --git a/benchmarks/multi_turn/README.md b/benchmarks/multi_turn/README.md index f5b5c6c97d484..b0be1e3a69a66 100644 --- a/benchmarks/multi_turn/README.md +++ b/benchmarks/multi_turn/README.md @@ -55,6 +55,10 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75 ---------------------------------------------------------------------------------------------------- ``` +If you run with `--warmup-step`, the summary will also include `warmup_runtime_sec` +and `total_runtime_incl_warmup_sec` (while `runtime_sec` continues to reflect the +benchmark-only runtime so the reported throughput stays comparable). + ### JSON configuration file for synthetic conversations generation The input flag `--input-file` is used to determine the input conversations for the benchmark.
diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py index 772d685ad90ff..e23f6b923f1b9 100644 --- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py +++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py @@ -1076,6 +1076,7 @@ def process_statistics( verbose: bool, gen_conv_args: GenConvArgs | None = None, excel_output: bool = False, + warmup_runtime_sec: float | None = None, ) -> None: if len(client_metrics) == 0: logger.info("No samples to process") @@ -1169,8 +1170,13 @@ def process_statistics( # Convert milliseconds to seconds runtime_sec = runtime_sec / 1000.0 requests_per_sec = float(len(df)) / runtime_sec - - params = {"runtime_sec": runtime_sec, "requests_per_sec": requests_per_sec} + params = { + "runtime_sec": runtime_sec, + "requests_per_sec": requests_per_sec, + } + if warmup_runtime_sec is not None: + params["warmup_runtime_sec"] = warmup_runtime_sec + params["total_runtime_incl_warmup_sec"] = runtime_sec + warmup_runtime_sec # Generate a summary of relevant metrics (and drop irrelevant data) df = df.drop(columns=exclude).describe(percentiles=percentiles).transpose() @@ -1552,6 +1558,8 @@ async def main() -> None: url=args.url, num_clients=args.num_clients, early_stop=not args.no_early_stop ) + warmup_runtime_sec: float | None = None + # Warm-up step if args.warmup_step: # Only send a single user prompt from every conversation. @@ -1566,26 +1574,56 @@ async def main() -> None: # all clients should finish their work before exiting warmup_bench_args = bench_args._replace(early_stop=False) - logger.info(f"{Color.PURPLE}Warmup start{Color.RESET}") + logger.info("%sWarmup start%s", Color.PURPLE, Color.RESET) + warmup_start_ns = time.perf_counter_ns() conversations, _ = await main_mp( warmup_client_args, req_args, warmup_bench_args, tokenizer, conversations ) - logger.info(f"{Color.PURPLE}Warmup done{Color.RESET}") + warmup_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - warmup_start_ns) + logger.info( + "%sWarmup runtime: %.3f sec (%.3f ms)%s", + Color.PURPLE, + warmup_runtime_sec, + warmup_runtime_sec * 1000, + Color.RESET, + ) + logger.info("%sWarmup done%s", Color.PURPLE, Color.RESET) # Run the benchmark - start_time = time.perf_counter_ns() + benchmark_start_ns = time.perf_counter_ns() client_convs, client_metrics = await main_mp( client_args, req_args, bench_args, tokenizer, conversations ) - total_runtime_ms = nanosec_to_millisec(time.perf_counter_ns() - start_time) + benchmark_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - benchmark_start_ns) # Calculate requests per second - total_runtime_sec = total_runtime_ms / 1000.0 - rps = len(client_metrics) / total_runtime_sec + requests_per_sec = len(client_metrics) / benchmark_runtime_sec + benchmark_runtime_ms = benchmark_runtime_sec * 1000.0 logger.info( - f"{Color.GREEN}All clients finished, total runtime: {total_runtime_sec:.3f} sec" - f" ({total_runtime_ms:.3f} ms), requests per second: {rps:.3f}{Color.RESET}" + "%sAll clients finished, benchmark runtime: %.3f sec (%.3f ms), " + "requests per second: %.3f%s", + Color.GREEN, + benchmark_runtime_sec, + benchmark_runtime_ms, + requests_per_sec, + Color.RESET, ) + if warmup_runtime_sec is not None: + total_runtime_sec = benchmark_runtime_sec + warmup_runtime_sec + logger.info( + "%sWarmup runtime: %.3f sec (%.3f ms)%s", + Color.GREEN, + warmup_runtime_sec, + warmup_runtime_sec * 1000, + Color.RESET, + ) + logger.info( + "%sTotal runtime (including warmup): %.3f sec (%.3f ms)%s", + 
Color.GREEN,
+            total_runtime_sec,
+            total_runtime_sec * 1000,
+            Color.RESET,
+        )
 
     # Benchmark parameters
     params = {
@@ -1610,6 +1648,7 @@ async def main() -> None:
         verbose=args.verbose,
         gen_conv_args=gen_conv_args,
         excel_output=args.excel_output,
+        warmup_runtime_sec=warmup_runtime_sec,
     )
 
     if args.output_file is not None:
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index aa84125818d10..fbbb03c5ed465 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -375,6 +375,7 @@ set(VLLM_EXT_SRC
 if (AVX512_FOUND AND NOT AVX512_DISABLED)
     set(VLLM_EXT_SRC
         "csrc/cpu/shm.cpp"
+        "csrc/cpu/cpu_wna16.cpp"
        ${VLLM_EXT_SRC})
     if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
         set(VLLM_EXT_SRC
diff --git a/cmake/external_projects/triton_kernels.cmake b/cmake/external_projects/triton_kernels.cmake
new file mode 100644
index 0000000000000..d35ad123dd9de
--- /dev/null
+++ b/cmake/external_projects/triton_kernels.cmake
@@ -0,0 +1,53 @@
+# Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels
+
+set(DEFAULT_TRITON_KERNELS_TAG "v3.5.0")
+
+# Set TRITON_KERNELS_SRC_DIR for local development with vLLM. We expect
+# TRITON_KERNELS_SRC_DIR to point directly at the triton_kernels python directory.
+if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
+  message(STATUS "[triton_kernels] Fetch from $ENV{TRITON_KERNELS_SRC_DIR}")
+  FetchContent_Declare(
+    triton_kernels
+    SOURCE_DIR $ENV{TRITON_KERNELS_SRC_DIR}
+  )
+
+else()
+  set(TRITON_GIT "https://github.com/triton-lang/triton.git")
+  message(STATUS "[triton_kernels] Fetch from ${TRITON_GIT}:${DEFAULT_TRITON_KERNELS_TAG}")
+  FetchContent_Declare(
+    triton_kernels
+    # TODO (varun) : Fetch just the triton_kernels directory from Triton
+    GIT_REPOSITORY https://github.com/triton-lang/triton.git
+    GIT_TAG ${DEFAULT_TRITON_KERNELS_TAG}
+    GIT_PROGRESS TRUE
+    SOURCE_SUBDIR python/triton_kernels/triton_kernels
+  )
+endif()
+
+# Fetch content
+FetchContent_MakeAvailable(triton_kernels)
+
+if (NOT triton_kernels_SOURCE_DIR)
+  message(FATAL_ERROR "[triton_kernels] Cannot resolve triton_kernels_SOURCE_DIR")
+endif()
+
+if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
+  set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/")
+else()
+  set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/python/triton_kernels/triton_kernels/")
+endif()
+
+message(STATUS "[triton_kernels] triton_kernels is available at ${TRITON_KERNELS_PYTHON_DIR}")
+
+add_custom_target(triton_kernels)
+
+# Ensure the vllm/third_party directory exists before installation
+install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/third_party/triton_kernels\")")
+
+## Copy .py files to the install directory.
+install(DIRECTORY
+  ${TRITON_KERNELS_PYTHON_DIR}
+  DESTINATION
+  vllm/third_party/triton_kernels/
+  COMPONENT triton_kernels
+  FILES_MATCHING PATTERN "*.py")
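As a quick sanity check of the vendoring above, here is a sketch under the assumption that a wheel built with this CMake rule installs the `*.py` files under `vllm/third_party/triton_kernels/`; the check itself is not part of the patch:

```python
# Hypothetical post-install check: the install(DIRECTORY ...) rule above should
# make the vendored package importable from the installed vllm tree.
import importlib.util

try:
    spec = importlib.util.find_spec("vllm.third_party.triton_kernels")
except ModuleNotFoundError:  # vllm itself is not installed
    spec = None
print("vendored triton_kernels present:", spec is not None)
```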
diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake
index 567c8959f0454..ff687e0af7b44 100644
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -38,7 +38,7 @@ else()
     FetchContent_Declare(
             vllm-flash-attn
             GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-            GIT_TAG 58e0626a692f09241182582659e3bf8f16472659
+            GIT_TAG 86f8f157cf82aa2342743752b97788922dd7de43
             GIT_PROGRESS TRUE
             # Don't share the vllm-flash-attn build between build types
             BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 0aa0dc14c7480..32960cc8073bb 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -552,7 +552,11 @@ __global__ void indexer_k_quant_and_cache_kernel(
 #ifndef USE_ROCM
   __syncwarp();
 #endif
+#if defined(__gfx942__)
+  float scale = fmaxf(amax, 1e-4) / 224.0f;
+#else
   float scale = fmaxf(amax, 1e-4) / 448.0f;
+#endif
   if (use_ue8m0) {
     scale = exp2f(ceilf(log2f(scale)));
   }
@@ -965,7 +969,9 @@ __global__ void gather_and_maybe_dequant_cache(
     }
   };
 
-  for (int pid = split_start; pid < full_blocks_end; ++pid) {
+  const auto loop_end =
+      std::min((int64_t)full_blocks_end, block_table_stride - offset);
+  for (int pid = split_start; pid < loop_end; ++pid) {
     auto block_id = batch_block_table[pid];
     auto block_start_ptr = src_cache + block_id * cache_block_stride;
     auto block_dst_ptr = dst + pid * block_size * dst_entry_stride;
@@ -976,12 +982,15 @@ __global__ void gather_and_maybe_dequant_cache(
   }
 
   if (partial_block_size) {
-    auto block_id = batch_block_table[full_blocks_end];
-    auto block_start_ptr = src_cache + block_id * cache_block_stride;
-    auto block_dst_ptr = dst + full_blocks_end * block_size * dst_entry_stride;
-    for (int eid = 0; eid < partial_block_size; ++eid) {
-      copy_entry(block_start_ptr + eid * cache_entry_stride,
-                 block_dst_ptr + eid * dst_entry_stride);
+    if (offset + full_blocks_end < block_table_stride) {
+      auto block_id = batch_block_table[full_blocks_end];
+      auto block_start_ptr = src_cache + block_id * cache_block_stride;
+      auto block_dst_ptr =
+          dst + full_blocks_end * block_size * dst_entry_stride;
+      for (int eid = 0; eid < partial_block_size; ++eid) {
+        copy_entry(block_start_ptr + eid * cache_entry_stride,
+                   block_dst_ptr + eid * dst_entry_stride);
+      }
     }
   }
 }
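Two details in the cache-kernel hunks above are easy to miss: on gfx942 the quantization scale divides by 224 instead of 448 (the smaller fp8 maximum used for that GPU's FNUZ fp8 format), and ue8m0 mode rounds the scale up to a power of two via `exp2f(ceilf(log2f(scale)))`. A Python rendering of the same arithmetic (a sketch, not part of the patch):

```python
import math

def indexer_k_scale(amax: float, use_ue8m0: bool, is_gfx942: bool) -> float:
    # fp8 max magnitude: 224.0 on gfx942 (FNUZ fp8), 448.0 (OCP e4m3) elsewhere.
    fp8_max = 224.0 if is_gfx942 else 448.0
    scale = max(amax, 1e-4) / fp8_max
    if use_ue8m0:
        # ue8m0 stores only a power-of-two exponent, so round the scale up.
        scale = 2.0 ** math.ceil(math.log2(scale))
    return scale

# Example: amax=10.0 gives scale ~0.0223, or 0.03125 (2**-5) under ue8m0.
print(indexer_k_scale(10.0, False, False), indexer_k_scale(10.0, True, False))
```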
diff --git a/csrc/cpu/cpu_attn.cpp b/csrc/cpu/cpu_attn.cpp
index 50f17c758c148..92f8bee5a47a0 100644
--- a/csrc/cpu/cpu_attn.cpp
+++ b/csrc/cpu/cpu_attn.cpp
@@ -13,6 +13,18 @@
   #define AMX_DISPATCH(...) case cpu_attention::ISA::AMX:
 #endif
 
+#ifdef __aarch64__
+  #include "cpu_attn_neon.hpp"
+  #define NEON_DISPATCH(...)                             \
+    case cpu_attention::ISA::NEON: {                     \
+      using attn_impl = cpu_attention::AttentionImpl<    \
+          cpu_attention::ISA::NEON, scalar_t, head_dim>; \
+      return __VA_ARGS__();                              \
+    }
+#else
+  #define NEON_DISPATCH(...) case cpu_attention::ISA::NEON:
+#endif  // #ifdef __aarch64__
+
 #define CPU_ATTN_DISPATCH_CASE(HEAD_DIM, ...) \
   case HEAD_DIM: {                            \
     constexpr size_t head_dim = HEAD_DIM;     \
@@ -41,6 +53,7 @@
   [&] {                                                          \
     switch (ISA_TYPE) {                                          \
       AMX_DISPATCH(__VA_ARGS__)                                  \
+      NEON_DISPATCH(__VA_ARGS__)                                 \
       case cpu_attention::ISA::VEC: {                            \
         using attn_impl =                                        \
             cpu_attention::AttentionImpl<cpu_attention::ISA::VEC,\
                                          scalar_t, head_dim>;    \
         return __VA_ARGS__();                                    \
       }                                                          \
diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp
--- a/csrc/cpu/cpu_attn_impl.hpp
+++ b/csrc/cpu/cpu_attn_impl.hpp
@@ -12,9 +11,10 @@
 #include "cpu_types.hpp"
 #include "scratchpad_manager.h"
 #include "cpu_attn_macros.h"
+#include "utils.hpp"
 
 namespace cpu_attention {
-enum class ISA { AMX, VEC, VEC16 };
+enum class ISA { AMX, VEC, VEC16, NEON };
 
 template <ISA isa, typename scalar_t, int64_t head_dim>
 class AttentionImpl {};
@@ -143,6 +143,12 @@ struct AttentionMetadata {
       case ISA::VEC:
         ss << "VEC, ";
         break;
+      case ISA::VEC16:
+        ss << "VEC16, ";
+        break;
+      case ISA::NEON:
+        ss << "NEON, ";
+        break;
     }
     ss << "workitem_group_num: " << workitem_group_num
        << ", reduction_item_num: " << reduction_item_num
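The new NEON header that follows widens K/V values to f32 on load. For bf16 on Arm hardware without native BF16 support, it relies on bf16 being exactly the upper 16 bits of the corresponding f32, so the conversion is a zero-extend plus a 16-bit left shift. A numpy sketch of that trick (not part of the patch):

```python
import numpy as np

def bf16_bits_to_f32(x_u16: np.ndarray) -> np.ndarray:
    # bf16 keeps the sign, exponent, and top 7 mantissa bits of an f32, so
    # shifting the 16-bit pattern into the high half reconstructs the value.
    return (x_u16.astype(np.uint32) << 16).view(np.float32)

f32 = np.array([1.5, -2.0, 3.1415927], dtype=np.float32)
bf16_bits = (f32.view(np.uint32) >> 16).astype(np.uint16)  # truncate to bf16
print(bf16_bits_to_f32(bf16_bits))  # ~[1.5, -2.0, 3.140625]
```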
diff --git a/csrc/cpu/cpu_attn_neon.hpp b/csrc/cpu/cpu_attn_neon.hpp
new file mode 100644
index 0000000000000..827f0cfbc718e
--- /dev/null
+++ b/csrc/cpu/cpu_attn_neon.hpp
@@ -0,0 +1,386 @@
+#ifndef CPU_ATTN_NEON_HPP
+#define CPU_ATTN_NEON_HPP
+
+#include "cpu_attn_impl.hpp"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+namespace cpu_attention {
+
+namespace {
+
+#define BLOCK_SIZE_ALIGNMENT 32
+#define HEAD_SIZE_ALIGNMENT 32
+#define MAX_Q_HEAD_NUM_PER_ITER 16
+
+// These do not use the vectorized classes for loading / converting
+// because csrc/cpu/cpu_types_arm.hpp does not have fallback options
+// for vec_op::BF16Vec* on Arm HW that doesn't support BF16.
+// We don't use vec_op::FP32Vec* or vec_op::FP16Vec* for consistency.
+template <typename kv_cache_t>
+FORCE_INLINE void load_row8_B_as_f32(const kv_cache_t* p, float32x4_t& b0,
+                                     float32x4_t& b1);
+
+template <>
+FORCE_INLINE void load_row8_B_as_f32<float>(const float* p, float32x4_t& b0,
+                                            float32x4_t& b1) {
+  b0 = vld1q_f32(p + 0);
+  b1 = vld1q_f32(p + 4);
+}
+
+template <>
+FORCE_INLINE void load_row8_B_as_f32<c10::Half>(const c10::Half* p,
+                                                float32x4_t& b0,
+                                                float32x4_t& b1) {
+  const float16_t* h = reinterpret_cast<const float16_t*>(p);
+  float16x8_t v = vld1q_f16(h);
+  b0 = vcvt_f32_f16(vget_low_f16(v));
+  b1 = vcvt_f32_f16(vget_high_f16(v));
+}
+
+template <>
+FORCE_INLINE void load_row8_B_as_f32<c10::BFloat16>(const c10::BFloat16* p,
+                                                    float32x4_t& b0,
+                                                    float32x4_t& b1) {
+  const uint16_t* u = reinterpret_cast<const uint16_t*>(p);
+#ifdef ARM_BF16_SUPPORT
+  uint16x8_t u0 = vld1q_u16(u);
+  bfloat16x8_t bf0 = vreinterpretq_bf16_u16(u0);
+  b0 = vcvtq_low_f32_bf16(bf0);
+  b1 = vcvtq_high_f32_bf16(bf0);
+#else
+  // bf16 is the upper half of an f32: widen to u32 and shift left by 16.
+  uint16x8_t x0 = vld1q_u16(u);
+  uint32x4_t lo = vshlq_n_u32(vmovl_u16(vget_low_u16(x0)), 16);
+  uint32x4_t hi = vshlq_n_u32(vmovl_u16(vget_high_u16(x0)), 16);
+  b0 = vreinterpretq_f32_u32(lo);
+  b1 = vreinterpretq_f32_u32(hi);
+#endif
+}
+
+// Mx8, with 1 <= M <= 8, K streamed, unroll-by-4 with NEON FMLAs
+// #Loads = (K // 4) * (M + 4 * sizeof(kv_cache_t) / 2)
+// #FMLAs = (K // 4) * (4 * 2 * M)
+// We have (4 * 2 * M) FMLAs for (M + 4 * sizeof(kv_cache_t) / 2) loads
+template <int32_t M, typename kv_cache_t>
+FORCE_INLINE void gemm_micro_neon_fmla_Mx8_Ku4(
+    const float* __restrict A,       // [M x K],
+    const kv_cache_t* __restrict B,  // [K x 8],
+    float* __restrict C,             // [M x 8],
+    int64_t lda, int64_t ldb, int64_t ldc, int32_t K, bool accumulate) {
+  // kernel supports max M of 8, as it'd spill for larger M
+  static_assert(1 <= M && M <= 8, "M must be in [1,8]");
+
+// helpers for per-M codegen
+#define ROWS_APPLY(OP) OP(0) OP(1) OP(2) OP(3) OP(4) OP(5) OP(6) OP(7)
+#define IF_M(i) if constexpr (M > (i))
+
+  // A row base pointers
+#define DECL_A(i) const float* a##i = A + (i) * lda;
+  ROWS_APPLY(DECL_A)
+#undef DECL_A
+
+  // declare 2 accumulators per row of M
+#define DECL_ACC(i) float32x4_t acc##i##_0, acc##i##_1;
+  ROWS_APPLY(DECL_ACC)
+#undef DECL_ACC
+
+  // initialize accumulators
+#define INIT_ACC(i)                              \
+  IF_M(i) {                                      \
+    if (accumulate) {                            \
+      acc##i##_0 = vld1q_f32(C + (i) * ldc + 0); \
+      acc##i##_1 = vld1q_f32(C + (i) * ldc + 4); \
+    } else {                                     \
+      acc##i##_0 = vdupq_n_f32(0.f);             \
+      acc##i##_1 = vdupq_n_f32(0.f);             \
+    }                                            \
+  }
+  ROWS_APPLY(INIT_ACC)
+#undef INIT_ACC
+
+  int32_t k = 0;
+
+  // K unrolled by 4
+  for (; k + 3 < K; k += 4) {
+    // load A[k..k+3] for each active row (M)
+#define LOAD_A4(i)     \
+  float32x4_t a##i##v; \
+  IF_M(i) a##i##v = vld1q_f32(a##i + k);
+    ROWS_APPLY(LOAD_A4)
+#undef LOAD_A4
+
+    // helper: FMA lane L from aiv
+#define FMAS_LANE(i, aiv, L)                              \
+  IF_M(i) {                                               \
+    acc##i##_0 = vfmaq_laneq_f32(acc##i##_0, b0, aiv, L); \
+    acc##i##_1 = vfmaq_laneq_f32(acc##i##_1, b1, aiv, L); \
+  }
+
+    // k + 0
+    {
+      float32x4_t b0, b1;
+      load_row8_B_as_f32(B + (int64_t)(k + 0) * ldb, b0, b1);
+#define STEP_K0(i) FMAS_LANE(i, a##i##v, 0)
+      ROWS_APPLY(STEP_K0)
+#undef STEP_K0
+    }
+    // k + 1
+    {
+      float32x4_t b0, b1;
+      load_row8_B_as_f32(B + (int64_t)(k + 1) * ldb, b0, b1);
+#define STEP_K1(i) FMAS_LANE(i, a##i##v, 1)
+      ROWS_APPLY(STEP_K1)
+#undef STEP_K1
+    }
+    // k + 2
+    {
+      float32x4_t b0, b1;
+      load_row8_B_as_f32(B + (int64_t)(k + 2) * ldb, b0, b1);
+#define STEP_K2(i) FMAS_LANE(i, a##i##v, 2)
+      ROWS_APPLY(STEP_K2)
+#undef STEP_K2
+    }
+    // k + 3
+    {
+      float32x4_t b0, b1;
+      load_row8_B_as_f32(B + (int64_t)(k + 3) * ldb, b0, b1);
+#define STEP_K3(i) FMAS_LANE(i, a##i##v, 3)
+      ROWS_APPLY(STEP_K3)
+#undef STEP_K3
+    }
+#undef FMAS_LANE
+  }
+
+  // K tail
+  for (; k < K; ++k) {
+    float32x4_t b0, b1;
+    load_row8_B_as_f32(B + (int64_t)k * ldb, b0, b1);
+#define TAIL_ROW(i)                             \
+  IF_M(i) {                                     \
+    float32x4_t ai = vdupq_n_f32(*(a##i + k));  \
+    acc##i##_0 = vfmaq_f32(acc##i##_0, b0, ai); \
+    acc##i##_1 = vfmaq_f32(acc##i##_1, b1, ai); \
+  }
+    ROWS_APPLY(TAIL_ROW)
+#undef TAIL_ROW
+  }
+
+  // store accumulators to C
+#define STORE_ROW(i)                          \
+  IF_M(i) {                                   \
+    vst1q_f32(C + (i) * ldc + 0, acc##i##_0); \
+    vst1q_f32(C + (i) * ldc + 4, acc##i##_1); \
+  }
+  ROWS_APPLY(STORE_ROW)
+#undef STORE_ROW
+
+#undef ROWS_APPLY
+#undef IF_M
+}
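+
+// The macro kernel below covers the general [M x N] tile: it walks N in fixed
+// steps of 8 columns and splits M into blocks of 8, 4, 2, or 1 rows, so every
+// step lands on one of the Mx8 micro-kernel instantiations above.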
+
+template <int32_t N, typename kv_cache_t>
+FORCE_INLINE void gemm_macro_neon_fmla_Mx8_Ku4(const float* __restrict A,
+                                               const kv_cache_t* __restrict B,
+                                               float* __restrict C, int32_t M,
+                                               int32_t K, int64_t lda,
+                                               int64_t ldb, int64_t ldc,
+                                               bool accumulate) {
+  // micro kernel is Mx8
+  static_assert(N % 8 == 0, "N must be a multiple of 8");
+  for (int32_t m = 0; m < M;) {
+    int32_t mb = (M - m >= 8) ? 8 : (M - m >= 4) ? 4 : (M - m >= 2) ? 2 : 1;
+    const float* Ab = A + m * lda;
+    float* Cb = C + m * ldc;
+
+    for (int32_t n = 0; n < N; n += 8) {
+      const kv_cache_t* Bn = B + n;
+      float* Cn = Cb + n;
+      switch (mb) {
+        case 8:
+          gemm_micro_neon_fmla_Mx8_Ku4<8, kv_cache_t>(Ab, Bn, Cn, lda, ldb,
+                                                      ldc, K, accumulate);
+          break;
+        case 4:
+          gemm_micro_neon_fmla_Mx8_Ku4<4, kv_cache_t>(Ab, Bn, Cn, lda, ldb,
+                                                      ldc, K, accumulate);
+          break;
+        case 2:
+          gemm_micro_neon_fmla_Mx8_Ku4<2, kv_cache_t>(Ab, Bn, Cn, lda, ldb,
+                                                      ldc, K, accumulate);
+          break;
+        default:
+          gemm_micro_neon_fmla_Mx8_Ku4<1, kv_cache_t>(Ab, Bn, Cn, lda, ldb,
+                                                      ldc, K, accumulate);
+          break;
+      }
+    }
+    // no tail loop for N as it's guaranteed to be a multiple of 8
+    m += mb;
+  }
+}
+
+template <int32_t N, int32_t k_size, typename kv_cache_t>
+class TileGemmNeonFMLA {
+ public:
+  template <AttentionGemmPhase phase>
+  FORCE_INLINE static void gemm(const int32_t m_size,
+                                float* __restrict__ a_tile,
+                                kv_cache_t* __restrict__ b_tile,
+                                float* __restrict__ c_tile, const int64_t lda,
+                                const int64_t ldb, const int64_t ldc,
+                                const int32_t block_size,
+                                const int32_t dynamic_k_size,
+                                const bool accum_c) {
+    if constexpr (phase == AttentionGemmPhase::QK) {
+      gemm_macro_neon_fmla_Mx8_Ku4<N, kv_cache_t>(
+          a_tile, b_tile, c_tile, m_size, k_size, lda, ldb, ldc, accum_c);
+    } else {
+      gemm_macro_neon_fmla_Mx8_Ku4<N, kv_cache_t>(
+          a_tile, b_tile, c_tile, m_size, dynamic_k_size, lda, ldb, ldc,
+          accum_c);
+    }
+  }
+};
+
+}  // namespace
+
+// this is similar to "ISA::VEC" at the moment
+template <typename scalar_t, int64_t head_dim>
+class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
+ public:
+  using query_t = scalar_t;
+  using q_buffer_t = float;
+  using kv_cache_t = scalar_t;
+  using logits_buffer_t = float;
+  using partial_output_buffer_t = float;
+  using prob_buffer_t = float;
+
+  constexpr static int64_t BlockSizeAlignment =
+      BLOCK_SIZE_ALIGNMENT;  // KV token num unit of QK and PV phases
+  constexpr static int64_t HeadDimAlignment =
+      HEAD_SIZE_ALIGNMENT;  // head-dim unit of the PV phase
+  constexpr static int64_t MaxQHeadNumPerIteration = MAX_Q_HEAD_NUM_PER_ITER;
+  constexpr static int64_t HeadDim = head_dim;
+  constexpr static ISA ISAType = ISA::NEON;
+  constexpr static bool scale_on_logits = false;  // apply scale on q_buffer
+
+  static_assert(HeadDim % HeadDimAlignment == 0);
+  // the gemm micro kernel is Mx8
+  static_assert(HeadDimAlignment % 8 == 0);
+  static_assert(BlockSizeAlignment % 8 == 0);
+
+ public:
+  template