diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh index 56bb5cedaa0a9..df805e0850806 100755 --- a/.buildkite/scripts/annotate-release.sh +++ b/.buildkite/scripts/annotate-release.sh @@ -23,8 +23,8 @@ To download the wheel (by version): aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl . aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl . -aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl . aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl . +aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl . \`\`\` To download and upload the image: @@ -45,9 +45,10 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 docker push vllm/vllm-openai:latest-aarch64 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 -docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend -docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend +docker manifest rm vllm/vllm-openai:latest +docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 +docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 docker manifest push vllm/vllm-openai:latest docker manifest push vllm/vllm-openai:v${RELEASE_VERSION} \`\`\` -EOF \ No newline at end of file +EOF diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh new file mode 100755 index 0000000000000..d0036f24c8d04 --- /dev/null +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# This script build the CPU docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# allow to bind to different cores +CORE_RANGE=${CORE_RANGE:-0-16} +OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16} +NUMA_NODE=${NUMA_NODE:-0} + +export CMAKE_BUILD_PARALLEL_LEVEL=32 + +# Setup cleanup +remove_docker_container() { + set -e; + docker rm -f cpu-test-"$NUMA_NODE" || true; +} +trap remove_docker_container EXIT +remove_docker_container + +# Try building the docker image +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu . + +# Run the image, setting --shm-size=4g for tensor parallel. 
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" + +function cpu_tests() { + set -e + export NUMA_NODE=$2 + + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pip list" + + # offline inference + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" + + # Run kernel tests + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pytest -x -v -s tests/kernels/test_onednn.py + pytest -x -v -s tests/kernels/attention/test_cpu_attn.py" + + # basic online serving + docker exec cpu-test-"$NUMA_NODE" bash -c ' + set -e + VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve meta-llama/Llama-3.2-3B-Instruct --max-model-len 2048 & + server_pid=$! + timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 + vllm bench serve \ + --backend vllm \ + --dataset-name random \ + --model meta-llama/Llama-3.2-3B-Instruct \ + --num-prompts 20 \ + --endpoint /v1/completions + kill -s SIGTERM $server_pid &' +} + +# All of CPU tests are expected to be finished less than 40 mins. +export -f cpu_tests +timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 7479c43977d78..2267718f75ca5 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -73,12 +73,11 @@ function cpu_tests() { pytest -x -s -v \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs" - # Note: disable it until supports V1 - # Run AWQ test - # docker exec cpu-test-"$NUMA_NODE" bash -c " - # set -e - # pytest -x -s -v \ - # tests/quantization/test_ipex_quant.py" + # Run AWQ/GPTQ test + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pytest -x -s -v \ + tests/quantization/test_cpu_wna16.py" # Run multi-lora tests docker exec cpu-test-"$NUMA_NODE" bash -c " diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 2471b509a9fff..4e2ff5c5a6bd5 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -61,7 +61,7 @@ steps: - pytest -v -s -m 'not cpu_test' multimodal - pytest -v -s utils_ -- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins +- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins timeout_in_minutes: 10 mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 @@ -73,6 +73,7 @@ steps: - tests/multimodal - tests/standalone_tests/lazy_imports.py - tests/transformers_utils + - tests/config no_gpu: true commands: - python3 standalone_tests/lazy_imports.py @@ -80,6 +81,7 @@ steps: - pytest -v -s test_outputs.py - pytest -v -s -m 'cpu_test' multimodal - pytest -v -s transformers_utils + - pytest -v -s config - label: Python-only Installation Test # 10min timeout_in_minutes: 20 @@ -187,7 +189,7 @@ steps: - tests/distributed/test_utils - tests/distributed/test_pynccl - tests/distributed/test_events - - tests/compile/test_basic_correctness + - tests/compile/fullgraph/test_basic_correctness.py - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py - tests/examples/offline_inference/data_parallel.py @@ -215,7 +217,7 @@ steps: - TP_SIZE=1 DP_SIZE=4 pytest 
-v -s v1/distributed/test_hybrid_lb_dp.py - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s distributed/test_utils.py - - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_events.py - pytest -v -s distributed/test_symm_mem_allreduce.py @@ -390,6 +392,15 @@ steps: commands: - pytest -v -s v1/attention +- label: V1 Test attention (B200) # 10min + timeout_in_minutes: 30 + gpu: b200 + source_file_dependencies: + - vllm/v1/attention + - tests/v1/attention + commands: + - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this + - label: V1 Test others (CPU) # 5 mins mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 @@ -493,17 +504,12 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_pass_manager.py - - pytest -v -s compile/test_fusion.py - - pytest -v -s compile/test_fusion_attn.py - - pytest -v -s compile/test_functionalization.py - - pytest -v -s compile/test_silu_mul_quant_fusion.py - # - pytest -v -s compile/test_sequence_parallelism.py - # - pytest -v -s compile/test_async_tp.py - - pytest -v -s compile/test_fusion_all_reduce.py - - pytest -v -s compile/test_decorator.py - - pytest -v -s compile/test_noop_elimination.py - - pytest -v -s compile/test_aot_compile.py + # Run unit tests defined directly under compile/, + # not including subdirectories, which are usually heavier + # tests covered elsewhere. + # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -515,9 +521,11 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_basic_correctness.py - - pytest -v -s compile/test_multimodal_compile.py - - pytest -v -s compile/piecewise/ + # Run smoke tests under fullgraph directory, except test_full_graph.py + # as it is a heavy test that is covered in other steps. 
+ # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Test # 27min timeout_in_minutes: 40 @@ -529,10 +537,10 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' # Limit to no custom ops to reduce running time # Wrap with quotes to escape yaml and avoid starting -k string with a - - - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'" + - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" - label: Cudagraph test timeout_in_minutes: 20 @@ -697,7 +705,7 @@ steps: - vllm/model_executor/models/whisper.py commands: # LMEval # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442 - - pytest -s entrypoints/openai/correctness/ --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py + - pytest -s entrypoints/openai/correctness/ - label: OpenAI-Compatible Tool Use # 23 min timeout_in_minutes: 35 @@ -998,12 +1006,12 @@ steps: optional: true commands: - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py + - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' - pytest -v -s tests/models/test_transformers.py - - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py + # - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - python3 examples/offline_inference/basic/chat.py - - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper @@ -1048,7 +1056,7 @@ steps: - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_flashinfer.py -- label: Blackwell Fusion Tests # 30 min +- label: Blackwell Fusion and Compile Tests # 30 min timeout_in_minutes: 40 working_dir: "/vllm-workspace/" gpu: b200 @@ -1066,10 +1074,12 @@ steps: - pytest -v -s tests/compile/test_fusion_attn.py - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py # this runner has 2 GPUs available even though num_gpus=2 is not set - - pytest -v -s tests/compile/test_fusion_all_reduce.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # Wrap with quotes to escape yaml - - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'" + - "pytest -v -s 
tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + - pytest -v -s tests/compile/distributed/test_full_graph.py::test_fp8_kv_scale_compile - label: Blackwell Fusion E2E Tests # 30 min timeout_in_minutes: 40 @@ -1086,20 +1096,18 @@ steps: - vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/test_fusions_e2e.py - - tests/compile/test_full_graph.py + - tests/compile/distributed/test_fusions_e2e.py + - tests/compile/fullgraph/test_full_graph.py commands: - nvidia-smi # Run all e2e fusion tests - pytest -v -s tests/compile/test_fusions_e2e.py - # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile - label: ROCm GPT-OSS Eval timeout_in_minutes: 60 working_dir: "/vllm-workspace/" agent_pool: mi325_1 - mirror_hardwares: [amdproduction] + mirror_hardwares: [amdexperimental, amdproduction] optional: true # run on nightlies source_file_dependencies: - tests/evals/gpt_oss @@ -1198,7 +1206,7 @@ steps: - vllm/worker/worker_base.py - vllm/v1/engine/ - vllm/v1/worker/ - - tests/compile/test_basic_correctness.py + - tests/compile/fullgraph/test_basic_correctness.py - tests/compile/test_wrapper.py - tests/distributed/ - tests/entrypoints/llm/test_collective_rpc.py @@ -1211,7 +1219,7 @@ steps: - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py - - pytest -v -s ./compile/test_basic_correctness.py + - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' @@ -1326,7 +1334,7 @@ steps: - vllm/ - tests/weight_loading commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt - label: Weight Loading Multiple GPU Test - Large Models # optional mirror_hardwares: [amdexperimental] @@ -1334,13 +1342,12 @@ steps: # grade: Blocking working_dir: "/vllm-workspace/tests" num_gpus: 2 - gpu: a100 optional: true source_file_dependencies: - vllm/ - tests/weight_loading commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt - label: NixlConnector PD accuracy tests (Distributed) # 30min mirror_hardwares: [amdexperimental] @@ -1417,10 +1424,12 @@ steps: working_dir: "/vllm-workspace/" num_gpus: 2 commands: - - pytest -v -s tests/compile/test_async_tp.py - - pytest -v -s tests/compile/test_sequence_parallelism.py - - pytest -v -s tests/compile/test_fusion_all_reduce.py - - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + - pytest -v -s tests/compile/distributed/test_async_tp.py + - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py + - pytest -v -s 
tests/compile/distributed/test_fusion_all_reduce.py + #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" + - pytest -v -s tests/compile/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - pytest -v -s tests/v1/distributed/test_dbo.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 4ac76aba67b9c..6169b279dc8a4 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -167,7 +167,7 @@ steps: - tests/distributed/test_utils - tests/distributed/test_pynccl - tests/distributed/test_events - - tests/compile/test_basic_correctness + - tests/compile/fullgraph/test_basic_correctness.py - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py - tests/examples/offline_inference/data_parallel.py @@ -197,7 +197,7 @@ steps: - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s distributed/test_utils.py - - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_events.py - pytest -v -s distributed/test_symm_mem_allreduce.py @@ -445,18 +445,12 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_graph_partition.py - - pytest -v -s compile/test_config.py - - pytest -v -s compile/test_pass_manager.py - - pytest -v -s compile/test_fusion.py - - pytest -v -s compile/test_fusion_attn.py - - pytest -v -s compile/test_functionalization.py - - pytest -v -s compile/test_silu_mul_quant_fusion.py - - pytest -v -s compile/test_fusion_all_reduce.py - - pytest -v -s compile/test_decorator.py - - pytest -v -s compile/test_noop_elimination.py - - pytest -v -s compile/test_aot_compile.py - - pytest -v -s compile/test_qk_norm_rope_fusion.py + # Run unit tests defined directly under compile/, + # not including subdirectories, which are usually heavier + # tests covered elsewhere. + # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -466,9 +460,11 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_basic_correctness.py - - pytest -v -s compile/test_multimodal_compile.py - - pytest -v -s compile/piecewise/ + # Run smoke tests under fullgraph directory, except test_full_graph.py + # as it is a heavy test that is covered in other steps. 
+ # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Test # 27min timeout_in_minutes: 40 @@ -479,10 +475,10 @@ steps: - tests/compile commands: # fp8 kv scales not supported on sm89, tested on Blackwell instead - - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' # Limit to no custom ops to reduce running time # Wrap with quotes to escape yaml and avoid starting -k string with a - - - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" + - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" - label: Cudagraph test timeout_in_minutes: 20 @@ -554,6 +550,25 @@ steps: commands: - pytest -v -s kernels/mamba +- label: Kernels DeepGEMM Test (H100) + timeout_in_minutes: 45 + gpu: h100 + num_gpus: 1 + source_file_dependencies: + - tools/install_deepgemm.sh + - vllm/utils/deep_gemm.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization + - tests/kernels/quantization/test_block_fp8.py + - tests/kernels/moe/test_deepgemm.py + - tests/kernels/moe/test_batched_deepgemm.py + - tests/kernels/attention/test_deepgemm_attention.py + commands: + - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm + - pytest -v -s kernels/moe/test_deepgemm.py + - pytest -v -s kernels/moe/test_batched_deepgemm.py + - pytest -v -s kernels/attention/test_deepgemm_attention.py + - label: Model Executor Test # 23min timeout_in_minutes: 35 torch_nightly: true @@ -876,12 +891,12 @@ steps: optional: true commands: - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' + - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)' - pytest -v -s tests/models/test_transformers.py # - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' + - pytest -v -s tests/models/multimodal/test_mapping.py - python3 examples/offline_inference/basic/chat.py - # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper @@ -925,6 +940,7 @@ steps: - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_flashinfer.py + - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - label: Blackwell Fusion and Compile Tests # 30 min timeout_in_minutes: 40 @@ -934,22 +950,29 @@ steps: - csrc/quantization/fp4/ - 
vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/worker/ + - vllm/v1/cudagraph_dispatcher.py - vllm/compilation/ # can affect pattern matching - vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/test_fusion_attn.py + - tests/compile/test_silu_mul_quant_fusion.py + - tests/compile/distributed/test_fusion_all_reduce.py + - tests/compile/distributed/test_fusions_e2e.py + - tests/compile/fullgraph/test_full_graph.py commands: - nvidia-smi - pytest -v -s tests/compile/test_fusion_attn.py - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py # this runner has 2 GPUs available even though num_gpus=2 is not set - - pytest -v -s tests/compile/test_fusion_all_reduce.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # Wrap with quotes to escape yaml - - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile + - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - label: Blackwell Fusion E2E Tests # 30 min timeout_in_minutes: 40 @@ -966,12 +989,11 @@ steps: - vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/test_fusions_e2e.py - - tests/compile/test_full_graph.py + - tests/compile/distributed/test_fusions_e2e.py commands: - nvidia-smi # Run all e2e fusion tests - - pytest -v -s tests/compile/test_fusions_e2e.py + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py - label: Blackwell GPT-OSS Eval timeout_in_minutes: 60 @@ -1069,7 +1091,7 @@ steps: - vllm/worker/worker_base.py - vllm/v1/engine/ - vllm/v1/worker/ - - tests/compile/test_basic_correctness.py + - tests/compile/fullgraph/test_basic_correctness.py - tests/compile/test_wrapper.py - tests/distributed/ - tests/entrypoints/llm/test_collective_rpc.py @@ -1084,7 +1106,7 @@ steps: - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py - - pytest -v -s ./compile/test_basic_correctness.py + - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' @@ -1264,10 +1286,10 @@ steps: working_dir: "/vllm-workspace/" num_gpus: 2 commands: - - pytest -v -s tests/compile/test_async_tp.py - - pytest -v -s tests/compile/test_sequence_parallelism.py - - pytest -v -s tests/compile/test_fusion_all_reduce.py - - "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'" + - pytest -v -s tests/compile/distributed/test_async_tp.py + - pytest -v -s 
tests/compile/distributed/test_sequence_parallelism.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml index 42b05ecd5ac06..a183033c9adde 100644 --- a/.github/workflows/macos-smoke-test.yml +++ b/.github/workflows/macos-smoke-test.yml @@ -9,7 +9,7 @@ on: jobs: macos-m1-smoke-test: runs-on: macos-latest - timeout-minutes: 20 + timeout-minutes: 30 steps: - uses: actions/checkout@v4 @@ -37,15 +37,14 @@ jobs: - name: Verify installation run: | python -c "import vllm; print(f'vLLM version: {vllm.__version__}')" - python -c "import torch; print(f'PyTorch: {torch.__version__}')" - name: Smoke test vllm serve - timeout-minutes: 10 run: | # Start server in background vllm serve Qwen/Qwen3-0.6B \ - --max-model-len=2048 \ + --max-model-len=2K \ --load-format=dummy \ + --hf-overrides '{"num_hidden_layers": 2}' \ --enforce-eager \ --port 8000 & diff --git a/.gitignore b/.gitignore index 50070d7898fe6..7cda86478664f 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,9 @@ # vllm-flash-attn built from source vllm/vllm_flash_attn/* +# OpenAI triton kernels copied from source +vllm/third_party/triton_kernels/* + # triton jit .triton diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a37040edbf1a..a4cf51d17e982 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -307,7 +307,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. - set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use") + set(CUTLASS_REVISION "v4.2.1") # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) @@ -512,9 +512,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. 
CUTLASS 3.x) # require CUDA 12.8 or later if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") else() - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) set(SRCS @@ -619,9 +619,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # FP4 Archs and flags if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) - cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") else() - cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) set(SRCS @@ -695,7 +695,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") else() - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu") @@ -741,9 +741,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") else() - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu") @@ -1030,6 +1030,11 @@ if(VLLM_GPU_LANG STREQUAL "HIP") WITH_SOABI) endif() +# For CUDA and HIP builds also build the triton_kernels external package. +if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP") + include(cmake/external_projects/triton_kernels.cmake) +endif() + # For CUDA we also build and ship some external projects. 
if (VLLM_GPU_LANG STREQUAL "CUDA") include(cmake/external_projects/flashmla.cmake) diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index 904f805349148..d072c03c440b2 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -5,11 +5,12 @@ import argparse import asyncio import logging import os +import time +import uuid +from urllib.parse import urlparse import aiohttp from quart import Quart, Response, make_response, request -from rate_limiter import RateLimiter -from request_queue import RequestQueue # Configure logging logging.basicConfig(level=logging.INFO) @@ -24,26 +25,8 @@ def parse_args(): parser.add_argument( "--timeout", type=float, - default=300, - help="Timeout for backend service requests in seconds (default: 300)", - ) - parser.add_argument( - "--max-concurrent", - type=int, - default=100, - help="Maximum concurrent requests to backend services (default: 100)", - ) - parser.add_argument( - "--queue-size", - type=int, - default=500, - help="Maximum number of requests in the queue (default: 500)", - ) - parser.add_argument( - "--rate-limit", - type=int, - default=40, - help="Maximum requests per second (default: 40)", + default=6 * 60 * 60, + help="Timeout for backend service requests in seconds (default: 21600)", ) parser.add_argument( "--port", @@ -54,14 +37,32 @@ def parse_args(): parser.add_argument( "--prefill-url", type=str, - default="http://localhost:8100/v1/completions", - help="Prefill service endpoint URL", + default="http://localhost:8100", + help="Prefill service base URL (protocol + host[:port])", ) parser.add_argument( "--decode-url", type=str, - default="http://localhost:8200/v1/completions", - help="Decode service endpoint URL", + default="http://localhost:8200", + help="Decode service base URL (protocol + host[:port])", + ) + parser.add_argument( + "--kv-host", + type=str, + default="localhost", + help="Hostname or IP used by KV transfer (default: localhost)", + ) + parser.add_argument( + "--prefill-kv-port", + type=int, + default=14579, + help="Prefill KV port (default: 14579)", + ) + parser.add_argument( + "--decode-kv-port", + type=int, + default=14580, + help="Decode KV port (default: 14580)", ) return parser.parse_args() @@ -73,70 +74,129 @@ def main(): # Initialize configuration using command line parameters AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout) - MAX_CONCURRENT_REQUESTS = args.max_concurrent - REQUEST_QUEUE_SIZE = args.queue_size - RATE_LIMIT = args.rate_limit PREFILL_SERVICE_URL = args.prefill_url DECODE_SERVICE_URL = args.decode_url PORT = args.port + PREFILL_KV_ADDR = f"{args.kv_host}:{args.prefill_kv_port}" + DECODE_KV_ADDR = f"{args.kv_host}:{args.decode_kv_port}" + + logger.info( + "Proxy resolved KV addresses -> prefill: %s, decode: %s", + PREFILL_KV_ADDR, + DECODE_KV_ADDR, + ) + app = Quart(__name__) - # Initialize the rate limiter and request queue - rate_limiter = RateLimiter(RATE_LIMIT) - request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE) - - # Attach the configuration object to the application instance + # Attach the configuration object to the application instance so helper + # coroutines can read the resolved backend URLs and timeouts without using + # globals. 
app.config.update( { "AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT, - "rate_limiter": rate_limiter, - "request_queue": request_queue, "PREFILL_SERVICE_URL": PREFILL_SERVICE_URL, "DECODE_SERVICE_URL": DECODE_SERVICE_URL, + "PREFILL_KV_ADDR": PREFILL_KV_ADDR, + "DECODE_KV_ADDR": DECODE_KV_ADDR, } ) - # Start queue processing on app startup - @app.before_serving - async def startup(): - """Start request processing task when app starts serving""" - asyncio.create_task(request_queue.process()) + def _normalize_base_url(url: str) -> str: + """Remove any trailing slash so path joins behave predictably.""" + return url.rstrip("/") - async def forward_request(url, data): - """Forward request to backend service with rate limiting and error handling""" - headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + def _get_host_port(url: str) -> str: + """Return the hostname:port portion for logging and KV headers.""" + parsed = urlparse(url) + host = parsed.hostname or "localhost" + port = parsed.port + if port is None: + port = 80 if parsed.scheme == "http" else 443 + return f"{host}:{port}" - # Use rate limiter as context manager - async with ( - rate_limiter, - aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, - ): - try: - async with session.post( - url=url, json=data, headers=headers - ) as response: - if response.status == 200: - # Stream response chunks - async for chunk_bytes in response.content.iter_chunked(1024): - yield chunk_bytes - else: - # Handle backend service errors - error_text = await response.text() - logger.error( - "Backend service error: %s - %s", - response.status, - error_text, - ) - yield b'{"error": "Backend service error"}' - except aiohttp.ClientError as e: - # Handle connection errors - logger.error("Connection error to %s: %s", url, str(e)) - yield b'{"error": "Service unavailable"}' - except asyncio.TimeoutError: - # Handle timeout errors - logger.error("Timeout connecting to %s", url) - yield b'{"error": "Service timeout"}' + PREFILL_BASE = _normalize_base_url(PREFILL_SERVICE_URL) + DECODE_BASE = _normalize_base_url(DECODE_SERVICE_URL) + KV_TARGET = _get_host_port(DECODE_SERVICE_URL) + + def _build_headers(request_id: str) -> dict[str, str]: + """Construct the headers expected by vLLM's P2P disagg connector.""" + headers: dict[str, str] = {"X-Request-Id": request_id, "X-KV-Target": KV_TARGET} + api_key = os.environ.get("OPENAI_API_KEY") + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + return headers + + async def _run_prefill( + request_path: str, + payload: dict, + headers: dict[str, str], + request_id: str, + ): + url = f"{PREFILL_BASE}{request_path}" + start_ts = time.perf_counter() + logger.info("[prefill] start request_id=%s url=%s", request_id, url) + try: + async with ( + aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, + session.post(url=url, json=payload, headers=headers) as resp, + ): + if resp.status != 200: + error_text = await resp.text() + raise RuntimeError( + f"Prefill backend error {resp.status}: {error_text}" + ) + await resp.read() + logger.info( + "[prefill] done request_id=%s status=%s elapsed=%.2fs", + request_id, + resp.status, + time.perf_counter() - start_ts, + ) + except asyncio.TimeoutError as exc: + raise RuntimeError(f"Prefill service timeout at {url}") from exc + except aiohttp.ClientError as exc: + raise RuntimeError(f"Prefill service unavailable at {url}") from exc + + async def _stream_decode( + request_path: str, + payload: dict, + headers: dict[str, str], + request_id: str, + ): + url = 
f"{DECODE_BASE}{request_path}" + # Stream tokens from the decode service once the prefill stage has + # materialized KV caches on the target workers. + logger.info("[decode] start request_id=%s url=%s", request_id, url) + try: + async with ( + aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, + session.post(url=url, json=payload, headers=headers) as resp, + ): + if resp.status != 200: + error_text = await resp.text() + logger.error( + "Decode backend error %s - %s", resp.status, error_text + ) + err_msg = ( + '{"error": "Decode backend error ' + str(resp.status) + '"}' + ) + yield err_msg.encode() + return + logger.info( + "[decode] streaming response request_id=%s status=%s", + request_id, + resp.status, + ) + async for chunk_bytes in resp.content.iter_chunked(1024): + yield chunk_bytes + logger.info("[decode] finished streaming request_id=%s", request_id) + except asyncio.TimeoutError: + logger.error("Decode service timeout at %s", url) + yield b'{"error": "Decode service timeout"}' + except aiohttp.ClientError as exc: + logger.error("Decode service error at %s: %s", url, exc) + yield b'{"error": "Decode service unavailable"}' async def process_request(): """Process a single request through prefill and decode stages""" @@ -146,13 +206,27 @@ def main(): # Create prefill request (max_tokens=1) prefill_request = original_request_data.copy() prefill_request["max_tokens"] = 1 + if "max_completion_tokens" in prefill_request: + prefill_request["max_completion_tokens"] = 1 # Execute prefill stage - async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request): - continue + # The request id encodes both KV socket addresses so the backend can + # shuttle tensors directly via NCCL once the prefill response + # completes. + request_id = ( + f"___prefill_addr_{PREFILL_KV_ADDR}___decode_addr_" + f"{DECODE_KV_ADDR}_{uuid.uuid4().hex}" + ) + + headers = _build_headers(request_id) + await _run_prefill(request.path, prefill_request, headers, request_id) # Execute decode stage and stream response - generator = forward_request(DECODE_SERVICE_URL, original_request_data) + # Pass the unmodified user request so the decode phase can continue + # sampling with the already-populated KV cache. 
+ generator = _stream_decode( + request.path, original_request_data, headers, request_id + ) response = await make_response(generator) response.timeout = None # Disable timeout for streaming response return response @@ -168,23 +242,10 @@ def main(): @app.route("/v1/completions", methods=["POST"]) async def handle_request(): """Handle incoming API requests with concurrency and rate limiting""" - # Create task for request processing - task = asyncio.create_task(process_request()) - - # Enqueue request or reject if queue is full - if not await request_queue.enqueue(task): - return Response( - response=b'{"error": "Server busy, try again later"}', - status=503, - content_type="application/json", - ) - try: - # Return the response from the processing task - return await task + return await process_request() except asyncio.CancelledError: - # Handle task cancellation (timeout or queue full) - logger.warning("Request cancelled due to timeout or queue full") + logger.warning("Request cancelled") return Response( response=b'{"error": "Request cancelled"}', status=503, diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py index 027f67ad4db69..e07d6c776bc00 100644 --- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py +++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py @@ -255,8 +255,8 @@ def bench_run( torch.cuda.synchronize() # Timing - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) latencies = [] for _ in range(num_iters): diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index c99951aa27826..a1af0b8aec3d0 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -185,8 +185,8 @@ def benchmark_config( graph.replay() torch.cuda.synchronize() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) latencies: list[float] = [] for i in range(num_iters): diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index efa5a7386027e..b8913a217c608 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -105,8 +105,8 @@ def benchmark_permute( graph.replay() torch.cuda.synchronize() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) latencies: list[float] = [] for i in range(num_iters): @@ -241,8 +241,8 @@ def benchmark_unpermute( graph.replay() torch.cuda.synchronize() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) latencies: list[float] = [] for i in range(num_iters): diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py index cb848d2bf579e..83bd91917508f 100644 --- a/benchmarks/kernels/benchmark_mrope.py +++ b/benchmarks/kernels/benchmark_mrope.py @@ -6,7 +6,7 @@ # # The CSV file (named with current date/time) contains these columns: # model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position, -# 
rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99, +# is_neox_style, rope_parameters, dtype, torch_mean, torch_median, torch_p99, # torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max, # speedup # @@ -86,9 +86,8 @@ def benchmark_mrope( num_heads: int, num_kv_heads: int, max_position: int = 8192, - rope_theta: float = 10000, is_neox_style: bool = True, - rope_scaling: dict[str, Any] = None, + rope_parameters: dict[str, Any] | None = None, dtype: torch.dtype = torch.bfloat16, seed: int = 0, warmup_iter: int = 10, @@ -102,9 +101,8 @@ def benchmark_mrope( head_size=head_dim, rotary_dim=head_dim, max_position=max_position, - base=rope_theta, is_neox_style=is_neox_style, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dtype=dtype, ).to(device=device) @@ -203,9 +201,8 @@ def benchmark_mrope( num_kv_heads, head_dim, max_position, - rope_theta, is_neox_style, - str(rope_scaling), + str(rope_parameters), str(dtype).split(".")[-1], torch_stats["mean"], torch_stats["median"], @@ -255,9 +252,8 @@ if __name__ == "__main__": "num_kv_heads", "head_dim", "max_position", - "rope_theta", "is_neox_style", - "rope_scaling", + "rope_parameters", "dtype", "torch_mean", "torch_median", @@ -303,7 +299,7 @@ if __name__ == "__main__": q_size = num_heads * head_dim kv_size = num_kv_heads * head_dim is_neox_style = True - rope_theta = config.rope_theta + rope_parameters = config.rope_parameters max_position = config.max_position_embeddings for num_tokens in num_tokens_list: @@ -315,9 +311,8 @@ if __name__ == "__main__": num_heads=num_heads, num_kv_heads=num_kv_heads, max_position=max_position, - rope_theta=rope_theta, is_neox_style=is_neox_style, - rope_scaling=config.rope_scaling, + rope_parameters=rope_parameters, dtype=getattr(torch, args.dtype), seed=args.seed, warmup_iter=args.warmup_iter, diff --git a/benchmarks/kernels/benchmark_per_token_group_quant.py b/benchmarks/kernels/benchmark_per_token_group_quant.py index bdc1eb733084e..eba4d510258b6 100644 --- a/benchmarks/kernels/benchmark_per_token_group_quant.py +++ b/benchmarks/kernels/benchmark_per_token_group_quant.py @@ -30,8 +30,8 @@ def _time_cuda( fn() torch.cuda.synchronize() - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) + start = torch.Event(enable_timing=True) + end = torch.Event(enable_timing=True) start.record() for _ in range(bench_iters): diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py index a5887aafd30d6..de01ff197eab7 100644 --- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py +++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py @@ -253,8 +253,8 @@ def benchmark( ) torch.cuda.synchronize() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) # Benchmark latencies: list[float] = [] diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py index 29ce18234dfa0..1d0d6fbb9a470 100644 --- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py @@ -127,8 +127,8 @@ def benchmark_decode( def time_fn(fn, warmup=10, trials=20): torch.cuda.synchronize() - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) + start = torch.Event(enable_timing=True) + 
end = torch.Event(enable_timing=True) times = [] for i in range(warmup): fn() diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py index 2a25d03748112..84bde723abf7f 100644 --- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py @@ -139,8 +139,8 @@ def benchmark_prefill( def time_fn(fn, warmup=10, trials=20): torch.cuda.synchronize() - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) + start = torch.Event(enable_timing=True) + end = torch.Event(enable_timing=True) times = [] for i in range(warmup): fn() diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index ab54f81985bc2..b52500c8c5217 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -183,8 +183,8 @@ def benchmark_config( run() torch.cuda.synchronize() - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + start_event = torch.Event(enable_timing=True) + end_event = torch.Event(enable_timing=True) latencies: list[float] = [] for i in range(num_iters): diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index aa84125818d10..fbbb03c5ed465 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -375,6 +375,7 @@ set(VLLM_EXT_SRC if (AVX512_FOUND AND NOT AVX512_DISABLED) set(VLLM_EXT_SRC "csrc/cpu/shm.cpp" + "csrc/cpu/cpu_wna16.cpp" ${VLLM_EXT_SRC}) if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI) set(VLLM_EXT_SRC diff --git a/cmake/external_projects/triton_kernels.cmake b/cmake/external_projects/triton_kernels.cmake new file mode 100644 index 0000000000000..d35ad123dd9de --- /dev/null +++ b/cmake/external_projects/triton_kernels.cmake @@ -0,0 +1,53 @@ +# Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels + +set(DEFAULT_TRITON_KERNELS_TAG "v3.5.0") + +# Set TRITON_KERNELS_SRC_DIR for use with local development with vLLM. We expect TRITON_KERNELS_SRC_DIR to +# be directly set to the triton_kernels python directory. 
+if (DEFINED ENV{TRITON_KERNELS_SRC_DIR}) + message(STATUS "[triton_kernels] Fetch from $ENV{TRITON_KERNELS_SRC_DIR}") + FetchContent_Declare( + triton_kernels + SOURCE_DIR $ENV{TRITON_KERNELS_SRC_DIR} + ) + +else() + set(TRITON_GIT "https://github.com/triton-lang/triton.git") + message (STATUS "[triton_kernels] Fetch from ${TRITON_GIT}:${DEFAULT_TRITON_KERNELS_TAG}") + FetchContent_Declare( + triton_kernels + # TODO (varun) : Fetch just the triton_kernels directory from Triton + GIT_REPOSITORY https://github.com/triton-lang/triton.git + GIT_TAG ${DEFAULT_TRITON_KERNELS_TAG} + GIT_PROGRESS TRUE + SOURCE_SUBDIR python/triton_kernels/triton_kernels + ) +endif() + +# Fetch content +FetchContent_MakeAvailable(triton_kernels) + +if (NOT triton_kernels_SOURCE_DIR) + message (FATAL_ERROR "[triton_kernels] Cannot resolve triton_kernels_SOURCE_DIR") +endif() + +if (DEFINED ENV{TRITON_KERNELS_SRC_DIR}) + set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/") +else() + set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/python/triton_kernels/triton_kernels/") +endif() + +message (STATUS "[triton_kernels] triton_kernels is available at ${TRITON_KERNELS_PYTHON_DIR}") + +add_custom_target(triton_kernels) + +# Ensure the vllm/third_party directory exists before installation +install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/third_party/triton_kernels\")") + +## Copy .py files to install directory. +install(DIRECTORY + ${TRITON_KERNELS_PYTHON_DIR} + DESTINATION + vllm/third_party/triton_kernels/ + COMPONENT triton_kernels + FILES_MATCHING PATTERN "*.py") diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index 567c8959f0454..6cc5cda14c525 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 58e0626a692f09241182582659e3bf8f16472659 + GIT_TAG 71bb26f6295449be880344b93b51791cc009237d GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 0aa0dc14c7480..32960cc8073bb 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -552,7 +552,11 @@ __global__ void indexer_k_quant_and_cache_kernel( #ifndef USE_ROCM __syncwarp(); #endif +#if defined(__gfx942__) + float scale = fmaxf(amax, 1e-4) / 224.0f; +#else float scale = fmaxf(amax, 1e-4) / 448.0f; +#endif if (use_ue8m0) { scale = exp2f(ceilf(log2f(scale))); } @@ -965,7 +969,9 @@ __global__ void gather_and_maybe_dequant_cache( } }; - for (int pid = split_start; pid < full_blocks_end; ++pid) { + const auto loop_end = + std::min((int64_t)full_blocks_end, block_table_stride - offset); + for (int pid = split_start; pid < loop_end; ++pid) { auto block_id = batch_block_table[pid]; auto block_start_ptr = src_cache + block_id * cache_block_stride; auto block_dst_ptr = dst + pid * block_size * dst_entry_stride; @@ -976,12 +982,15 @@ __global__ void gather_and_maybe_dequant_cache( } if (partial_block_size) { - auto block_id = batch_block_table[full_blocks_end]; - auto block_start_ptr = src_cache + block_id * cache_block_stride; - auto block_dst_ptr = dst + full_blocks_end * block_size * dst_entry_stride; - for (int eid = 0; eid < partial_block_size; ++eid) { - copy_entry(block_start_ptr + eid * cache_entry_stride, - block_dst_ptr + eid * 
dst_entry_stride); + if (offset + full_blocks_end < block_table_stride) { + auto block_id = batch_block_table[full_blocks_end]; + auto block_start_ptr = src_cache + block_id * cache_block_stride; + auto block_dst_ptr = + dst + full_blocks_end * block_size * dst_entry_stride; + for (int eid = 0; eid < partial_block_size; ++eid) { + copy_entry(block_start_ptr + eid * cache_entry_stride, + block_dst_ptr + eid * dst_entry_stride); + } } } } diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp index 344296528b652..294b4f714a769 100644 --- a/csrc/cpu/cpu_attn_impl.hpp +++ b/csrc/cpu/cpu_attn_impl.hpp @@ -1,7 +1,6 @@ #ifndef CPU_ATTN_HPP #define CPU_ATTN_HPP -#include #include #include @@ -12,6 +11,7 @@ #include "cpu_types.hpp" #include "scratchpad_manager.h" #include "cpu_attn_macros.h" +#include "utils.hpp" namespace cpu_attention { enum class ISA { AMX, VEC, VEC16 }; diff --git a/csrc/cpu/cpu_types_scalar.hpp b/csrc/cpu/cpu_types_scalar.hpp index 1a9278bc662e5..f9da78283da5e 100644 --- a/csrc/cpu/cpu_types_scalar.hpp +++ b/csrc/cpu/cpu_types_scalar.hpp @@ -26,10 +26,6 @@ namespace vec_op { #define FORCE_INLINE __attribute__((always_inline)) inline -#define __max(a, b) ((a) > (b) ? (a) : (b)) -#define __min(a, b) ((a) < (b) ? (a) : (b)) -#define __abs(a) ((a) < (0) ? (0 - a) : (a)) - typedef struct f16x8_t { uint16_t val[8]; } f16x8_t; @@ -99,7 +95,7 @@ struct FP16Vec16 : public Vec { void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } void save(void* ptr, const int elem_num) const { - int num = __min(elem_num, VEC_ELEM_NUM); + int num = std::min(elem_num, VEC_ELEM_NUM); std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t)); } }; @@ -128,7 +124,7 @@ struct BF16Vec16 : public Vec { void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } void save(void* ptr, const int elem_num) const { - int num = __min(elem_num, VEC_ELEM_NUM); + int num = std::min(elem_num, VEC_ELEM_NUM); std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t)); } }; @@ -143,9 +139,9 @@ struct BF16Vec32 : public Vec { explicit BF16Vec32(f16x32_t data) : reg(data) {}; explicit BF16Vec32(BF16Vec8& vec8_data) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { + unroll_loop([&vec8_data, this](int i) { reg.val[i] = vec8_data.reg.val[i % BF16Vec8::VEC_ELEM_NUM]; - } + }); } void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } @@ -157,15 +153,11 @@ struct FP32Vec4 : public Vec { f32x4_t reg; explicit FP32Vec4(float v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = v; - } + unroll_loop([&v, this](int i) { reg.val[i] = v; }); } explicit FP32Vec4() { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = 0.0f; - } + unroll_loop([this](int i) { reg.val[i] = 0.0f; }); } explicit FP32Vec4(const float* ptr) @@ -182,15 +174,11 @@ struct FP32Vec8 : public Vec { f32x8_t reg; explicit FP32Vec8(float v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = v; - } + unroll_loop([&v, this](int i) { reg.val[i] = v; }); } explicit FP32Vec8() { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = 0.0f; - } + unroll_loop([this](int i) { reg.val[i] = 0.0f; }); } explicit FP32Vec8(const float* ptr) @@ -201,78 +189,68 @@ struct FP32Vec8 : public Vec { explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {}; explicit FP32Vec8(const FP16Vec8& v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = fp16_to_float(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = fp16_to_float(v.reg.val[i]); }); } FP32Vec8(const BF16Vec8& v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - 
reg.val[i] = bf16_to_float(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = bf16_to_float(v.reg.val[i]); }); } float reduce_sum() const { float result = 0; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result += reg.val[i]; - } + unroll_loop( + [&result, this](int i) { result += reg.val[i]; }); return result; } FP32Vec8 exp() const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = expf(reg.val[i]); - } + unroll_loop( + [&ret, this](int i) { ret.val[i] = expf(reg.val[i]); }); return FP32Vec8(ret); } FP32Vec8 tanh() const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = tanhf(reg.val[i]); - } + unroll_loop( + [&ret, this](int i) { ret.val[i] = tanhf(reg.val[i]); }); return FP32Vec8(ret); } FP32Vec8 er() const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = erf(reg.val[i]); - } + unroll_loop( + [&ret, this](int i) { ret.val[i] = erf(reg.val[i]); }); return FP32Vec8(ret); } FP32Vec8 operator*(const FP32Vec8& b) const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = reg.val[i] * b.reg.val[i]; - } + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] * b.reg.val[i]; }); return FP32Vec8(ret); } FP32Vec8 operator+(const FP32Vec8& b) const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = reg.val[i] + b.reg.val[i]; - } + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] + b.reg.val[i]; }); return FP32Vec8(ret); } FP32Vec8 operator-(const FP32Vec8& b) const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = reg.val[i] - b.reg.val[i]; - } + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] - b.reg.val[i]; }); return FP32Vec8(ret); } FP32Vec8 operator/(const FP32Vec8& b) const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = reg.val[i] / b.reg.val[i]; - } + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] / b.reg.val[i]; }); return FP32Vec8(ret); } @@ -284,15 +262,11 @@ struct FP32Vec16 : public Vec { f32x16_t reg; explicit FP32Vec16(float v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = v; - } + unroll_loop([&v, this](int i) { reg.val[i] = v; }); } explicit FP32Vec16() { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = 0.0f; - } + unroll_loop([this](int i) { reg.val[i] = 0.0f; }); } explicit FP32Vec16(const float* ptr) @@ -301,29 +275,27 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16(f32x16_t data) : reg(data) {}; FP32Vec16(const FP32Vec4& data) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { + unroll_loop([&data, this](int i) { reg.val[i] = data.reg.val[i % FP32Vec4::VEC_ELEM_NUM]; - } + }); } FP32Vec16(const FP32Vec8& data) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { + unroll_loop([&data, this](int i) { reg.val[i] = data.reg.val[i % FP32Vec8::VEC_ELEM_NUM]; - } + }); } FP32Vec16(const FP32Vec16& data) : reg(data.reg) {}; explicit FP32Vec16(const FP16Vec16& v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = fp16_to_float(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = fp16_to_float(v.reg.val[i]); }); } explicit FP32Vec16(const BF16Vec16& v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = bf16_to_float(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = bf16_to_float(v.reg.val[i]); }); } explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}; @@ -331,82 +303,74 @@ struct FP32Vec16 : public Vec { FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}; FP32Vec16 
operator*(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = reg.val[i] * b.reg.val[i]; - } - return result; + f32x16_t ret; + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] * b.reg.val[i]; }); + return FP32Vec16(ret); } FP32Vec16 operator+(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = reg.val[i] + b.reg.val[i]; - } - return result; + f32x16_t ret; + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] + b.reg.val[i]; }); + return FP32Vec16(ret); } FP32Vec16 operator-(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = reg.val[i] - b.reg.val[i]; - } - return result; + f32x16_t ret; + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] - b.reg.val[i]; }); + return FP32Vec16(ret); } FP32Vec16 operator/(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = reg.val[i] / b.reg.val[i]; - } - return result; + f32x16_t ret; + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] / b.reg.val[i]; }); + return FP32Vec16(ret); } FP32Vec16 max(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = __max(reg.val[i], b.reg.val[i]); - } - return result; + f32x16_t ret; + unroll_loop([&ret, &b, this](int i) { + ret.val[i] = std::max(reg.val[i], b.reg.val[i]); + }); + return FP32Vec16(ret); } FP32Vec16 min(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = __min(reg.val[i], b.reg.val[i]); - } - return result; + f32x16_t ret; + unroll_loop([&ret, &b, this](int i) { + ret.val[i] = std::min(reg.val[i], b.reg.val[i]); + }); + return FP32Vec16(ret); } FP32Vec16 abs() const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = __abs(reg.val[i]); - } - return result; + f32x16_t ret; + unroll_loop( + [&ret, this](int i) { ret.val[i] = std::abs(reg.val[i]); }); + return FP32Vec16(ret); } float reduce_sum() const { float result = 0.0f; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result += reg.val[i]; - } + unroll_loop( + [&result, this](int i) { result += reg.val[i]; }); return result; } float reduce_max() const { - float result = reg.val[0]; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result = __max(reg.val[i], result); - } + float result = std::numeric_limits::lowest(); + unroll_loop( + [&result, this](int i) { result = std::max(reg.val[i], result); }); return result; } float reduce_min() const { - float result = reg.val[0]; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result = __min(reg.val[i], result); - } + float result = std::numeric_limits::max(); + unroll_loop( + [&result, this](int i) { result = std::min(reg.val[i], result); }); return result; } @@ -414,13 +378,9 @@ struct FP32Vec16 : public Vec { float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); float sum = 0.0; - int start = idx * group_size; - int end = (idx + 1) * group_size; - - for (; (start < VEC_ELEM_NUM) && (start < end); ++start) { - sum += reg.val[start]; - } - + const int start = idx * group_size; + unroll_loop( + [&sum, &start, this](int i) { sum += reg.val[start + i]; }); return sum; } @@ -477,17 +437,13 @@ inline void storeFP32(float v, c10::BFloat16* ptr) { } inline FP16Vec16::FP16Vec16(const FP32Vec16& v) { - int i = 0; - 
for (i = 0; i < FP16Vec16::VEC_ELEM_NUM; ++i) { - reg.val[i] = float_to_fp16(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = float_to_fp16(v.reg.val[i]); }); } inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) { - int i = 0; - for (i = 0; i < FP16Vec8::VEC_ELEM_NUM; ++i) { - reg.val[i] = float_to_fp16(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = float_to_fp16(v.reg.val[i]); }); } inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { @@ -495,17 +451,13 @@ inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { } inline BF16Vec8::BF16Vec8(const FP32Vec8& v) { - int i = 0; - for (i = 0; i < BF16Vec8::VEC_ELEM_NUM; ++i) { - reg.val[i] = float_to_bf16(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = float_to_bf16(v.reg.val[i]); }); } inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { - int i = 0; - for (i = 0; i < BF16Vec16::VEC_ELEM_NUM; ++i) { - reg.val[i] = float_to_bf16(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = float_to_bf16(v.reg.val[i]); }); } inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 3); } diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index 7ddf028e6e131..6f51277f78440 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -104,6 +104,8 @@ struct FP16Vec16 : public Vec { explicit FP16Vec16(bool, void* ptr) : reg(_mm256_stream_load_si256((__m256i*)ptr)) {} + explicit FP16Vec16(const c10::Half v) : reg(_mm256_set1_epi16(v.x)) {} + explicit FP16Vec16(const FP32Vec16&); void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); } @@ -141,6 +143,8 @@ struct BF16Vec16 : public Vec { explicit BF16Vec16(bool, void* ptr) : reg(_mm256_stream_load_si256((__m256i*)ptr)) {} + explicit BF16Vec16(const c10::BFloat16 v) : reg(_mm256_set1_epi16(v.x)) {} + explicit BF16Vec16(const FP32Vec16&); void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); } @@ -350,6 +354,22 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16(__m512 data) : reg(data) {} + // de-pack 4 bit values + explicit FP32Vec16(int64_t value, const FP32Vec16& lut) { + int64_t mask_0 = 0x0F0F0F0F0F0F0F0F; + int64_t mask_1 = 0xF0F0F0F0F0F0F0F0; + int64_t value_0 = value & mask_0; + int64_t value_1 = value & mask_1; + __m128i vec_0 = _mm_movpi64_epi64((__m64)value_0); + __m128i vec_1 = _mm_movpi64_epi64((__m64)value_1); + vec_0 = _mm_cvtepu8_epi16(vec_0); + vec_1 = _mm_cvtepu8_epi16(vec_1); + vec_1 = _mm_slli_epi16(vec_1, 4); + __m128i vec = _mm_or_si128(vec_0, vec_1); + __m512i vec_i32 = _mm512_cvtepu8_epi32(vec); + reg = _mm512_permutexvar_ps(vec_i32, lut.reg); + } + explicit FP32Vec16(const FP32Vec4& data) : reg((__m512)_mm512_inserti32x4( _mm512_inserti32x4( @@ -426,14 +446,6 @@ struct FP32Vec16 : public Vec { float get_last_elem() const { return _mm512_cvtss_f32(reg); } - template - float reduce_sub_sum(int idx) { - static_assert(VEC_ELEM_NUM % group_size == 0); - constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); - __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size)); - return _mm512_mask_reduce_add_ps(mask, reg); - } - void save(float* ptr) const { _mm512_storeu_ps(ptr, reg); } void save(float* ptr, const int elem_num) const { @@ -755,6 +767,25 @@ inline void non_temporal_save(BF16Vec16& vec, void* ptr) { inline void non_temporal_save(FP32Vec16& vec, void* ptr) { _mm512_stream_ps((float*)ptr, vec.reg); } + +static void interleave_save(const BF16Vec16& vec0, const BF16Vec16& vec1, + void* ptr) { + __m512i vec_0 = 
_mm512_cvtepu16_epi32(vec0.reg); + __m512i vec_1 = _mm512_cvtepu16_epi32(vec1.reg); + vec_1 = _mm512_slli_epi32(vec_1, 16); + vec_0 = _mm512_or_si512(vec_0, vec_1); + _mm512_storeu_epi32(ptr, vec_0); +} + +static void interleave_save(const FP16Vec16& vec0, const FP16Vec16& vec1, + void* ptr) { + __m512i vec_0 = _mm512_cvtepu16_epi32(vec0.reg); + __m512i vec_1 = _mm512_cvtepu16_epi32(vec1.reg); + vec_1 = _mm512_slli_epi32(vec_1, 16); + vec_0 = _mm512_or_si512(vec_0, vec_1); + _mm512_storeu_epi32(ptr, vec_0); +} + #endif inline void mem_barrier() { _mm_mfence(); } diff --git a/csrc/cpu/cpu_wna16.cpp b/csrc/cpu/cpu_wna16.cpp new file mode 100644 index 0000000000000..816d195506e52 --- /dev/null +++ b/csrc/cpu/cpu_wna16.cpp @@ -0,0 +1,402 @@ +#include "cpu_types.hpp" +#include "scratchpad_manager.h" +#include "utils.hpp" + +#ifdef CPU_CAPABILITY_AMXBF16 + #include "cpu/micro_gemm/cpu_micro_gemm_amx.hpp" +#endif +#include "cpu/micro_gemm/cpu_micro_gemm_vec.hpp" + +#define VLLM_DISPATCH_CASE_16B_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) + +#define VLLM_DISPATCH_16B_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_16B_TYPES(__VA_ARGS__)) + +template +void print_logits(const char* name, T* ptr, int32_t row, int32_t col, + int32_t stride) { + std::stringstream ss; + ss << std::fixed << std::setprecision(5) << name << ": [\n"; + auto* curr_logits_buffer = ptr; + for (int32_t m = 0; m < row; ++m) { + for (int32_t n = 0; n < col; ++n) { + ss << curr_logits_buffer[n] << ", "; + } + ss << "\n"; + curr_logits_buffer += stride; + } + ss << "]\n"; + std::printf("%s", ss.str().c_str()); +} + +namespace { +using cpu_utils::ISA; +using cpu_utils::VecTypeTrait; + +template +class Dequantizer4b { + public: + constexpr static int32_t pack_num = 32 / 4; + using scalar_vec_t = typename VecTypeTrait::vec_t; + + public: + static void dequant(int32_t* __restrict__ q_weight, + scalar_t* __restrict__ weight, + scalar_t* __restrict__ scales, + int32_t* __restrict__ zeros, int32_t* __restrict__ g_idx, + const int64_t scales_stride, const int64_t zeros_stride, + const int32_t k_size, const int32_t group_size) { + vec_op::FP32Vec16 lut; + if constexpr (has_zp) { + // AWQ + alignas(64) static const float LUT[16] = { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, + 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}; + lut = vec_op::FP32Vec16(LUT); + } else { + // GPTQ + alignas(64) static const float LUT[16] = { + -8.0f, -7.0f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; + lut = vec_op::FP32Vec16(LUT); + } + + // per 64-bits elem contains 16 output channels + int64_t* __restrict__ curr_q_weight = reinterpret_cast(q_weight); + int64_t* __restrict__ curr_zeros = reinterpret_cast(zeros); + scalar_t* __restrict__ curr_weight = weight; + scalar_t* __restrict__ curr_scale = scales; + vec_op::FP32Vec16 scale_0; + vec_op::FP32Vec16 scale_1; + vec_op::FP32Vec16 zero_0; + vec_op::FP32Vec16 zero_1; + int32_t group_counter = 0; + for (int32_t k_idx = 0; k_idx < k_size; k_idx += 2) { + int64_t qwb_0 = *curr_q_weight; + int64_t qwb_1 = *(curr_q_weight + 1); + vec_op::FP32Vec16 wb_0(qwb_0, lut); + vec_op::FP32Vec16 wb_1(qwb_1, lut); + + if constexpr (!use_desc_act) { + if (group_counter == 0) { + scale_0 = vec_op::FP32Vec16(scalar_vec_t(curr_scale)); + scale_1 = vec_op::FP32Vec16(scale_0); + curr_scale += scales_stride; + + if constexpr (has_zp) { + zero_0 = 
vec_op::FP32Vec16(*curr_zeros, lut); + zero_1 = vec_op::FP32Vec16(zero_0); + curr_zeros += zeros_stride / 2; + } + } + } else { + int32_t g_idx_0 = g_idx[k_idx]; + int32_t g_idx_1 = g_idx[k_idx + 1]; + scale_0 = vec_op::FP32Vec16( + scalar_vec_t(curr_scale + g_idx_0 * scales_stride)); + scale_1 = vec_op::FP32Vec16( + scalar_vec_t(curr_scale + g_idx_1 * scales_stride)); + if constexpr (has_zp) { + zero_0 = vec_op::FP32Vec16(*(curr_zeros + g_idx_0 * zeros_stride / 2), + lut); + zero_1 = vec_op::FP32Vec16(*(curr_zeros + g_idx_1 * zeros_stride / 2), + lut); + } + } + + if constexpr (has_zp) { + wb_0 = wb_0 - zero_0; + wb_1 = wb_1 - zero_1; + } + + wb_0 = wb_0 * scale_0; + wb_1 = wb_1 * scale_1; + + scalar_vec_t output_vec_0(wb_0); + scalar_vec_t output_vec_1(wb_1); + + // AMX needs to interlave K elements to pack as 32 bits + if constexpr (isa == ISA::AMX) { + vec_op::interleave_save(output_vec_0, output_vec_1, curr_weight); + } else { + output_vec_0.save(curr_weight); + output_vec_1.save(curr_weight + 16); + } + + // update + curr_q_weight += 2; + curr_weight += 32; + if constexpr (!use_desc_act) { + group_counter += 2; + if (group_counter == group_size) { + group_counter = 0; + } + } + } + } +}; +}; // namespace + +template +void cpu_gemm_wna16_impl( + scalar_t* __restrict__ input, int32_t* __restrict__ q_weight, + scalar_t* __restrict__ output, scalar_t* __restrict__ scales, + int32_t* __restrict__ zeros, int32_t* __restrict__ g_idx, + scalar_t* __restrict__ bias, const int32_t m_size, const int32_t n_size, + const int32_t k_size, const int64_t input_stride, + const int64_t output_stride, const int64_t scales_group_stride, + const int64_t zeros_group_stride, const int32_t group_num, + const int32_t group_size, const int64_t pack_factor) { + constexpr int32_t gemm_n_tile_size = gemm_t::NSize; + constexpr int32_t gemm_m_tile_size = gemm_t::MaxMSize; + constexpr int32_t n_block_size = 16; + static_assert(gemm_n_tile_size % n_block_size == 0); + const int32_t thread_num = omp_get_max_threads(); + + // a simple schedule policy, just to hold more B tiles in L2 and make sure + // each thread has tasks + const int32_t n_partition_size = [&]() { + const int64_t cache_size = cpu_utils::get_l2_size(); + int64_t ps_cache_limit = cache_size / (k_size * sizeof(scalar_t)); + int64_t ps_thread_limit = n_size / thread_num; + ps_cache_limit = + std::max((ps_cache_limit / gemm_n_tile_size) * gemm_n_tile_size, + (int64_t)gemm_n_tile_size); + ps_thread_limit = + std::max((ps_thread_limit / gemm_n_tile_size) * gemm_n_tile_size, + (int64_t)gemm_n_tile_size); + return std::min(ps_cache_limit, ps_thread_limit); + }(); + const int32_t task_num = (n_size + n_partition_size - 1) / n_partition_size; + + // get buffer size + const int64_t b_buffer_size = + (((n_partition_size * k_size * sizeof(scalar_t) + 63) / 64) * 64); + const int64_t c_buffer_size = + (((gemm_m_tile_size * gemm_n_tile_size * sizeof(float) + 63) / 64) * 64); + const int64_t b_buffer_offset = 0; + const int64_t c_buffer_offset = b_buffer_size; + const int64_t buffer_size = b_buffer_size + c_buffer_size; + DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc(buffer_size * + thread_num); + + alignas(64) cpu_utils::Counter counter; + cpu_utils::Counter* counter_ptr = &counter; + +#pragma omp parallel for schedule(static, 1) + for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) { + scalar_t* __restrict__ b_buffer = nullptr; + float* __restrict__ c_buffer = nullptr; + { + uint8_t* buffer_ptr = 
DNNLScratchPadManager::get_dnnl_scratchpad_manager() + ->get_data() + + thread_id * buffer_size; + b_buffer = reinterpret_cast(buffer_ptr + b_buffer_offset); + c_buffer = reinterpret_cast(buffer_ptr + c_buffer_offset); + } + + const int64_t q_weight_block_stride = n_block_size / pack_factor * k_size; + const int64_t b_buffer_block_stride = n_block_size * k_size; + const int32_t zeros_block_stride = n_block_size / pack_factor; + + gemm_t gemm; + + for (;;) { + int32_t task_id = counter_ptr->acquire_counter(); + + if (task_id >= task_num) { + break; + } + + const int32_t n_start_idx = task_id * n_partition_size; + const int32_t n_block_start_idx = n_start_idx / n_block_size; + const int32_t n_num = std::min(n_partition_size, n_size - n_start_idx); + const int32_t n_block_num = n_num / n_block_size; + // std::printf("thread_id: %d, task_id: %d, n_start_idx: %d, n_num: %d\n", + // thread_id, task_id, n_start_idx, n_num); + + // dequant weight + { + int32_t* __restrict__ curr_q_weight = + q_weight + n_block_start_idx * q_weight_block_stride; + scalar_t* __restrict__ curr_b_buffer = b_buffer; + scalar_t* __restrict__ curr_scales = scales + n_start_idx; + int32_t* __restrict__ curr_zeros = zeros + n_start_idx / pack_factor; + for (int32_t block_idx = 0; block_idx < n_block_num; ++block_idx) { + dequantizer_t::dequant(curr_q_weight, curr_b_buffer, curr_scales, + curr_zeros, g_idx, scales_group_stride, + zeros_group_stride, k_size, group_size); + + // if (block_idx == 0 && n_start_idx == 0) { + // print_logits("depacked weight", curr_b_buffer, k_size, + // n_block_size, n_block_size); + // } + + // update + curr_q_weight += q_weight_block_stride; + curr_b_buffer += b_buffer_block_stride; + curr_scales += n_block_size; + curr_zeros += zeros_block_stride; + } + } + + // compute loop + { + const int32_t n_tile_num = n_num / gemm_n_tile_size; + scalar_t* __restrict__ curr_input = input; + scalar_t* __restrict__ init_bias = bias; + if (bias != nullptr) { + init_bias += n_start_idx; + } + scalar_t* __restrict__ init_output = output + n_start_idx; + for (int32_t m_idx = 0; m_idx < m_size; m_idx += gemm_m_tile_size) { + const int32_t curr_m_size = + std::min(gemm_m_tile_size, m_size - m_idx); + scalar_t* __restrict__ curr_b_buffer = b_buffer; + scalar_t* __restrict__ curr_bias = init_bias; + scalar_t* __restrict__ curr_output = init_output; + for (int32_t n_tile_idx = 0; n_tile_idx < n_tile_num; ++n_tile_idx) { + gemm.gemm(curr_input, curr_b_buffer, c_buffer, curr_m_size, k_size, + input_stride, b_buffer_block_stride, gemm_n_tile_size, + false); + + if (bias != nullptr) { + cpu_micro_gemm::bias_epilogue( + c_buffer, curr_output, curr_bias, curr_m_size, + gemm_n_tile_size, output_stride); + curr_bias += gemm_n_tile_size; + } else { + cpu_micro_gemm::default_epilogue( + c_buffer, curr_output, curr_m_size, gemm_n_tile_size, + output_stride); + } + + curr_b_buffer += + b_buffer_block_stride * (gemm_n_tile_size / n_block_size); + curr_output += gemm_n_tile_size; + } + curr_input += gemm_m_tile_size * input_stride; + init_output += gemm_m_tile_size * output_stride; + } + } + } + } +} + +void cpu_gemm_wna16( + const torch::Tensor& input, // [M, K] + const torch::Tensor& + q_weight, // [N / 16, K * 16 / pack_factor], packed as int32 + torch::Tensor& output, // [M, N] + const torch::Tensor& scales, // [group_num, N] + const std::optional& + zeros, // [group_num, N / pack_factor], packed as int32 + const std::optional& g_idx, // [K] + const std::optional& bias, // [N] + const int64_t pack_factor, const std::string& 
isa_hint) { + using cpu_utils::ISA; + TORCH_CHECK_EQ(pack_factor, 8); // only supports 4bits + const int32_t a_m_size = input.size(0); + const int32_t a_k_size = input.size(1); + const int64_t a_m_stride = input.stride(0); + const int32_t b_n_size = q_weight.size(0) * 16; + TORCH_CHECK_EQ(a_k_size % 32, 0); + TORCH_CHECK_EQ(b_n_size % 32, 0); + const int32_t group_num = scales.size(0); + const int32_t group_size = a_k_size / group_num; + TORCH_CHECK_EQ(group_size % 2, 0); + const int64_t scales_group_stride = scales.stride(0); + const int64_t output_m_stride = output.stride(0); + + bool has_zp = zeros.has_value(); + bool use_desc_act = g_idx.has_value(); + TORCH_CHECK(!(has_zp && use_desc_act)); + + ISA isa = [&]() { + if (isa_hint == "amx") { + return ISA::AMX; + } else if (isa_hint == "vec") { + return ISA::VEC; + } else { + TORCH_CHECK(false, "unsupported isa hint: " + isa_hint); + } + }(); + + int32_t* zeros_ptr = has_zp ? zeros->data_ptr() : nullptr; + const int64_t zeros_group_stride = has_zp ? zeros->stride(0) : 0; + int32_t* g_idx_ptr = use_desc_act ? g_idx->data_ptr() : nullptr; + + VLLM_DISPATCH_16B_TYPES(input.scalar_type(), "cpu_gemm_wna16", [&]() { + if (isa == ISA::AMX) { + using gemm_t = cpu_micro_gemm::MicroGemm; + if (has_zp) { + using dequantizer_t = Dequantizer4b; + cpu_gemm_wna16_impl( + input.data_ptr(), q_weight.data_ptr(), + output.data_ptr(), scales.data_ptr(), zeros_ptr, + g_idx_ptr, bias.has_value() ? bias->data_ptr() : nullptr, + a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride, + scales_group_stride, zeros_group_stride, group_num, group_size, + pack_factor); + return; + } + if (use_desc_act) { + using dequantizer_t = Dequantizer4b; + cpu_gemm_wna16_impl( + input.data_ptr(), q_weight.data_ptr(), + output.data_ptr(), scales.data_ptr(), zeros_ptr, + g_idx_ptr, bias.has_value() ? bias->data_ptr() : nullptr, + a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride, + scales_group_stride, zeros_group_stride, group_num, group_size, + pack_factor); + return; + } else { + using dequantizer_t = Dequantizer4b; + cpu_gemm_wna16_impl( + input.data_ptr(), q_weight.data_ptr(), + output.data_ptr(), scales.data_ptr(), zeros_ptr, + g_idx_ptr, bias.has_value() ? bias->data_ptr() : nullptr, + a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride, + scales_group_stride, zeros_group_stride, group_num, group_size, + pack_factor); + return; + } + } else if (isa == ISA::VEC) { + using gemm_t = cpu_micro_gemm::MicroGemm; + if (has_zp) { + using dequantizer_t = Dequantizer4b; + cpu_gemm_wna16_impl( + input.data_ptr(), q_weight.data_ptr(), + output.data_ptr(), scales.data_ptr(), zeros_ptr, + g_idx_ptr, bias.has_value() ? bias->data_ptr() : nullptr, + a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride, + scales_group_stride, zeros_group_stride, group_num, group_size, + pack_factor); + return; + } + if (use_desc_act) { + using dequantizer_t = Dequantizer4b; + cpu_gemm_wna16_impl( + input.data_ptr(), q_weight.data_ptr(), + output.data_ptr(), scales.data_ptr(), zeros_ptr, + g_idx_ptr, bias.has_value() ? bias->data_ptr() : nullptr, + a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride, + scales_group_stride, zeros_group_stride, group_num, group_size, + pack_factor); + return; + } else { + using dequantizer_t = Dequantizer4b; + cpu_gemm_wna16_impl( + input.data_ptr(), q_weight.data_ptr(), + output.data_ptr(), scales.data_ptr(), zeros_ptr, + g_idx_ptr, bias.has_value() ? 
bias->data_ptr() : nullptr, + a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride, + scales_group_stride, zeros_group_stride, group_num, group_size, + pack_factor); + return; + } + } + }); +} diff --git a/csrc/cpu/dnnl_helper.cpp b/csrc/cpu/dnnl_helper.cpp index 02a8072ccf306..cfb6e78cba9a1 100644 --- a/csrc/cpu/dnnl_helper.cpp +++ b/csrc/cpu/dnnl_helper.cpp @@ -396,9 +396,9 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args) : DNNLMatMulPrimitiveHandler( static_cast(args), args.ab_type), m_size_cache_(nullptr) { - assert(ab_type_ == dnnl::memory::data_type::f32 || - ab_type_ == dnnl::memory::data_type::bf16 || - ab_type_ == dnnl::memory::data_type::f16); + assert(b_type_ == dnnl::memory::data_type::f32 || + b_type_ == dnnl::memory::data_type::bf16 || + b_type_ == dnnl::memory::data_type::f16); dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_, {b_k_stride_, b_n_stride_}); diff --git a/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp b/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp new file mode 100644 index 0000000000000..87a019773a895 --- /dev/null +++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp @@ -0,0 +1,245 @@ +#ifndef CPU_MICRO_GEMM_AMX_HPP +#define CPU_MICRO_GEMM_AMX_HPP +#include "cpu/micro_gemm/cpu_micro_gemm_impl.hpp" + +namespace cpu_micro_gemm { +namespace { +// AMX specific +constexpr static int64_t AMX_TILE_ROW_BYTES = 64; +constexpr static int64_t AMX_TILE_ROW_NUM = 16; +constexpr static int64_t AMX_TILE_BYTES = AMX_TILE_ROW_BYTES * AMX_TILE_ROW_NUM; + +typedef struct __tile_config { + uint8_t palette_id = 1; + uint8_t start_row = 0; + uint8_t reserved_0[14] = {0}; + uint16_t colsb[16] = {0}; + uint8_t rows[16] = {0}; +} __tilecfg; + +// 2-2-4 pattern, for 16 < m <= 32 +// TILE 0, 1: load A matrix, row num should be 16, m - 16 +// TILE 2, 3: load B matrix, row num should be 16 +// TILE 4, 5, 6, 7: store results C matrix, row num should be 16, 16, m - 16, m +// - 16 +template +class TileGemm224 { + public: + FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + TORCH_CHECK(false, "Unsupported data type for TileGemm224"); + } + + FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) { + TORCH_CHECK(false, "Unsupported data type for TileGemm224"); + } +}; + +template <> +class TileGemm224 { + public: + using scalar_t = c10::BFloat16; + FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + const int32_t k_times = k / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16)); + c10::BFloat16* __restrict__ a_tile_0 = a_ptr; + c10::BFloat16* __restrict__ a_tile_1 = a_ptr + lda * AMX_TILE_ROW_NUM; + const int64_t a_tile_stride = lda * sizeof(c10::BFloat16); + + // B is always packed as 16 output channels block + c10::BFloat16* __restrict__ b_tile_2 = b_ptr; + c10::BFloat16* __restrict__ b_tile_3 = b_ptr + b_n_group_stride; + const int32_t b_tile_stride = AMX_TILE_ROW_BYTES; + + float* __restrict__ c_tile_4 = c_ptr; + float* __restrict__ c_tile_5 = + c_tile_4 + AMX_TILE_ROW_BYTES / sizeof(float); + float* __restrict__ c_tile_6 = c_ptr + AMX_TILE_ROW_NUM * ldc; + float* __restrict__ c_tile_7 = + c_tile_6 + AMX_TILE_ROW_BYTES / sizeof(float); + const int32_t c_tile_stride = ldc * sizeof(float); + + if (accum_c) { + _tile_loadd(4, c_tile_4, c_tile_stride); + _tile_loadd(5, c_tile_5, c_tile_stride); + _tile_loadd(6, c_tile_6, c_tile_stride); + _tile_loadd(7, c_tile_7, c_tile_stride); + } else { + _tile_zero(4); + _tile_zero(5); + _tile_zero(6); + _tile_zero(7); + } + + for (int32_t k = 0; k < k_times; ++k) { + _tile_loadd(0, 
a_tile_0, a_tile_stride); + _tile_stream_loadd(2, b_tile_2, b_tile_stride); + _tile_dpbf16ps(4, 0, 2); + _tile_stream_loadd(3, b_tile_3, b_tile_stride); + _tile_dpbf16ps(5, 0, 3); + _tile_loadd(1, a_tile_1, a_tile_stride); + _tile_dpbf16ps(6, 1, 2); + _tile_dpbf16ps(7, 1, 3); + + // update ptrs + a_tile_0 += AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + a_tile_1 += AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + b_tile_2 += AMX_TILE_BYTES / sizeof(c10::BFloat16); + b_tile_3 += AMX_TILE_BYTES / sizeof(c10::BFloat16); + } + + _tile_stored(4, c_tile_4, c_tile_stride); + _tile_stored(5, c_tile_5, c_tile_stride); + _tile_stored(6, c_tile_6, c_tile_stride); + _tile_stored(7, c_tile_7, c_tile_stride); + } + + FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) { + const int32_t m_0 = AMX_TILE_ROW_NUM; + const int32_t m_1 = m - AMX_TILE_ROW_NUM; + config.rows[0] = m_0; + config.rows[1] = m_1; + config.rows[2] = AMX_TILE_ROW_NUM; + config.rows[3] = AMX_TILE_ROW_NUM; + config.rows[4] = m_0; + config.rows[5] = m_0; + config.rows[6] = m_1; + config.rows[7] = m_1; + _tile_loadconfig(&config); + } +}; + +// 1-2-2 pattern, for 0 < m <= 16 +// TILE 0, (1): load A matrix, use extra 1 tile for prefetch, row num should be +// m, m +// TILE 2, 3, (4, 5): load B matrix, use extra 2 tiles for prefetch, row +// num should be 16 +// TILE 6, 7, (6, 7): store results C matrix, row num should be +// m +template +class TileGemm122 { + public: + FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + TORCH_CHECK(false, "Unsupported data type for TileGemm122"); + } + + FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) { + TORCH_CHECK(false, "Unsupported data type for TileGemm122"); + } +}; + +template <> +class TileGemm122 { + public: + using scalar_t = c10::BFloat16; + FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + c10::BFloat16* __restrict__ a_tile_0 = a_ptr; + c10::BFloat16* __restrict__ a_tile_1 = + a_ptr + AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + const int64_t a_tile_stride = lda * sizeof(c10::BFloat16); + + c10::BFloat16* __restrict__ b_tile_2 = b_ptr; + c10::BFloat16* __restrict__ b_tile_3 = b_ptr + b_n_group_stride; + c10::BFloat16* __restrict__ b_tile_4 = + b_tile_2 + AMX_TILE_BYTES / sizeof(c10::BFloat16); + c10::BFloat16* __restrict__ b_tile_5 = + b_tile_3 + AMX_TILE_BYTES / sizeof(c10::BFloat16); + int64_t b_stride = AMX_TILE_ROW_BYTES; + + float* __restrict__ c_tile_6 = c_ptr; + float* __restrict__ c_tile_7 = c_ptr + AMX_TILE_ROW_BYTES / sizeof(float); + int64_t c_stride = ldc * sizeof(float); + + const int32_t k_times = k / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16)); + const int32_t k_group_times = k_times / 2; + const bool has_tail = (k_times % 2 == 1); + + if (accum_c) { + _tile_loadd(6, c_tile_6, c_stride); + _tile_loadd(7, c_tile_7, c_stride); + } else { + _tile_zero(6); + _tile_zero(7); + } + + for (int32_t k = 0; k < k_group_times; ++k) { + _tile_loadd(0, a_tile_0, a_tile_stride); + _tile_stream_loadd(2, b_tile_2, b_stride); + _tile_dpbf16ps(6, 0, 2); + _tile_stream_loadd(3, b_tile_3, b_stride); + _tile_dpbf16ps(7, 0, 3); + _tile_loadd(1, a_tile_1, a_tile_stride); + _tile_stream_loadd(4, b_tile_4, b_stride); + _tile_dpbf16ps(6, 1, 4); + _tile_stream_loadd(5, b_tile_5, b_stride); + _tile_dpbf16ps(7, 1, 5); + + // update ptrs + a_tile_0 += 2 * AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + a_tile_1 += 2 * AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + b_tile_2 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16); + b_tile_3 += 2 * 
AMX_TILE_BYTES / sizeof(c10::BFloat16); + b_tile_4 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16); + b_tile_5 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16); + } + + if (has_tail) { + _tile_loadd(0, a_tile_0, a_tile_stride); + _tile_stream_loadd(2, b_tile_2, b_stride); + _tile_dpbf16ps(6, 0, 2); + _tile_stream_loadd(3, b_tile_3, b_stride); + _tile_dpbf16ps(7, 0, 3); + } + + _tile_stored(6, c_tile_6, c_stride); + _tile_stored(7, c_tile_7, c_stride); + } + + FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) { + config.rows[0] = m; + config.rows[1] = m; + config.rows[2] = AMX_TILE_ROW_NUM; + config.rows[3] = AMX_TILE_ROW_NUM; + config.rows[4] = AMX_TILE_ROW_NUM; + config.rows[5] = AMX_TILE_ROW_NUM; + config.rows[6] = m; + config.rows[7] = m; + _tile_loadconfig(&config); + } +}; +} // namespace + +// Gemm kernel uses AMX, requires B matrix to be packed +template +class MicroGemm { + public: + static constexpr int32_t MaxMSize = 32; + static constexpr int32_t NSize = 32; + + public: + MicroGemm() : curr_m_(-1) { + vec_op::unroll_loop([&](int i) { amx_tile_config_.colsb[i] = 64; }); + } + + void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + if (m > AMX_TILE_ROW_NUM) { + if (m != curr_m_) { + curr_m_ = m; + TileGemm224::init_tile_config(m, amx_tile_config_); + } + TileGemm224::gemm(CPU_MICRO_GEMM_PARAMS); + } else { + if (m != curr_m_) { + curr_m_ = m; + TileGemm122::init_tile_config(m, amx_tile_config_); + } + TileGemm122::gemm(CPU_MICRO_GEMM_PARAMS); + } + } + + private: + alignas(64) __tilecfg amx_tile_config_; + int32_t curr_m_; +}; + +} // namespace cpu_micro_gemm + +#endif diff --git a/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp b/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp new file mode 100644 index 0000000000000..784da55a420e5 --- /dev/null +++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp @@ -0,0 +1,91 @@ +#ifndef CPU_MICRO_GEMM_IMPL_HPP +#define CPU_MICRO_GEMM_IMPL_HPP +#include "cpu/utils.hpp" +#include "cpu/cpu_types.hpp" + +namespace cpu_micro_gemm { +#define DEFINE_CPU_MICRO_GEMM_PARAMS \ + scalar_t *__restrict__ a_ptr, scalar_t *__restrict__ b_ptr, \ + float *__restrict__ c_ptr, const int32_t m, const int32_t k, \ + const int64_t lda, const int64_t b_n_group_stride, const int64_t ldc, \ + const bool accum_c + +#define CPU_MICRO_GEMM_PARAMS \ + a_ptr, b_ptr, c_ptr, m, k, lda, b_n_group_stride, ldc, accum_c + +template +class MicroGemm { + public: + static constexpr int32_t MaxMSize = 16; + static constexpr int32_t NSize = 16; + + public: + void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + TORCH_CHECK(false, "Unimplemented MicroGemm."); + } +}; + +template +FORCE_INLINE void default_epilogue(float* __restrict__ c_ptr, + scalar_t* __restrict__ d_ptr, + const int32_t m, const int64_t ldc, + const int64_t ldd) { + using scalar_vec_t = typename cpu_utils::VecTypeTrait::vec_t; + static_assert(n_size % 16 == 0); + + float* __restrict__ curr_c = c_ptr; + scalar_t* __restrict__ curr_d = d_ptr; + for (int32_t i = 0; i < m; ++i) { + float* __restrict__ curr_c_iter = curr_c; + scalar_t* __restrict__ curr_d_iter = curr_d; + vec_op::unroll_loop([&](int32_t n_g_idx) { + vec_op::FP32Vec16 c_vec_fp32(curr_c_iter); + scalar_vec_t c_vec(c_vec_fp32); + c_vec.save(curr_d_iter); + curr_c_iter += 16; + curr_d_iter += 16; + }); + curr_c += ldc; + curr_d += ldd; + } +} + +template +FORCE_INLINE void bias_epilogue(float* __restrict__ c_ptr, + scalar_t* __restrict__ d_ptr, + scalar_t* __restrict__ bias_ptr, + const int32_t m, const int64_t ldc, + const int64_t ldd) { + using scalar_vec_t = typename 
cpu_utils::VecTypeTrait::vec_t; + static_assert(n_size % 16 == 0); + constexpr int32_t n_group_num = n_size / 16; + static_assert(n_group_num <= 16); + + vec_op::FP32Vec16 bias_vecs[n_group_num]; + scalar_t* __restrict__ curr_bias = bias_ptr; + vec_op::unroll_loop([&](int32_t i) { + scalar_vec_t vec(curr_bias); + bias_vecs[i] = vec_op::FP32Vec16(vec); + curr_bias += 16; + }); + + float* __restrict__ curr_c = c_ptr; + scalar_t* __restrict__ curr_d = d_ptr; + for (int32_t i = 0; i < m; ++i) { + float* __restrict__ curr_c_iter = curr_c; + scalar_t* __restrict__ curr_d_iter = curr_d; + vec_op::unroll_loop([&](int32_t n_g_idx) { + vec_op::FP32Vec16 c_vec_fp32(curr_c_iter); + c_vec_fp32 = c_vec_fp32 + bias_vecs[n_g_idx]; + scalar_vec_t c_vec(c_vec_fp32); + c_vec.save(curr_d_iter); + curr_c_iter += 16; + curr_d_iter += 16; + }); + curr_c += ldc; + curr_d += ldd; + } +} +} // namespace cpu_micro_gemm + +#endif diff --git a/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp b/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp new file mode 100644 index 0000000000000..3985c2f2e5fe4 --- /dev/null +++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp @@ -0,0 +1,115 @@ +#ifndef CPU_MICRO_GEMM_VEC_HPP +#define CPU_MICRO_GEMM_VEC_HPP +#include "cpu/micro_gemm/cpu_micro_gemm_impl.hpp" + +namespace cpu_micro_gemm { +namespace { +// 8-2-16 pattern, 8 regs for A, 2 regs for B, 16 regs for C, [8, K] @ [k, 32] +template +class TileGemm82 { + public: + FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + switch (m) { + case 1: + gemm_micro<1>(CPU_MICRO_GEMM_PARAMS); + break; + case 2: + gemm_micro<2>(CPU_MICRO_GEMM_PARAMS); + break; + case 3: + gemm_micro<3>(CPU_MICRO_GEMM_PARAMS); + break; + case 4: + gemm_micro<4>(CPU_MICRO_GEMM_PARAMS); + break; + case 5: + gemm_micro<5>(CPU_MICRO_GEMM_PARAMS); + break; + case 6: + gemm_micro<6>(CPU_MICRO_GEMM_PARAMS); + break; + case 7: + gemm_micro<7>(CPU_MICRO_GEMM_PARAMS); + break; + case 8: + gemm_micro<8>(CPU_MICRO_GEMM_PARAMS); + break; + } + } + + template + static void gemm_micro(DEFINE_CPU_MICRO_GEMM_PARAMS) { + static_assert(0 < M <= 8); + using load_vec_t = typename cpu_utils::VecTypeTrait::vec_t; + + scalar_t* __restrict__ curr_b_0 = b_ptr; + scalar_t* __restrict__ curr_b_1 = b_ptr + b_n_group_stride; + float* __restrict__ curr_c_0 = c_ptr; + float* __restrict__ curr_c_1 = c_ptr + 16; + + vec_op::FP32Vec16 c_regs[M * 2]; + if (accum_c) { + float* __restrict__ curr_m_c_0 = curr_c_0; + float* __restrict__ curr_m_c_1 = curr_c_1; + vec_op::unroll_loop([&](int32_t i) { + c_regs[i * 2] = vec_op::FP32Vec16(curr_m_c_0); + c_regs[i * 2 + 1] = vec_op::FP32Vec16(curr_m_c_1); + + // update + curr_m_c_0 += ldc; + curr_m_c_1 += ldc; + }); + } + + scalar_t* __restrict__ curr_a = a_ptr; + for (int32_t k_idx = 0; k_idx < k; ++k_idx) { + load_vec_t b_0_reg(curr_b_0); + vec_op::FP32Vec16 fp32_b_0_reg(b_0_reg); + load_vec_t b_1_reg(curr_b_1); + vec_op::FP32Vec16 fp32_b_1_reg(b_1_reg); + + scalar_t* __restrict__ curr_m_a = curr_a; + vec_op::unroll_loop([&](int32_t i) { + scalar_t v = *curr_m_a; + load_vec_t a_reg_original(v); + vec_op::FP32Vec16 a_reg(a_reg_original); + c_regs[i * 2] = c_regs[i * 2] + a_reg * fp32_b_0_reg; + c_regs[i * 2 + 1] = c_regs[i * 2 + 1] + a_reg * fp32_b_1_reg; + + // update + curr_m_a += lda; + }); + + // update + curr_a += 1; + curr_b_0 += 16; + curr_b_1 += 16; + } + + vec_op::unroll_loop([&](int32_t i) { + c_regs[i * 2].save(curr_c_0); + c_regs[i * 2 + 1].save(curr_c_1); + + // update + curr_c_0 += ldc; + curr_c_1 += ldc; + }); + } +}; +} // namespace + +// Gemm 
kernel uses vector instructions, requires B matrix to be packed +template +class MicroGemm { + public: + static constexpr int32_t MaxMSize = 8; + static constexpr int32_t NSize = 32; + + public: + void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) { + TileGemm82::gemm(CPU_MICRO_GEMM_PARAMS); + } +}; +} // namespace cpu_micro_gemm + +#endif diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index 9fefd88cd9b08..e0e3ef71b485f 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -103,6 +103,13 @@ void cpu_attention_with_kv_cache( // Note: just for avoiding importing errors void placeholder_op() { TORCH_CHECK(false, "Unimplemented"); } +void cpu_gemm_wna16(const torch::Tensor& input, const torch::Tensor& q_weight, + torch::Tensor& output, const torch::Tensor& scales, + const std::optional& zeros, + const std::optional& g_idx, + const std::optional& bias, + const int64_t pack_factor, const std::string& isa_hint); + TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops @@ -165,7 +172,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Quantization #if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) || \ defined(__powerpc64__) - at::Tag stride_tag = at::Tag::needs_fixed_stride_order; // Helper function to release oneDNN handlers ops.def("release_dnnl_matmul_handler(int handler) -> ()", &release_dnnl_matmul_handler); @@ -201,15 +207,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Compute int8 quantized tensor for given scaling factor. ops.def( "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale," - "Tensor? azp) -> ()", - {stride_tag}); + "Tensor? azp) -> ()"); ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant); // Compute int8 quantized tensor and scaling factor ops.def( "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, " - "Tensor!? azp) -> ()", - {stride_tag}); + "Tensor!? azp) -> ()"); ops.impl("dynamic_scaled_int8_quant", torch::kCPU, &dynamic_scaled_int8_quant); #endif @@ -283,6 +287,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("static_scaled_fp8_quant() -> ()", placeholder_op); ops.def("dynamic_scaled_fp8_quant() -> ()", placeholder_op); ops.def("dynamic_per_token_scaled_fp8_quant() -> ()", placeholder_op); + + // WNA16 +#if defined(__AVX512F__) + ops.def( + "cpu_gemm_wna16(Tensor input, Tensor q_weight, Tensor(a2!) output, " + "Tensor scales, Tensor? zeros, Tensor? g_idx, Tensor? 
bias, SymInt " + "pack_factor, str isa_hint) -> ()"); + ops.impl("cpu_gemm_wna16", torch::kCPU, &cpu_gemm_wna16); +#endif } TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) { diff --git a/csrc/cpu/utils.hpp b/csrc/cpu/utils.hpp new file mode 100644 index 0000000000000..d3def306b8069 --- /dev/null +++ b/csrc/cpu/utils.hpp @@ -0,0 +1,73 @@ +#ifndef UTILS_HPP +#define UTILS_HPP + +#include +#include +#include +#include + +#if defined(__APPLE__) + #include +#endif + +#include "cpu_types.hpp" + +namespace cpu_utils { +enum class ISA { AMX, VEC }; + +template +struct VecTypeTrait { + using vec_t = void; +}; + +template <> +struct VecTypeTrait { + using vec_t = vec_op::FP32Vec16; +}; + +#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT) +template <> +struct VecTypeTrait { + using vec_t = vec_op::BF16Vec16; +}; +#endif + +template <> +struct VecTypeTrait { + using vec_t = vec_op::FP16Vec16; +}; + +struct Counter { + std::atomic counter; + char _padding[56]; + + Counter() : counter(0) {} + + void reset_counter() { counter.store(0); } + + int64_t acquire_counter() { return counter++; } +}; + +inline int64_t get_l2_size() { + static int64_t size = []() { +#if defined(__APPLE__) + // macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname. + int64_t l2_cache_size = 0; + size_t len = sizeof(l2_cache_size); + if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 && + l2_cache_size > 0) { + return l2_cache_size >> 1; // use 50% of L2 cache + } + // Fallback if sysctlbyname fails + return 128LL * 1024 >> 1; // use 50% of 128KB +#else + long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE); + assert(l2_cache_size != -1); + return l2_cache_size >> 1; // use 50% of L2 cache +#endif + }(); + return size; +} +} // namespace cpu_utils + +#endif diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h index 9ae0ed975edde..e1d131e4a7851 100644 --- a/csrc/dispatch_utils.h +++ b/csrc/dispatch_utils.h @@ -117,3 +117,24 @@ break; \ } \ } + +#define VLLM_DISPATCH_RANK234(NUM_DIMS, ...) \ + switch (NUM_DIMS) { \ + case 2: { \ + constexpr int tensor_rank = 2; \ + __VA_ARGS__(); \ + break; \ + } \ + case 3: { \ + constexpr int tensor_rank = 3; \ + __VA_ARGS__(); \ + break; \ + } \ + case 4: { \ + constexpr int tensor_rank = 4; \ + __VA_ARGS__(); \ + break; \ + } \ + default: \ + TORCH_CHECK(false, "Expects rank 2, 3 or 4 tensors but got ", NUM_DIMS); \ + } diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu index 48771e4b3aff9..dfc67b933ccae 100644 --- a/csrc/layernorm_kernels.cu +++ b/csrc/layernorm_kernels.cu @@ -10,16 +10,38 @@ namespace vllm { // TODO(woosuk): Further optimize this kernel. 
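As a quick illustration of the new `VLLM_DISPATCH_RANK234` macro added in `csrc/dispatch_utils.h` above, here is a minimal, hypothetical sketch (the helper name and the exact includes are assumptions, not part of this change): the runtime rank selects a `case` whose body defines a `constexpr int tensor_rank`, so the lambda body can use the rank as a compile-time constant, which is how the `rms_norm` change below instantiates `rms_norm_kernel` for rank-2/3/4 inputs.

```cpp
#include <cstdint>
#include <torch/all.h>       // assumed to provide at::Tensor and TORCH_CHECK
#include "dispatch_utils.h"  // the VLLM_DISPATCH_RANK234 macro defined above

// Hypothetical helper, only to show the mechanism: the runtime rank is turned
// into a constexpr `tensor_rank` inside the lambda, so it can drive template
// arguments or unrolled loops at compile time.
int64_t leading_dim_product(const at::Tensor& t) {
  int64_t prod = 1;
  VLLM_DISPATCH_RANK234(t.dim(), [&] {
    // tensor_rank is a compile-time constant (2, 3, or 4) in this scope.
    for (int i = 0; i < tensor_rank - 1; ++i) {
      prod *= t.size(i);
    }
  });
  return prod;
}
```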
-template +template __global__ void rms_norm_kernel( - scalar_t* __restrict__ out, // [..., hidden_size] - const scalar_t* __restrict__ input, // [..., hidden_size] - const int64_t input_stride, + scalar_t* __restrict__ out, // [..., hidden_size] + const scalar_t* __restrict__ input, // [..., hidden_size] + const int64_t input_stride_d2, // input.stride(-2) + const int64_t input_stride_d3, // input.stride(-3) + const int64_t input_stride_d4, // input.stride(-4) + const int64_t input_shape_d2, // input.size(-2) + const int64_t input_shape_d3, // input.size(-3) const scalar_t* __restrict__ weight, // [hidden_size] const float epsilon, const int num_tokens, const int hidden_size) { __shared__ float s_variance; float variance = 0.0f; - const scalar_t* input_row = input + blockIdx.x * input_stride; + const scalar_t* input_row; + if constexpr (NUM_DIMS == 2) { + // 2D for layernorm normal case [batch_size, hidden] + input_row = input + blockIdx.x * input_stride_d2; + } else if constexpr (NUM_DIMS == 3) { + // 3D for q/k norm [batch_size, num_heads, head_size] + int batch_idx = blockIdx.x / input_shape_d2; + int head_idx = blockIdx.x % input_shape_d2; + input_row = + input + batch_idx * input_stride_d3 + head_idx * input_stride_d2; + } else if constexpr (NUM_DIMS == 4) { + // 4D for transformers model_impl qk norm [batch, seq, head, head_dim] + int batch_idx = blockIdx.x / (input_shape_d3 * input_shape_d2); + int remaining = blockIdx.x % (input_shape_d3 * input_shape_d2); + int seq_idx = remaining / input_shape_d2; + int head_idx = remaining % input_shape_d2; + input_row = input + batch_idx * input_stride_d4 + + seq_idx * input_stride_d3 + head_idx * input_stride_d2; + } auto vec_op = [&variance](const vec_n_t& vec) { #pragma unroll @@ -164,38 +186,44 @@ void rms_norm(torch::Tensor& out, // [..., hidden_size] torch::Tensor& weight, // [hidden_size] double epsilon) { TORCH_CHECK(out.is_contiguous()); + if (input.stride(-1) != 1) { + input = input.contiguous(); + } TORCH_CHECK(input.stride(-1) == 1); TORCH_CHECK(weight.is_contiguous()); int hidden_size = input.size(-1); - // We cannot just use `input.stride(-2)` if the tensor is not row-major. - // Instead, we use a 2d view to get the second-innermost stride. - // That way the dimensions (except the last one) can be arbitrarily permuted. - torch::Tensor input_view = input.view({-1, hidden_size}); - - int num_tokens = input_view.numel() / hidden_size; - int64_t input_stride = input_view.stride(-2); + int num_tokens = input.numel() / hidden_size; + int num_dims = input.dim(); + int64_t input_stride_d2 = input.stride(-2); + int64_t input_stride_d3 = (num_dims >= 3) ? input.stride(-3) : 0; + int64_t input_stride_d4 = (num_dims >= 4) ? input.stride(-4) : 0; + int64_t input_shape_d2 = (num_dims >= 3) ? input.size(-2) : 0; + int64_t input_shape_d3 = (num_dims >= 4) ? input.size(-3) : 0; // For large num_tokens, use smaller blocks to increase SM concurrency. const int max_block_size = (num_tokens < 256) ? 
1024 : 256; dim3 grid(num_tokens); - const at::cuda::OptionalCUDAGuard device_guard(device_of(input_view)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES( - input_view.scalar_type(), "rms_norm_kernel", [&] { - const int calculated_vec_size = - std::gcd(16 / sizeof(scalar_t), hidden_size); - const int block_size = - std::min(hidden_size / calculated_vec_size, max_block_size); - dim3 block(block_size); - VLLM_DISPATCH_VEC_SIZE(calculated_vec_size, [&] { - vllm::rms_norm_kernel<<>>( - out.data_ptr(), input_view.data_ptr(), - input_stride, weight.data_ptr(), epsilon, num_tokens, - hidden_size); - }); + VLLM_DISPATCH_RANK234(num_dims, [&] { + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] { + const int calculated_vec_size = + std::gcd(16 / sizeof(scalar_t), hidden_size); + const int block_size = + std::min(hidden_size / calculated_vec_size, max_block_size); + dim3 block(block_size); + VLLM_DISPATCH_VEC_SIZE(calculated_vec_size, [&] { + vllm::rms_norm_kernel + <<>>( + out.data_ptr(), input.data_ptr(), + input_stride_d2, input_stride_d3, input_stride_d4, + input_shape_d2, input_shape_d3, weight.data_ptr(), + epsilon, num_tokens, hidden_size); }); + }); + }); } #define LAUNCH_FUSED_ADD_RMS_NORM(width) \ diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index c3ae06a30e3e8..5af74c2c2a6b0 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -20,18 +20,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops // - // The default behavior in PyTorch 2.6 was changed to "requires_contiguous", - // so we need - // to override this for many GEMMs with the following tag. Otherwise, - // torch.compile will force all input tensors to be contiguous(), which - // will break many custom ops that require column-major weight matrices. - // This was a bug and PyTorch 2.7 has since fixed this. -#if TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 6 - #define stride_tag at::Tag::needs_fixed_stride_order -#else - #define stride_tag -#endif - ops.def( "persistent_masked_m_silu_mul_quant(Tensor input, Tensor counts, Tensor! " "y_q, Tensor! y_s," @@ -241,15 +229,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Quantized GEMM for AWQ. ops.def( "awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, " - "Tensor _zeros, SymInt split_k_iters) -> Tensor", - {stride_tag}); + "Tensor _zeros, SymInt split_k_iters) -> Tensor"); ops.impl("awq_gemm", torch::kCUDA, &awq_gemm); // Dequantization for AWQ. ops.def( "awq_dequantize(Tensor _kernel, Tensor _scaling_factors, " - "Tensor _zeros, SymInt split_k_iters, int thx, int thy) -> Tensor", - {stride_tag}); + "Tensor _zeros, SymInt split_k_iters, int thx, int thy) -> Tensor"); ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize); // Note about marlin kernel 'workspace' arguments: @@ -271,8 +257,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, " "Tensor b_scales, Tensor workspace, " "int b_q_type, " - "SymInt size_m, SymInt size_n, SymInt size_k) -> Tensor", - {stride_tag}); + "SymInt size_m, SymInt size_n, SymInt size_k) -> Tensor"); // conditionally compiled so impl in source file // Machete (Dense) Optimized Mixed Precision GEMM for Hopper. @@ -298,8 +283,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor? channel_scales," " Tensor? token_scales," " str? 
schedule" - ") -> Tensor", - {stride_tag}); + ") -> Tensor"); ops.def( "machete_prepack_B(" " Tensor B," @@ -319,8 +303,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "Tensor b_scales, Tensor? global_scale, Tensor? b_zeros_or_none, Tensor? " "g_idx_or_none, Tensor? perm_or_none, Tensor workspace, int b_q_type, " "SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, " - "bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) -> Tensor", - {stride_tag}); + "bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) -> Tensor"); // conditionally compiled so impl registration is in source file // gptq_marlin repack from GPTQ. @@ -346,8 +329,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor token_scales," " ScalarType? out_type," " str? maybe_schedule" - ") -> Tensor", - {stride_tag}); + ") -> Tensor"); // pack scales ops.def("cutlass_pack_scale_fp8(Tensor scales) -> Tensor"); // encode and reorder weight matrix @@ -394,24 +376,21 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def( "cutlass_scaled_fp4_mm(Tensor! out, Tensor a, Tensor b," " Tensor block_scale_a, Tensor block_scale_b," - " Tensor alpha) -> ()", - {stride_tag}); + " Tensor alpha) -> ()"); ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm); // cutlass blockwise scaledgroup GEMM ops.def( "cutlass_blockwise_scaled_grouped_mm(Tensor! output, Tensor a, Tensor b, " "Tensor scales_a, Tensor scales_b, " - "Tensor problem_sizes, Tensor expert_offsets) -> ()", - {stride_tag}); + "Tensor problem_sizes, Tensor expert_offsets) -> ()"); // conditionally compiled so impl registration is in source file // cutlass nvfp4 block scaled group GEMM ops.def( "cutlass_fp4_group_mm(Tensor! out, Tensor a, Tensor b," " Tensor a_blockscale, Tensor b_blockscales, Tensor alphas," - " Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()", - {stride_tag}); + " Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()"); // conditionally compiled so impl registration is in source file // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column @@ -419,8 +398,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def( "cutlass_scaled_mm(Tensor! out, Tensor a," " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor? bias) -> ()", - {stride_tag}); + " Tensor b_scales, Tensor? bias) -> ()"); ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm); // CUTLASS w8a8 GEMM, supporting asymmetric per-tensor or per-row/column @@ -429,8 +407,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "cutlass_scaled_mm_azp(Tensor! out, Tensor a," " Tensor b, Tensor a_scales," " Tensor b_scales, Tensor azp_adj," - " Tensor? azp, Tensor? bias) -> ()", - {stride_tag}); + " Tensor? azp, Tensor? bias) -> ()"); ops.impl("cutlass_scaled_mm_azp", torch::kCUDA, &cutlass_scaled_mm_azp); // Check if cutlass scaled_mm is supported for CUDA devices of the given @@ -449,8 +426,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor a_scales, Tensor b_scales, Tensor expert_offsets, " " Tensor problem_sizes, Tensor a_strides, " " Tensor b_strides, Tensor c_strides, bool per_act_token, " - " bool per_out_ch) -> ()", - {stride_tag}); + " bool per_out_ch) -> ()"); ops.impl("cutlass_moe_mm", torch::kCUDA, &cutlass_moe_mm); // A function that computes data required to run fused MoE with w8a8 grouped @@ -464,8 +440,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor! problem_sizes1, Tensor! problem_sizes2, " " Tensor! input_permutation, " " Tensor! 
output_permutation, int num_experts, " - " int n, int k, Tensor? blockscale_offsets) -> ()", - {stride_tag}); + " int n, int k, Tensor? blockscale_offsets) -> " + "()"); ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data); // A function that computes problem sizes for each expert's multiplication @@ -476,8 +452,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor! problem_sizes1, " " Tensor! problem_sizes2, " " int num_experts, int n, int k, " - " Tensor? blockscale_offsets) -> ()", - {stride_tag}); + " Tensor? blockscale_offsets) -> ()"); ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA, &get_cutlass_moe_mm_problem_sizes); @@ -492,8 +467,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor! problem_sizes2, " " Tensor expert_num_tokens, " " int num_local_experts, int padded_m, " - " int n, int k) -> ()", - {stride_tag}); + " int n, int k) -> ()"); ops.impl("get_cutlass_pplx_moe_mm_data", torch::kCUDA, &get_cutlass_pplx_moe_mm_data); @@ -517,8 +491,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "cutlass_scaled_sparse_mm(Tensor! out, Tensor a," " Tensor bt_nzs," " Tensor bt_meta, Tensor a_scales," - " Tensor b_scales, Tensor? bias) -> ()", - {stride_tag}); + " Tensor b_scales, Tensor? bias) -> ()"); ops.impl("cutlass_scaled_sparse_mm", torch::kCUDA, &cutlass_scaled_sparse_mm); // CUTLASS sparse matrix compressor @@ -567,8 +540,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, " "Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, bool " "use_v2_format, int bit) " - "-> Tensor", - {stride_tag}); + "-> Tensor"); ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm); // Post processing for GPTQ. diff --git a/docker/Dockerfile b/docker/Dockerfile index 964700e2a43ac..709b79e84fbbc 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -56,7 +56,6 @@ ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} # PyTorch provides its own indexes for standard and nightly builds ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl -ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly # PIP supports multiple authentication schemes, including keyring # By parameterizing the PIP_KEYRING_PROVIDER variable and setting it to @@ -98,7 +97,6 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ ARG PIP_INDEX_URL UV_INDEX_URL ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL ARG PYTORCH_CUDA_INDEX_BASE_URL -ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER # Activate virtual environment and add uv to PATH @@ -317,7 +315,6 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ ARG PIP_INDEX_URL UV_INDEX_URL ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL ARG PYTORCH_CUDA_INDEX_BASE_URL -ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER # Install uv for faster pip installs @@ -337,20 +334,6 @@ ENV UV_LINK_MODE=copy # or future versions of triton. RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. 
-f1,2)/compat/ -# arm64 (GH200) build follows the practice of "use existing pytorch" build, -# we need to install torch and torchvision from the nightly builds first, -# pytorch will not appear as a vLLM dependency in all of the following steps -# after this step -RUN --mount=type=cache,target=/root/.cache/uv \ - if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - uv pip install --system \ - --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \ - "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319" ; \ - uv pip install --system \ - --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \ - --pre pytorch_triton==3.3.0+gitab727c40 ; \ - fi - # Install vllm wheel first, so that torch etc will be installed. RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/uv \ diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 4c961defaeda2..eb3807ef0ca4e 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -37,6 +37,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \ && curl -LsSf https://astral.sh/uv/install.sh | sh +ENV CC=/usr/bin/gcc-12 CXX=/usr/bin/g++-12 ENV CCACHE_DIR=/root/.cache/ccache ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache @@ -122,6 +123,15 @@ WORKDIR /workspace/vllm RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \ cp requirements/test.in requirements/cpu-test.in && \ sed -i '/mamba_ssm/d' requirements/cpu-test.in && \ + remove_packages_not_supported_on_aarch64() { \ + case "$(uname -m)" in \ + aarch64|arm64) \ + sed -i '/decord/d' requirements/cpu-test.in; \ + sed -i '/terratorch/d' requirements/cpu-test.in; \ + ;; \ + esac; \ + }; \ + remove_packages_not_supported_on_aarch64 && \ sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \ sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \ sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \ diff --git a/docs/.nav.yml b/docs/.nav.yml index 3151ea0e2ec22..c8bf00efb2370 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -24,14 +24,16 @@ nav: - deployment/integrations - Training: training - Configuration: - - configuration/README.md - configuration/* + - TPU: https://docs.vllm.ai/projects/tpu/en/latest/ - Models: - models/supported_models.md - models/generative_models.md - models/pooling_models.md - models/extensions - - Hardware Supported Models: models/hardware_supported_models + - Hardware Supported Models: + - models/hardware_supported_models/* + - TPU: https://docs.vllm.ai/projects/tpu/en/latest/recommended_models_features/ - Features: features - Developer Guide: - contributing/README.md diff --git a/docs/configuration/env_vars.md b/docs/configuration/env_vars.md index 2c0a898754fa0..f6d548a19d91f 100644 --- a/docs/configuration/env_vars.md +++ b/docs/configuration/env_vars.md @@ -7,8 +7,6 @@ vLLM uses the following environment variables to configure the system: All environment variables used by vLLM are prefixed with `VLLM_`. 
**Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables). -??? code - - ```python - --8<-- "vllm/envs.py:env-vars-definition" - ``` +```python +--8<-- "vllm/envs.py:env-vars-definition" +``` diff --git a/docs/configuration/tpu.md b/docs/configuration/tpu.md deleted file mode 100644 index 2d24c9c6e2e95..0000000000000 --- a/docs/configuration/tpu.md +++ /dev/null @@ -1,111 +0,0 @@ -# TPU Optimization Tips - -This doc serves as a collection of handy tips for optimizing your vLLM on TPU workload. - -## Get started - -Looking for setup and installation instructions? Find them [here](https://docs.vllm.ai/projects/tpu/en/latest/getting_started/installation/). - -### TPU workload sizing - -When selecting the ideal number of chips for a single serving instance, it's important to account for both the model size and the average request context length. Adequate HBM for the KV cache is essential to ensure a sufficient number of concurrent requests can be processed. - -The following colab [calculator](https://colab.research.google.com/github/ericehanley/rightsize-vllm/blob/main/HBM_Calculator.ipynb) will tell you: - -- KV cache size requirement per token and per request -- TPU/GPU memory consumed by the model weights -- TPU/GPU memory allocated for the KV cache -- Maximum \# of requests you can approximately set (--max-num-seqs) - -This approach serves as a general rule of thumb. - -#### Latency-throughput tradeoff - -As with rightsizing the number of chips for your workload, consider adjusting `--max-num-seqs` to fine-tune the latency-throughput balance. Decreasing `--max-num-seqs` and/or increasing the number of chips can help reduce latency. - -`--max-num-seqs` defines the number of concurrent decode slots, effectively limiting the number of requests the server can process tokens for simultaneously. Increasing this value allows the server to pre-allocate more HBM to handle a higher number of concurrent requests, which can maximize overall throughput. However, this often increases the end-to-end (e2e) latency per request. - -Therefore, carefully tuning `--max-num-seqs` is crucial to achieving the desired balance between latency and throughput for your specific workload. - -In a similar way, `--max-num-batch-tokens` can be adjusted down to improve latency, or adjusted up to improve throughput. - -#### Compilation and Caching - -Coming from a GPU background, one of the key differences you'll notice with TPUs is an initial compilation step. TPUs are specialized accelerators (ASICs) that achieve maximum performance by executing pre-compiled, static computation graphs via the XLA compiler. Unlike GPUs, which can handle dynamic input shapes more flexibly, TPUs require a specific compiled graph for each tensor shape (e.g., batch size and sequence length) they process. - -To manage this, vLLM performs a one-time "warmup" process when you first launch the server. During this phase, it pre-compiles the model for various common input shapes and saves these compiled graphs to a cache on disk or remote storage (located at `~/.cache/vllm/xla_cache` by default). 
This process can range significantly, anywhere from a few minutes to an hour depending on the size of the model and context length used. - -Although the first compilation can take some time, for all subsequent server launches, vLLM can load these graphs directly from the cache, eliminating the compilation time for future runs. - -Use `VLLM_XLA_CACHE_PATH` environment variable to write to shareable storage for future deployed nodes (like when using autoscaling). - -#### Reducing compilation time - -This initial compilation time ranges significantly and is impacted by many of the arguments discussed in this optimization doc. Factors that influence the length of time to compile are things like model size and `--max-num-batch-tokens`. Other arguments you can tune are things like `VLLM_TPU_MOST_MODEL_LEN`. - -### Optimize based on your data - -#### max-model-len vs. most-model-len - -![most_model_len](../assets/design/tpu/most_model_len.png) - -If most of your requests are shorter than the maximum model length but you still need to accommodate occasional longer requests, setting a high maximum model length can negatively impact performance. In these cases, you can try introducing most-model-len by specifying the `VLLM_TPU_MOST_MODEL_LEN` environment variable. - -For example, 1% requests are 32k length and 99% requests are 2k length. You can pass 32k into `--max-model-len 32768` and use `VLLM_TPU_MOST_MODEL_LEN=2048`. - -The requests get subdivided into max-model-len and most-model-len categories, for the latter category, you can gain better performance since the server can process more requests at a time. - -#### Padding - -For online serving with latency requirements, consider switching to bucket padding by setting the `VLLM_TPU_BUCKET_PADDING_GAP` environment variable. Because of the layout of the TPU, try using increments of 128 (e.g., 128, 256, etc.) - -The server pads the requests into fixed lengths before sending them to the model to avoid recompilation. To read more about TPU padding, see [here](https://cloud.google.com/tpu/docs/performance-guide#xla-efficiencies). Currently, there are 2 ways to pad the requests: - -1. the default exponential padding (pad to the nearest power of 2) -2. bucket padding (pad to the nearest linearly increasing bucket). - -When using bucket padding, the buckets start from 16, end at max_model_len, and increment by `VLLM_TPU_BUCKET_PADDING_GAP`. - -For example, max_model_len=512, padding_gap=64, the buckets will be [16, 32, 64, 128, 192, 256, 320, 384, 448, 512]. - -The fewer tokens you pad, the less unnecessary computation TPU does, the better performance you can get. For example, if num_tokens=300, with exponential padding, you pad to 512, with the bucket_padding above, you pad to 320. - -However, you need to be careful to choose the padding gap. If the gap is too small, it means the number of buckets is large, leading to increased warmup (precompile) time and higher memory to store the compiled graph. Too many compiled graphs may lead to HBM OOM. Conversely, an overly large gap yields no performance improvement compared to the default exponential padding. 
- -#### Quantization - -If possible, use the precision that matches the chip’s hardware acceleration: - -- v5e has int4/int8 hardware acceleration in the MXU -- v6e has int4/int8 hardware acceleration in the MXU - -Supported quantized formats and features in vLLM on TPU [Jul '25]: - -- INT8 W8A8 -- INT8 W8A16 -- FP8 KV cache -- [WIP] FP8 W8A8 -- [WIP] AWQ -- [WIP] FP4 W4A8 - -#### Parallelization - -Don't set TP to be less than the number of chips on a single-host deployment. - -Although it’s common to do this with GPUs, don't try to fragment 2 or 8 different workloads across 8 chips on a single host. If you need 1 or 4 chips, just create an instance with 1 or 4 chips (these are partial-host machine types). - -### Tune your workloads - -Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](../../benchmarks/auto_tune/README.md) to optimize your workloads for your use case. - -### Future Topics We'll Cover - -#### Profiling - -The auto-tuner provides a profile of optimized configurations as its final step. However, interpreting this profile can be challenging for new users. We plan to expand this section in the future with more detailed guidance. In the meantime, you can learn how to collect a TPU profile using vLLM's native profiling tools [here](../examples/offline_inference/profiling_tpu.md). This profile can provide valuable insights into your workload's performance. - -#### SPMD - -More details to come. - -**Want us to cover something that isn't listed here? Open up an issue please and cite this doc. We'd love to hear your questions or tips.** diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md index a7b54f015c2da..d7f5d2f311a37 100644 --- a/docs/contributing/model/basic.md +++ b/docs/contributing/model/basic.md @@ -146,6 +146,7 @@ We use "mamba-like" to refer to layers that posses a state that is updated in-pl For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`. It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers. Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this. +It is also worth noting that we should update `MAMBA_TYPE_TO_BACKEND_MAP` and `MambaAttentionBackendEnum` in [`registry.py`](../../../vllm/attention/backends/registry.py) when adding a new mamba backend. Finally, if one wants to support torch compile and CUDA graphs, it necessary to wrap the call to the mamba-like layer inside a custom op and register it. Please see the calls to `direct_register_custom_op` in [vllm/model_executor/models/minimax_text_01.py](../../../vllm/model_executor/models/minimax_text_01.py) or [vllm/model_executor/layers/mamba/short_conv.py](../../../vllm/model_executor/layers/mamba/short_conv.py) for examples of this. The new custom op should then be added to the list `_attention_ops` in [vllm/config/compilation.py](../../../vllm/config/compilation.py) to ensure that piecewise CUDA graphs works as intended. 
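As a rough illustration of the custom-op wrapping and registration described above, a minimal sketch might look like the following. This is illustrative only: the op name `my_linear_attn` and its signature are hypothetical, and `direct_register_custom_op` is assumed to be importable from `vllm.utils` (its exact location can vary between vLLM versions); see the `short_conv.py` and `minimax_text_01.py` examples referenced above for the real pattern.

```python
# Hedged sketch only: how a mamba-like layer call might be wrapped in a
# custom op so that torch.compile and piecewise CUDA graphs treat it as an
# opaque operation. Names and import paths are illustrative assumptions.
import torch

from vllm.utils import direct_register_custom_op  # location may differ by version


def my_linear_attn(
    hidden_states: torch.Tensor, output: torch.Tensor, layer_name: str
) -> None:
    # A real implementation would look the layer up by `layer_name` from the
    # forward context and write its result into the pre-allocated `output`.
    ...


def my_linear_attn_fake(
    hidden_states: torch.Tensor, output: torch.Tensor, layer_name: str
) -> None:
    # Meta ("fake") implementation used for tracing: no computation needed,
    # since the op mutates `output` in place and returns nothing.
    return


direct_register_custom_op(
    op_name="my_linear_attn",
    op_func=my_linear_attn,
    mutates_args=["output"],
    fake_impl=my_linear_attn_fake,
)

# The layer's forward would then invoke torch.ops.vllm.my_linear_attn(...),
# and "vllm.my_linear_attn" would also be added to `_attention_ops` in
# vllm/config/compilation.py so that piecewise CUDA graphs split around it.
```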
diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md index 1c639f3533d47..0e636c87f38a4 100644 --- a/docs/deployment/docker.md +++ b/docs/deployment/docker.md @@ -82,8 +82,7 @@ DOCKER_BUILDKIT=1 docker build . \ ## Building for Arm64/aarch64 -A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use -of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64. +A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64. !!! note Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=` @@ -94,7 +93,6 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `-- ```bash # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) - python3 use_existing_torch.py DOCKER_BUILDKIT=1 docker build . \ --file docker/Dockerfile \ --target vllm-openai \ @@ -102,7 +100,8 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `-- -t vllm/vllm-gh200-openai:latest \ --build-arg max_jobs=66 \ --build-arg nvcc_threads=2 \ - --build-arg torch_cuda_arch_list="9.0 10.0+PTX" + --build-arg torch_cuda_arch_list="9.0 10.0+PTX" \ + --build-arg RUN_WHEEL_CHECK=false ``` !!! note diff --git a/docs/deployment/frameworks/skypilot.md b/docs/deployment/frameworks/skypilot.md index f4a984a6433e2..e9b0d5f0671c3 100644 --- a/docs/deployment/frameworks/skypilot.md +++ b/docs/deployment/frameworks/skypilot.md @@ -4,7 +4,7 @@ vLLM

-vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html). +vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc., can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html). ## Prerequisites diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md index 7663b82266f0b..f0d5a3e934f39 100644 --- a/docs/design/moe_kernel_features.md +++ b/docs/design/moe_kernel_features.md @@ -1,22 +1,22 @@ -# Fused MoE Kernel features +# Fused MoE Kernel Features The purpose of this document is to provide an overview of the various MoE kernels (both modular and non-modular) so it will be easier to select an appropriate set of kernels for any particular situation. This includes information about the all2all backends used by modular kernels. ## Fused MoE Modular All2All backends -There are a number of all2all communication backends that are used to implement expert parallelism (EP) for the `FusedMoE` layer. The different `FusedMoEPrepareAndFinalize` sub-classes provide an interface for each all2all backend. +There are a number of all2all communication backends that are used to implement expert parallelism (EP) for the `FusedMoE` layer. The different `FusedMoEPrepareAndFinalize` subclasses provide an interface for each all2all backend. The following table describes the relevant features of each backend, i.e. activation format, supported quantization schemes and async support. -The output activation format (standard or batched) corresponds to the output of the prepare step of the `FusedMoEPrepareAndFinalize` subclass, the finalize step requires the same format. All the backend `prepare` methods expect activations in standard format and all the `finalize methods return activations in standard format. More details on the formats can be found in the [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) document. +The output activation format (standard or batched) corresponds to the output of the prepare step of the `FusedMoEPrepareAndFinalize` subclass, and the finalize step requires the same format. All the backend `prepare` methods expect activations in the standard format and all the `finalize` methods return activations in standard format. More details on the formats can be found in the [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) document. -The quantization types and formats enumerate which quantization schemes are supported by each `FusedMoEPrepareAndFinalize` class. The quantization can happen before or after the dispatch based on the format the all2all backend supports. e.g. deepep_high_throughput supports only block-quantized fp8 format, any other format will result in dispatching in higher precision and quantizing afterwards. The output of the prepare step for each backend is the quantized type. The finalize step generally requires the same input type as the original activations, e.g. 
if the original input is bfloat16 and the quantization scheme is fp8 w/per-tensor scales, `prepare` will return fp8/per-tensor scale activations and `finalize` will take bfloat16 activations. See the diagrams in [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) for more details on the types and formats of activations at each step of the MoE process. If no quantization type is specified, the kernel operates on float16 and/or bfloat16. +The quantization types and formats enumerate which quantization schemes are supported by each `FusedMoEPrepareAndFinalize` class. The quantization can happen before or after the dispatch based on the format the all2all backend supports, e.g. deepep_high_throughput supports only block-quantized fp8 format. Any other format will result in dispatching in higher precision and quantizing afterwards. The output of the prepare step for each backend is the quantized type. The finalize step generally requires the same input type as the original activations, e.g. if the original input is bfloat16 and the quantization scheme is fp8 with per-tensor scales, `prepare` will return fp8/per-tensor scale activations and `finalize` will take bfloat16 activations. See the diagrams in [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) for more details on the types and formats of activations at each step of the MoE process. If no quantization type is specified, the kernel operates on float16 and/or bfloat16. Async backends support the use of DBO (Dual Batch Overlap) and shared expert overlap (where shared experts are computed during the combine step). -Certain models require the topk weights to be applied to the input activations rather than the output activations when topk==1, e.g. llama. For modular kernels, this feature is supported by the `FusedMoEPrepareAndFinalize` subclass, for non-modular kernels, it is up to the experts function to deal with this flag. +Certain models require the topk weights to be applied to the input activations rather than the output activations when topk==1, e.g. Llama. For modular kernels, this feature is supported by the `FusedMoEPrepareAndFinalize` subclass. For non-modular kernels, it is up to the experts function to deal with this flag. -unless otherwise specified, backends are controlled via `VLLM_ALL2ALL_BACKEND`. All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP w/o EP. +Unless otherwise specified, backends are controlled via `VLLM_ALL2ALL_BACKEND`. All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP without EP. -| Backend | Output act. format | Quant. types | Quant. 
format | Async | Apply Weight On Input | Sub-class | -|---------------------------------------|--------------------|-----------------|------------------------|-------|-----------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| -| naive | standard | all1 | G,A,T | N | 6 | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE.forward_impl] | -| pplx | batched | fp8,int8 | G,A,T | Y | Y | [`PplxPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.pplx_prepare_finalize.PplxPrepareAndFinalize] | -| deepep_high_throughput | standard | fp8 | G(128),A,T2 | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] | -| deepep_low_latency | batched | fp8 | G(128),A,T3 | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] | -| flashinfer_all2allv | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferAllToAllMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferAllToAllMoEPrepareAndFinalize] | -| flashinfer4 | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferCutlassMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferCutlassMoEPrepareAndFinalize] | -| flashinfer4 | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferCutlassMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferCutlassMoEPrepareAndFinalize] | -| MoEPrepareAndFinalizeNoEP5 | standard | fp8,int8 | G,A,T | N | Y | [`MoEPrepareAndFinalizeNoEP`][vllm.model_executor.layers.fused_moe.prepare_finalize.MoEPrepareAndFinalizeNoEP] | -| BatchedPrepareAndFinalize5 | batched | fp8,int8 | G,A,T | N | Y | [`BatchedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedPrepareAndFinalize] | +| Backend | Output act. format | Quant. types | Quant. 
format | Async | Apply Weight On Input | Subclass | +|---------|--------------------|--------------|---------------|-------|-----------------------|-----------| +| naive | standard | all1 | G,A,T | N | 6 | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE.forward_impl] | +| pplx | batched | fp8,int8 | G,A,T | Y | Y | [`PplxPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.pplx_prepare_finalize.PplxPrepareAndFinalize] | +| deepep_high_throughput | standard | fp8 | G(128),A,T2 | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] | +| deepep_low_latency | batched | fp8 | G(128),A,T3 | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] | +| flashinfer_all2allv | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferAllToAllMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferAllToAllMoEPrepareAndFinalize] | +| flashinfer4 | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferCutlassMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferCutlassMoEPrepareAndFinalize] | +| MoEPrepareAndFinalizeNoEP5 | standard | fp8,int8 | G,A,T | N | Y | [`MoEPrepareAndFinalizeNoEP`][vllm.model_executor.layers.fused_moe.prepare_finalize.MoEPrepareAndFinalizeNoEP] | +| BatchedPrepareAndFinalize5 | batched | fp8,int8 | G,A,T | N | Y | [`BatchedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedPrepareAndFinalize] | !!! info "Table key" 1. All types: mxfp4, nvfp4, int4, int8, fp8 2. A,T quantization occurs after dispatch. 3. All quantization happens after dispatch. 4. Controlled by different env vars (`VLLM_FLASHINFER_MOE_BACKEND` "throughput" or "latency") - 5. This is a no-op dispatcher that can be used to pair with any modular experts to produce a modular kernel that runs w/o dispatch or combine. These cannot be selected via environment variable. These are generally use for testing or adapting an expert subclass to the `fused_experts` API. + 5. This is a no-op dispatcher that can be used to pair with any modular experts to produce a modular kernel that runs without dispatch or combine. These cannot be selected via environment variable. These are generally used for testing or adapting an expert subclass to the `fused_experts` API. 6. This depends on the experts implementation. --- @@ -66,44 +65,43 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes. - [`Mxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.Mxfp4MoEMethod] - [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod] -## Fused MoE Experts Kernels +## Fused Experts Kernels -The are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties. +There are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. 
Many have modular kernel adapters, so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties. -Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx`, `DeepEPLLPrepareAndFinalize`. +Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx` and `DeepEPLLPrepareAndFinalize`. Similar to the backend kernels, each experts kernel only supports certain quantization formats. For non-modular experts, the activations will be in the original type and quantized internally by the kernel. Modular experts will expect the activations to already be in the quantized format. Both types of experts will yield outputs in the original activation type. -Each experts kernel supports one or more activation functions, e.g. silu, gelu that are applied to the intermediate results. +Each experts kernel supports one or more activation functions, e.g. silu or gelu, which are applied to the intermediate results. As with the backends, some experts support applying topk weights on the input activations. The entries in the column in this table only apply to the non-modular experts. Most experts flavors include an equivalent modular interface which will be a subclass of `FusedMoEPermuteExpertsUnpermute`. -To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels must have compatible activation formats, quantization types and quantization formats. +To be used with a particular `FusedMoEPrepareAndFinalize` subclass, MoE kernels must have compatible activation formats, quantization types and quantization formats. -| Kernel | Input act. format | Quant. types | Quant. format | Activation function | Apply Weight On Input | Modular | Source | -|------------------------------|-----------------------|------------------|---------------|-------------------------------------------------------------|-----------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| triton | standard | all1 | G,A,T | silu, gelu,
swigluoai,
silu_no_mul,
gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],
[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] | -| triton (batched) | batched | all1 | G,A,T | silu, gelu | 6 | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] | -| deep gemm | standard,
batched | fp8 | G(128),A,T | silu, gelu | 6 | Y | [`deep_gemm_moe_fp8`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.deep_gemm_moe_fp8],
[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],
[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] | -| cutlass_fp4 | standard,
batched | nvfp4 | A,T | silu | Y | Y | [`cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp4],
[`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] | -| cutlass_fp8 | standard,
batched | fp8 | A,T | silu, gelu | Y | Y | [`cutlass_moe_fp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp8],
[`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],
[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] | -| flashinfer | standard | nvfp4,
fp8 | T | 5 | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],
[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] | -| gpt oss triton | standard | N/A | N/A | 5 | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],
[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] | -| deep gemm+triton2 | standard,
batched | all1 | G(128),A,T | silu, gelu | 6 | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],
[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] | -| marlin | standard | 3 | 3 | silu,
swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],
[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],
[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] | -| marlin experts | standard,
batched | N/A | N/A | silu,
swigluoai | Y | Y | [`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],
[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] | -| trtllm | standard | mxfp4,
nvfp4 | G(16),G(32) | 5 | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] | -| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] | -| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] | -| rocm aiter moe | standard | fp8 | G(128),A,T | silu, gelu | Y | N | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_experts] | -| cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] | -| naive batched4 | batched | int8,
fp8 | G,A,T | silu, gelu | 6 | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts] | +| Kernel | Input act. format | Quant. types | Quant. format | Activation function | Apply Weight On Input | Modular | Source | +|--------|-------------------|--------------|---------------|---------------------|-----------------------|---------|--------| +| triton | standard | all1 | G,A,T | silu, gelu,
swigluoai,
silu_no_mul,
gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],
[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] | +| triton (batched) | batched | all1 | G,A,T | silu, gelu | 6 | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] | +| deep gemm | standard,
batched | fp8 | G(128),A,T | silu, gelu | 6 | Y | [`deep_gemm_moe_fp8`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.deep_gemm_moe_fp8],
[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],
[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] | +| cutlass_fp4 | standard,
batched | nvfp4 | A,T | silu | Y | Y | [`cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp4],
[`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] | +| cutlass_fp8 | standard,
batched | fp8 | A,T | silu, gelu | Y | Y | [`cutlass_moe_fp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp8],
[`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],
[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] | +| flashinfer | standard | nvfp4,
fp8 | T | 5 | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],
[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] | +| gpt oss triton | standard | N/A | N/A | 5 | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],
[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] | +| deep gemm+triton2 | standard,
batched | all1 | G(128),A,T | silu, gelu | 6 | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],
[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] | +| marlin | standard,
batched | 3 / N/A | 3 / N/A | silu,
swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],
[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],
[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] | +| trtllm | standard | mxfp4,
nvfp4 | G(16),G(32) | 5 | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] | +| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] | +| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] | +| rocm aiter moe | standard | fp8 | G(128),A,T | silu, gelu | Y | N | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_experts] | +| cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] | +| naive batched4 | batched | int8,
fp8 | G,A,T | silu, gelu | 6 | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts] | !!! info "Table key" 1. All types: mxfp4, nvfp4, int4, int8, fp8 - 2. A dispatcher wrapper around triton and deep gemm experts. Will select based on type + shape + quantization params + 2. A dispatcher wrapper around triton and deep gemm experts. Will select based on type + shape + quantization params 3. uint4, uint8, fp8, fp4 4. This is a naive implementation of experts that supports batched format. Mainly used for testing. 5. The `activation` parameter is ignored and SwiGlu is used by default instead. @@ -113,8 +111,8 @@ To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels The following table shows "families" of modular kernels that are intended to work together. There are some combinations which may work but have not yet been tested, e.g. flashinfer with other fp8 experts. Note that the "naive" backend will work with any non-modular experts. -| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses | -|----------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------| -| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,
`TritonExperts`,
`TritonOrDeepGemmExperts`,
`CutlassExpertsFp8`,
`MarlinExperts` | -| deepep_low_latency,
pplx | `DeepEPLLPrepareAndFinalize`,
`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,
`BatchedTritonExperts`,
`BatchedTritonOrDeepGemmExperts`,
`CutlassBatchedExpertsFp8`,
`BatchedMarlinExperts`| -| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` | +| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses | +|---------|-----------------------------------------|----------------------------------------------| +| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,
`TritonExperts`,
`TritonOrDeepGemmExperts`,
`CutlassExpertsFp8`,
`MarlinExperts` | +| deepep_low_latency,
pplx | `DeepEPLLPrepareAndFinalize`,
`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,
`BatchedTritonExperts`,
`BatchedTritonOrDeepGemmExperts`,
`CutlassBatchedExpertsFp8`,
`BatchedMarlinExperts` | +| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` | diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md index dc2f7c4aed3c3..e8db8047ca4e6 100644 --- a/docs/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -49,7 +49,7 @@ Every plugin has three parts: - **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported. -- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for pooling models. The plugin function returns the IOProcessor's class fully qualified name. +- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre-/post-processing of the model prompt and model output for pooling models. The plugin function returns the IOProcessor's class fully qualified name. - **Stat logger plugins** (with group name `vllm.stat_logger_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree loggers into vLLM. The entry point should be a class that subclasses StatLoggerBase. diff --git a/docs/design/prefix_caching.md b/docs/design/prefix_caching.md index bd4070f381d81..48536a877bd3f 100644 --- a/docs/design/prefix_caching.md +++ b/docs/design/prefix_caching.md @@ -1,6 +1,6 @@ # Automatic Prefix Caching -Prefix caching kv-cache blocks is a popular optimization in LLM inference to avoid redundant prompt computations. The core idea is simple – we cache the kv-cache blocks of processed requests, and reuse these blocks when a new request comes in with the same prefix as previous requests. Since prefix caching is almost a free lunch and won’t change model outputs, it has been widely used by many public endpoints (e.g., OpenAI, Anthropic, etc) and most open source LLM inference frameworks (e.g., SGLang). +Prefix caching kv-cache blocks is a popular optimization in LLM inference to avoid redundant prompt computations. The core idea is simple – we cache the kv-cache blocks of processed requests, and reuse these blocks when a new request comes in with the same prefix as previous requests. Since prefix caching is almost a free lunch and won’t change model outputs, it has been widely used by many public endpoints (e.g., OpenAI, Anthropic, etc.) and most open source LLM inference frameworks (e.g., SGLang). While there are many ways to implement prefix caching, vLLM chooses a hash-based approach. 
Specifically, we hash each kv-cache block by the tokens in the block and the tokens in the prefix before the block: diff --git a/docs/features/README.md b/docs/features/README.md index ad9de9ff8f368..5faf3768f3214 100644 --- a/docs/features/README.md +++ b/docs/features/README.md @@ -59,20 +59,23 @@ th:not(:first-child) { ### Feature x Hardware -| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | TPU | Intel GPU | -|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----| ------------| -| [CP](../configuration/optimization.md#chunked-prefill) | [❌](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [APC](automatic_prefix_caching.md) | [❌](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | [🟠](https://github.com/vllm-project/vllm/issues/26963) | -| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | [❌](https://github.com/vllm-project/vllm/issues/26970) | -| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | -| enc-dec | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | -| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | [🟠](https://github.com/vllm-project/vllm/issues/26965) | -| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | -| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | -| async output | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | -| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/8477) | ✅ | ❌ | ✅ | -| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | -| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | -| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | [❌](https://github.com/vllm-project/vllm/issues/25097) | ✅ | +| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | Intel GPU | +|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------| ------------| +| [CP](../configuration/optimization.md#chunked-prefill) | [❌](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [APC](automatic_prefix_caching.md) | [❌](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [🟠](https://github.com/vllm-project/vllm/issues/26963) | +| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/26970) | +| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| enc-dec | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | +| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | [🟠](https://github.com/vllm-project/vllm/issues/26965) | +| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | +| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| async output | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | +| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/8477) | ✅ | ✅ | +| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | + +!!! note + For information on feature support on Google TPU, please refer to the [TPU-Inference Recommended Models and Features](https://docs.vllm.ai/projects/tpu/en/latest/recommended_models_features/) documentation. 
diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index cde2ec165712b..4656ee43ea251 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -365,6 +365,8 @@ You must enable this feature via `enable_mm_embeds=True`. The vLLM engine may crash if incorrect shape of embeddings is passed. Only enable this flag for trusted users! +#### Image Embeddings + ??? code ```python @@ -441,6 +443,36 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd print(generated_text) ``` +#### Audio Embeddings + +You can pass pre-computed audio embeddings similar to image embeddings: + +??? code + + ```python + from vllm import LLM + import torch + + # Enable audio embeddings support + llm = LLM(model="fixie-ai/ultravox-v0_5-llama-3_2-1b", enable_mm_embeds=True) + + # Refer to the HuggingFace repo for the correct format to use + prompt = "USER: